1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27 /*
28 * Architecture-independent CPU control functions.
29 */
30
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/var.h>
34 #include <sys/thread.h>
35 #include <sys/cpuvar.h>
36 #include <sys/cpu_event.h>
37 #include <sys/kstat.h>
38 #include <sys/uadmin.h>
39 #include <sys/systm.h>
40 #include <sys/errno.h>
41 #include <sys/cmn_err.h>
42 #include <sys/procset.h>
43 #include <sys/processor.h>
44 #include <sys/debug.h>
45 #include <sys/cpupart.h>
46 #include <sys/lgrp.h>
47 #include <sys/pset.h>
48 #include <sys/pghw.h>
49 #include <sys/kmem.h>
50 #include <sys/kmem_impl.h> /* to set per-cpu kmem_cache offset */
51 #include <sys/atomic.h>
52 #include <sys/callb.h>
53 #include <sys/vtrace.h>
54 #include <sys/cyclic.h>
55 #include <sys/bitmap.h>
56 #include <sys/nvpair.h>
57 #include <sys/pool_pset.h>
58 #include <sys/msacct.h>
59 #include <sys/time.h>
60 #include <sys/archsystm.h>
61 #include <sys/sdt.h>
62 #if defined(__x86) || defined(__amd64)
63 #include <sys/x86_archext.h>
64 #endif
65 #include <sys/callo.h>
66
67 extern int mp_cpu_start(cpu_t *);
68 extern int mp_cpu_stop(cpu_t *);
69 extern int mp_cpu_poweron(cpu_t *);
70 extern int mp_cpu_poweroff(cpu_t *);
71 extern int mp_cpu_configure(int);
72 extern int mp_cpu_unconfigure(int);
73 extern void mp_cpu_faulted_enter(cpu_t *);
74 extern void mp_cpu_faulted_exit(cpu_t *);
75
76 extern int cmp_cpu_to_chip(processorid_t cpuid);
77 #ifdef __sparcv9
78 extern char *cpu_fru_fmri(cpu_t *cp);
79 #endif
80
81 static void cpu_add_active_internal(cpu_t *cp);
82 static void cpu_remove_active(cpu_t *cp);
83 static void cpu_info_kstat_create(cpu_t *cp);
84 static void cpu_info_kstat_destroy(cpu_t *cp);
85 static void cpu_stats_kstat_create(cpu_t *cp);
86 static void cpu_stats_kstat_destroy(cpu_t *cp);
87
88 static int cpu_sys_stats_ks_update(kstat_t *ksp, int rw);
89 static int cpu_vm_stats_ks_update(kstat_t *ksp, int rw);
90 static int cpu_stat_ks_update(kstat_t *ksp, int rw);
91 static int cpu_state_change_hooks(int, cpu_setup_t, cpu_setup_t);
92
93 /*
94 * cpu_lock protects ncpus, ncpus_online, cpu_flag, cpu_list, cpu_active,
95 * max_cpu_seqid_ever, and dispatch queue reallocations. The lock ordering with
96 * respect to related locks is:
97 *
98 * cpu_lock --> thread_free_lock ---> p_lock ---> thread_lock()
99 *
100 * Warning: Certain sections of code do not use the cpu_lock when
101 * traversing the cpu_list (e.g. mutex_vector_enter(), clock()). Since
102 * all cpus are paused during modifications to this list, a solution
103  * to protect the list is to either disable kernel preemption while
104 * walking the list, *or* recheck the cpu_next pointer at each
105 * iteration in the loop. Note that in no cases can any cached
106 * copies of the cpu pointers be kept as they may become invalid.
107 */
108 kmutex_t cpu_lock;
109 cpu_t *cpu_list; /* list of all CPUs */
110 cpu_t *clock_cpu_list; /* used by clock to walk CPUs */
111 cpu_t *cpu_active; /* list of active CPUs */
112 cpuset_t cpu_active_set; /* cached set of active CPUs */
113 static cpuset_t cpu_available; /* set of available CPUs */
114 cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */
115
116 cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */
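
/*
 * Illustrative sketch of the traversal rule described above (not compiled;
 * the helper name example_count_cpus is hypothetical).  Code that walks
 * cpu_list without holding cpu_lock disables kernel preemption for the
 * duration of the walk and keeps no cpu pointers afterwards:
 *
 *	static int
 *	example_count_cpus(void)
 *	{
 *		cpu_t *cp;
 *		int cnt = 0;
 *
 *		kpreempt_disable();
 *		cp = cpu_list;
 *		do {
 *			cnt++;
 *			cp = cp->cpu_next;
 *		} while (cp != cpu_list);
 *		kpreempt_enable();
 *		return (cnt);
 *	}
 */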
117
118 /*
119 * max_ncpus keeps the max cpus the system can have. Initially
120 * it's NCPU, but since most archs scan the devtree for cpus
121 * fairly early on during boot, the real max can be known before
122 * ncpus is set (useful for early NCPU based allocations).
123 */
124 int max_ncpus = NCPU;
125 /*
126  * Platforms that set max_ncpus to the maximum number of cpus that can be
127  * dynamically added will set boot_max_ncpus to the number of cpus found
128 * at device tree scan time during boot.
129 */
130 int boot_max_ncpus = -1;
131 int boot_ncpus = -1;
132 /*
133 * Maximum possible CPU id. This can never be >= NCPU since NCPU is
134 * used to size arrays that are indexed by CPU id.
135 */
136 processorid_t max_cpuid = NCPU - 1;
137
138 /*
139  * The maximum cpu_seqid ever given out. This number can only grow and never
140  * shrinks. It can be used to optimize NCPU loops by skipping CPUs which were
141  * never on-line (see the sketch below).
142 */
143 processorid_t max_cpu_seqid_ever = 0;
144
145 int ncpus = 1;
146 int ncpus_online = 1;
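
/*
 * Illustrative sketch of the loop optimization mentioned above (not
 * compiled).  An NCPU-style loop can be bounded by max_cpu_seqid_ever and
 * indexed through cpu_seq[]; entries may be NULL once a cpu has been
 * deleted, so each slot is checked.  The caller holds cpu_lock (or disables
 * preemption) for the duration:
 *
 *	int seqid;
 *	cpu_t *cp;
 *
 *	mutex_enter(&cpu_lock);
 *	for (seqid = 0; seqid <= max_cpu_seqid_ever; seqid++) {
 *		if ((cp = cpu_seq[seqid]) == NULL)
 *			continue;
 *		(do per-CPU work with cp)
 *	}
 *	mutex_exit(&cpu_lock);
 */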
147
148 /*
149 * CPU that we're trying to offline. Protected by cpu_lock.
150 */
151 cpu_t *cpu_inmotion;
152
153 /*
154  * Can be raised to suppress further weakbinding requests, which are instead
155 * satisfied by disabling preemption. Must be raised/lowered under cpu_lock,
156 * while individual thread weakbinding synchronization is done under thread
157 * lock.
158 */
159 int weakbindingbarrier;
160
161 /*
162 * Variables used in pause_cpus().
163 */
164 static volatile char safe_list[NCPU];
165
166 static struct _cpu_pause_info {
167 int cp_spl; /* spl saved in pause_cpus() */
168 volatile int cp_go; /* Go signal sent after all ready */
169 int cp_count; /* # of CPUs to pause */
170 ksema_t cp_sem; /* synch pause_cpus & cpu_pause */
171 kthread_id_t cp_paused;
172 void *(*cp_func)(void *);
173 } cpu_pause_info;
174
175 static kmutex_t pause_free_mutex;
176 static kcondvar_t pause_free_cv;
177
178
179 static struct cpu_sys_stats_ks_data {
180 kstat_named_t cpu_ticks_idle;
181 kstat_named_t cpu_ticks_user;
182 kstat_named_t cpu_ticks_kernel;
183 kstat_named_t cpu_ticks_wait;
184 kstat_named_t cpu_nsec_idle;
185 kstat_named_t cpu_nsec_user;
186 kstat_named_t cpu_nsec_kernel;
187 kstat_named_t cpu_nsec_dtrace;
188 kstat_named_t cpu_nsec_intr;
189 kstat_named_t cpu_load_intr;
190 kstat_named_t wait_ticks_io;
191 kstat_named_t dtrace_probes;
192 kstat_named_t bread;
193 kstat_named_t bwrite;
194 kstat_named_t lread;
195 kstat_named_t lwrite;
196 kstat_named_t phread;
197 kstat_named_t phwrite;
198 kstat_named_t pswitch;
199 kstat_named_t trap;
200 kstat_named_t intr;
201 kstat_named_t syscall;
202 kstat_named_t sysread;
203 kstat_named_t syswrite;
204 kstat_named_t sysfork;
205 kstat_named_t sysvfork;
206 kstat_named_t sysexec;
207 kstat_named_t readch;
208 kstat_named_t writech;
209 kstat_named_t rcvint;
210 kstat_named_t xmtint;
211 kstat_named_t mdmint;
212 kstat_named_t rawch;
213 kstat_named_t canch;
214 kstat_named_t outch;
215 kstat_named_t msg;
216 kstat_named_t sema;
217 kstat_named_t namei;
218 kstat_named_t ufsiget;
219 kstat_named_t ufsdirblk;
220 kstat_named_t ufsipage;
221 kstat_named_t ufsinopage;
222 kstat_named_t procovf;
223 kstat_named_t intrthread;
224 kstat_named_t intrblk;
225 kstat_named_t intrunpin;
226 kstat_named_t idlethread;
227 kstat_named_t inv_swtch;
228 kstat_named_t nthreads;
229 kstat_named_t cpumigrate;
230 kstat_named_t xcalls;
231 kstat_named_t mutex_adenters;
232 kstat_named_t rw_rdfails;
233 kstat_named_t rw_wrfails;
234 kstat_named_t modload;
235 kstat_named_t modunload;
236 kstat_named_t bawrite;
237 kstat_named_t iowait;
238 } cpu_sys_stats_ks_data_template = {
239 { "cpu_ticks_idle", KSTAT_DATA_UINT64 },
240 { "cpu_ticks_user", KSTAT_DATA_UINT64 },
241 { "cpu_ticks_kernel", KSTAT_DATA_UINT64 },
242 { "cpu_ticks_wait", KSTAT_DATA_UINT64 },
243 { "cpu_nsec_idle", KSTAT_DATA_UINT64 },
244 { "cpu_nsec_user", KSTAT_DATA_UINT64 },
245 { "cpu_nsec_kernel", KSTAT_DATA_UINT64 },
246 { "cpu_nsec_dtrace", KSTAT_DATA_UINT64 },
247 { "cpu_nsec_intr", KSTAT_DATA_UINT64 },
248 { "cpu_load_intr", KSTAT_DATA_UINT64 },
249 { "wait_ticks_io", KSTAT_DATA_UINT64 },
250 { "dtrace_probes", KSTAT_DATA_UINT64 },
251 { "bread", KSTAT_DATA_UINT64 },
252 { "bwrite", KSTAT_DATA_UINT64 },
253 { "lread", KSTAT_DATA_UINT64 },
254 { "lwrite", KSTAT_DATA_UINT64 },
255 { "phread", KSTAT_DATA_UINT64 },
256 { "phwrite", KSTAT_DATA_UINT64 },
257 { "pswitch", KSTAT_DATA_UINT64 },
258 { "trap", KSTAT_DATA_UINT64 },
259 { "intr", KSTAT_DATA_UINT64 },
260 { "syscall", KSTAT_DATA_UINT64 },
261 { "sysread", KSTAT_DATA_UINT64 },
262 { "syswrite", KSTAT_DATA_UINT64 },
263 { "sysfork", KSTAT_DATA_UINT64 },
264 { "sysvfork", KSTAT_DATA_UINT64 },
265 { "sysexec", KSTAT_DATA_UINT64 },
266 { "readch", KSTAT_DATA_UINT64 },
267 { "writech", KSTAT_DATA_UINT64 },
268 { "rcvint", KSTAT_DATA_UINT64 },
269 { "xmtint", KSTAT_DATA_UINT64 },
270 { "mdmint", KSTAT_DATA_UINT64 },
271 { "rawch", KSTAT_DATA_UINT64 },
272 { "canch", KSTAT_DATA_UINT64 },
273 { "outch", KSTAT_DATA_UINT64 },
274 { "msg", KSTAT_DATA_UINT64 },
275 { "sema", KSTAT_DATA_UINT64 },
276 { "namei", KSTAT_DATA_UINT64 },
277 { "ufsiget", KSTAT_DATA_UINT64 },
278 { "ufsdirblk", KSTAT_DATA_UINT64 },
279 { "ufsipage", KSTAT_DATA_UINT64 },
280 { "ufsinopage", KSTAT_DATA_UINT64 },
281 { "procovf", KSTAT_DATA_UINT64 },
282 { "intrthread", KSTAT_DATA_UINT64 },
283 { "intrblk", KSTAT_DATA_UINT64 },
284 { "intrunpin", KSTAT_DATA_UINT64 },
285 { "idlethread", KSTAT_DATA_UINT64 },
286 { "inv_swtch", KSTAT_DATA_UINT64 },
287 { "nthreads", KSTAT_DATA_UINT64 },
288 { "cpumigrate", KSTAT_DATA_UINT64 },
289 { "xcalls", KSTAT_DATA_UINT64 },
290 { "mutex_adenters", KSTAT_DATA_UINT64 },
291 { "rw_rdfails", KSTAT_DATA_UINT64 },
292 { "rw_wrfails", KSTAT_DATA_UINT64 },
293 { "modload", KSTAT_DATA_UINT64 },
294 { "modunload", KSTAT_DATA_UINT64 },
295 { "bawrite", KSTAT_DATA_UINT64 },
296 { "iowait", KSTAT_DATA_UINT64 },
297 };
298
299 static struct cpu_vm_stats_ks_data {
300 kstat_named_t pgrec;
301 kstat_named_t pgfrec;
302 kstat_named_t pgin;
303 kstat_named_t pgpgin;
304 kstat_named_t pgout;
305 kstat_named_t pgpgout;
306 kstat_named_t swapin;
307 kstat_named_t pgswapin;
308 kstat_named_t swapout;
309 kstat_named_t pgswapout;
310 kstat_named_t zfod;
311 kstat_named_t dfree;
312 kstat_named_t scan;
313 kstat_named_t rev;
314 kstat_named_t hat_fault;
315 kstat_named_t as_fault;
316 kstat_named_t maj_fault;
317 kstat_named_t cow_fault;
318 kstat_named_t prot_fault;
319 kstat_named_t softlock;
320 kstat_named_t kernel_asflt;
321 kstat_named_t pgrrun;
322 kstat_named_t execpgin;
323 kstat_named_t execpgout;
324 kstat_named_t execfree;
325 kstat_named_t anonpgin;
326 kstat_named_t anonpgout;
327 kstat_named_t anonfree;
328 kstat_named_t fspgin;
329 kstat_named_t fspgout;
330 kstat_named_t fsfree;
331 } cpu_vm_stats_ks_data_template = {
332 { "pgrec", KSTAT_DATA_UINT64 },
333 { "pgfrec", KSTAT_DATA_UINT64 },
334 { "pgin", KSTAT_DATA_UINT64 },
335 { "pgpgin", KSTAT_DATA_UINT64 },
336 { "pgout", KSTAT_DATA_UINT64 },
337 { "pgpgout", KSTAT_DATA_UINT64 },
338 { "swapin", KSTAT_DATA_UINT64 },
339 { "pgswapin", KSTAT_DATA_UINT64 },
340 { "swapout", KSTAT_DATA_UINT64 },
341 { "pgswapout", KSTAT_DATA_UINT64 },
342 { "zfod", KSTAT_DATA_UINT64 },
343 { "dfree", KSTAT_DATA_UINT64 },
344 { "scan", KSTAT_DATA_UINT64 },
345 { "rev", KSTAT_DATA_UINT64 },
346 { "hat_fault", KSTAT_DATA_UINT64 },
347 { "as_fault", KSTAT_DATA_UINT64 },
348 { "maj_fault", KSTAT_DATA_UINT64 },
349 { "cow_fault", KSTAT_DATA_UINT64 },
350 { "prot_fault", KSTAT_DATA_UINT64 },
351 { "softlock", KSTAT_DATA_UINT64 },
352 { "kernel_asflt", KSTAT_DATA_UINT64 },
353 { "pgrrun", KSTAT_DATA_UINT64 },
354 { "execpgin", KSTAT_DATA_UINT64 },
355 { "execpgout", KSTAT_DATA_UINT64 },
356 { "execfree", KSTAT_DATA_UINT64 },
357 { "anonpgin", KSTAT_DATA_UINT64 },
358 { "anonpgout", KSTAT_DATA_UINT64 },
359 { "anonfree", KSTAT_DATA_UINT64 },
360 { "fspgin", KSTAT_DATA_UINT64 },
361 { "fspgout", KSTAT_DATA_UINT64 },
362 { "fsfree", KSTAT_DATA_UINT64 },
363 };
364
365 /*
366 * Force the specified thread to migrate to the appropriate processor.
367 * Called with thread lock held, returns with it dropped.
368 */
369 static void
370 force_thread_migrate(kthread_id_t tp)
371 {
372 ASSERT(THREAD_LOCK_HELD(tp));
373 if (tp == curthread) {
374 THREAD_TRANSITION(tp);
375 CL_SETRUN(tp);
376 thread_unlock_nopreempt(tp);
377 swtch();
378 } else {
379 if (tp->t_state == TS_ONPROC) {
380 cpu_surrender(tp);
381 } else if (tp->t_state == TS_RUN) {
382 (void) dispdeq(tp);
383 setbackdq(tp);
384 }
385 thread_unlock(tp);
386 }
387 }
388
389 /*
390 * Set affinity for a specified CPU.
391 *
392 * Specifying a cpu_id of CPU_CURRENT, allowed _only_ when setting affinity for
393 * curthread, will set affinity to the CPU on which the thread is currently
394 * running. For other cpu_id values, the caller must ensure that the
395 * referenced CPU remains valid, which can be done by holding cpu_lock across
396 * this call.
397 *
398 * CPU affinity is guaranteed after return of thread_affinity_set(). If a
399 * caller setting affinity to CPU_CURRENT requires that its thread not migrate
400 * CPUs prior to a successful return, it should take extra precautions (such as
401  * its own call to kpreempt_disable) to ensure that safety.
402 *
403 * A CPU affinity reference count is maintained by thread_affinity_set and
404 * thread_affinity_clear (incrementing and decrementing it, respectively),
405 * maintaining CPU affinity while the count is non-zero, and allowing regions
406 * of code which require affinity to be nested.
407 */
408 void
409 thread_affinity_set(kthread_id_t t, int cpu_id)
410 {
411 cpu_t *cp;
412
413 ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL));
414
415 if (cpu_id == CPU_CURRENT) {
416 VERIFY3P(t, ==, curthread);
417 kpreempt_disable();
418 cp = CPU;
419 } else {
420 /*
421 * We should be asserting that cpu_lock is held here, but
422 * the NCA code doesn't acquire it. The following assert
423 * should be uncommented when the NCA code is fixed.
424 *
425 * ASSERT(MUTEX_HELD(&cpu_lock));
426 */
427 VERIFY((cpu_id >= 0) && (cpu_id < NCPU));
428 cp = cpu[cpu_id];
429
430 /* user must provide a good cpu_id */
431 VERIFY(cp != NULL);
432 }
433
434 /*
435 * If there is already a hard affinity requested, and this affinity
436 * conflicts with that, panic.
437 */
438 thread_lock(t);
439 if (t->t_affinitycnt > 0 && t->t_bound_cpu != cp) {
440 panic("affinity_set: setting %p but already bound to %p",
441 (void *)cp, (void *)t->t_bound_cpu);
442 }
443 t->t_affinitycnt++;
444 t->t_bound_cpu = cp;
445
446 /*
447 * Make sure we're running on the right CPU.
448 */
449 if (cp != t->t_cpu || t != curthread) {
450 ASSERT(cpu_id != CPU_CURRENT);
451 force_thread_migrate(t); /* drops thread lock */
452 } else {
453 thread_unlock(t);
454 }
455
456 if (cpu_id == CPU_CURRENT) {
457 kpreempt_enable();
458 }
459 }
460
461 /*
462 * Wrapper for backward compatibility.
463 */
464 void
465 affinity_set(int cpu_id)
466 {
467 thread_affinity_set(curthread, cpu_id);
468 }
469
470 /*
471 * Decrement the affinity reservation count and if it becomes zero,
472 * clear the CPU affinity for the current thread, or set it to the user's
473 * software binding request.
474 */
475 void
476 thread_affinity_clear(kthread_id_t t)
477 {
478 register processorid_t binding;
479
480 thread_lock(t);
481 if (--t->t_affinitycnt == 0) {
482 if ((binding = t->t_bind_cpu) == PBIND_NONE) {
483 /*
484 * Adjust disp_max_unbound_pri if necessary.
485 */
486 disp_adjust_unbound_pri(t);
487 t->t_bound_cpu = NULL;
488 if (t->t_cpu->cpu_part != t->t_cpupart) {
489 force_thread_migrate(t);
490 return;
491 }
492 } else {
493 t->t_bound_cpu = cpu[binding];
494 /*
495 * Make sure the thread is running on the bound CPU.
496 */
497 if (t->t_cpu != t->t_bound_cpu) {
498 force_thread_migrate(t);
499 return; /* already dropped lock */
500 }
501 }
502 }
503 thread_unlock(t);
504 }
505
506 /*
507 * Wrapper for backward compatibility.
508 */
509 void
510 affinity_clear(void)
511 {
512 thread_affinity_clear(curthread);
513 }
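
/*
 * Illustrative sketch of the reference-counted affinity interface above
 * (not compiled).  Because thread_affinity_set() and thread_affinity_clear()
 * nest, a region that must stay on the current CPU can be written as:
 *
 *	thread_affinity_set(curthread, CPU_CURRENT);
 *	(code that must not change CPU; may nest further set/clear pairs)
 *	thread_affinity_clear(curthread);
 *
 * or, via the backward-compatibility wrappers, affinity_set(CPU_CURRENT)
 * and affinity_clear().  For a cpu_id other than CPU_CURRENT the caller
 * holds cpu_lock across thread_affinity_set() so the referenced CPU cannot
 * disappear.
 */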
514
515 /*
516 * Weak cpu affinity. Bind to the "current" cpu for short periods
517 * of time during which the thread must not block (but may be preempted).
518 * Use this instead of kpreempt_disable() when it is only "no migration"
519 * rather than "no preemption" semantics that are required - disabling
520  * preemption holds higher priority threads off the cpu, and if the
521  * operation being protected is more than momentary this is not good
522 * for realtime etc.
523 *
524 * Weakly bound threads will not prevent a cpu from being offlined -
525 * we'll only run them on the cpu to which they are weakly bound but
526 * (because they do not block) we'll always be able to move them on to
527 * another cpu at offline time if we give them just a short moment to
528 * run during which they will unbind. To give a cpu a chance of offlining,
529 * however, we require a barrier to weak bindings that may be raised for a
530 * given cpu (offline/move code may set this and then wait a short time for
531 * existing weak bindings to drop); the cpu_inmotion pointer is that barrier.
532 *
533 * There are few restrictions on the calling context of thread_nomigrate.
534 * The caller must not hold the thread lock. Calls may be nested.
535 *
536 * After weakbinding a thread must not perform actions that may block.
537 * In particular it must not call thread_affinity_set; calling that when
538 * already weakbound is nonsensical anyway.
539 *
540 * If curthread is prevented from migrating for other reasons
541 * (kernel preemption disabled; high pil; strongly bound; interrupt thread)
542 * then the weak binding will succeed even if this cpu is the target of an
543 * offline/move request.
544 */
545 void
546 thread_nomigrate(void)
547 {
548 cpu_t *cp;
549 kthread_id_t t = curthread;
550
551 again:
552 kpreempt_disable();
553 cp = CPU;
554
555 /*
556 * A highlevel interrupt must not modify t_nomigrate or
557 * t_weakbound_cpu of the thread it has interrupted. A lowlevel
558 * interrupt thread cannot migrate and we can avoid the
559 * thread_lock call below by short-circuiting here. In either
560 * case we can just return since no migration is possible and
561 * the condition will persist (ie, when we test for these again
562 * in thread_allowmigrate they can't have changed). Migration
563 * is also impossible if we're at or above DISP_LEVEL pil.
564 */
565 if (CPU_ON_INTR(cp) || t->t_flag & T_INTR_THREAD ||
566 getpil() >= DISP_LEVEL) {
567 kpreempt_enable();
568 return;
569 }
570
571 /*
572 * We must be consistent with existing weak bindings. Since we
573 * may be interrupted between the increment of t_nomigrate and
574 * the store to t_weakbound_cpu below we cannot assume that
575 * t_weakbound_cpu will be set if t_nomigrate is. Note that we
576 * cannot assert t_weakbound_cpu == t_bind_cpu since that is not
577 * always the case.
578 */
579 if (t->t_nomigrate && t->t_weakbound_cpu && t->t_weakbound_cpu != cp) {
580 if (!panicstr)
581 panic("thread_nomigrate: binding to %p but already "
582 "bound to %p", (void *)cp,
583 (void *)t->t_weakbound_cpu);
584 }
585
586 /*
587 * At this point we have preemption disabled and we don't yet hold
588 * the thread lock. So it's possible that somebody else could
589 * set t_bind_cpu here and not be able to force us across to the
590 * new cpu (since we have preemption disabled).
591 */
592 thread_lock(curthread);
593
594 /*
595 * If further weak bindings are being (temporarily) suppressed then
596 * we'll settle for disabling kernel preemption (which assures
597 * no migration provided the thread does not block which it is
598 * not allowed to if using thread_nomigrate). We must remember
599 * this disposition so we can take appropriate action in
600 * thread_allowmigrate. If this is a nested call and the
601 * thread is already weakbound then fall through as normal.
602 * We remember the decision to settle for kpreempt_disable through
603 * negative nesting counting in t_nomigrate. Once a thread has had one
604 * weakbinding request satisfied in this way any further (nested)
605 * requests will continue to be satisfied in the same way,
606 * even if weak bindings have recommenced.
607 */
608 	if (t->t_nomigrate < 0 || (weakbindingbarrier && t->t_nomigrate == 0)) {
609 --t->t_nomigrate;
610 thread_unlock(curthread);
611 return; /* with kpreempt_disable still active */
612 }
613
614 /*
615 * We hold thread_lock so t_bind_cpu cannot change. We could,
616 	 * however, be running on a cpu other than the one we are strongbound
617 * to (as explained above). If we grant the weak binding request
618 * in that case then the dispatcher must favour our weak binding
619 * over our strong (in which case, just as when preemption is
620 * disabled, we can continue to run on a cpu other than the one to
621 * which we are strongbound; the difference in this case is that
622 * this thread can be preempted and so can appear on the dispatch
623 * queues of a cpu other than the one it is strongbound to).
624 *
625 * If the cpu we are running on does not appear to be a current
626 * offline target (we check cpu_inmotion to determine this - since
627 * we don't hold cpu_lock we may not see a recent store to that,
628 * so it's possible that we at times can grant a weak binding to a
629 * cpu that is an offline target, but that one request will not
630 * prevent the offline from succeeding) then we will always grant
631 * the weak binding request. This includes the case above where
632 * we grant a weakbinding not commensurate with our strong binding.
633 *
634 * If our cpu does appear to be an offline target then we're inclined
635 * not to grant the weakbinding request just yet - we'd prefer to
636 * migrate to another cpu and grant the request there. The
637 * exceptions are those cases where going through preemption code
638 * will not result in us changing cpu:
639 *
640 * . interrupts have already bypassed this case (see above)
641 * . we are already weakbound to this cpu (dispatcher code will
642 * always return us to the weakbound cpu)
643 * . preemption was disabled even before we disabled it above
644 * . we are strongbound to this cpu (if we're strongbound to
645 * another and not yet running there the trip through the
646 * dispatcher will move us to the strongbound cpu and we
647 * will grant the weak binding there)
648 */
649 if (cp != cpu_inmotion || t->t_nomigrate > 0 || t->t_preempt > 1 ||
650 t->t_bound_cpu == cp) {
651 /*
652 * Don't be tempted to store to t_weakbound_cpu only on
653 * the first nested bind request - if we're interrupted
654 * after the increment of t_nomigrate and before the
655 * store to t_weakbound_cpu and the interrupt calls
656 * thread_nomigrate then the assertion in thread_allowmigrate
657 * would fail.
658 */
659 t->t_nomigrate++;
660 t->t_weakbound_cpu = cp;
661 membar_producer();
662 thread_unlock(curthread);
663 /*
664 * Now that we have dropped the thread_lock another thread
665 		 * can set our t_bind_cpu, and will try to migrate us
666 * to the strongbound cpu (which will not be prevented by
667 * preemption being disabled since we're about to enable
668 * preemption). We have granted the weakbinding to the current
669 		 * cpu, so again we are in the position that it is possible
670 * that our weak and strong bindings differ. Again this
671 * is catered for by dispatcher code which will favour our
672 * weak binding.
673 */
674 kpreempt_enable();
675 } else {
676 /*
677 * Move to another cpu before granting the request by
678 * forcing this thread through preemption code. When we
679 * get to set{front,back}dq called from CL_PREEMPT()
680 * cpu_choose() will be used to select a cpu to queue
681 * us on - that will see cpu_inmotion and take
682 * steps to avoid returning us to this cpu.
683 */
684 cp->cpu_kprunrun = 1;
685 thread_unlock(curthread);
686 kpreempt_enable(); /* will call preempt() */
687 goto again;
688 }
689 }
690
691 void
692 thread_allowmigrate(void)
693 {
694 kthread_id_t t = curthread;
695
696 ASSERT(t->t_weakbound_cpu == CPU ||
697 (t->t_nomigrate < 0 && t->t_preempt > 0) ||
698 CPU_ON_INTR(CPU) || t->t_flag & T_INTR_THREAD ||
699 getpil() >= DISP_LEVEL);
700
701 if (CPU_ON_INTR(CPU) || (t->t_flag & T_INTR_THREAD) ||
702 getpil() >= DISP_LEVEL)
703 return;
704
705 if (t->t_nomigrate < 0) {
706 /*
707 * This thread was granted "weak binding" in the
708 * stronger form of kernel preemption disabling.
709 * Undo a level of nesting for both t_nomigrate
710 * and t_preempt.
711 */
712 ++t->t_nomigrate;
713 kpreempt_enable();
714 } else if (--t->t_nomigrate == 0) {
715 /*
716 * Time to drop the weak binding. We need to cater
717 * for the case where we're weakbound to a different
718 * cpu than that to which we're strongbound (a very
719 * temporary arrangement that must only persist until
720 * weak binding drops). We don't acquire thread_lock
721 * here so even as this code executes t_bound_cpu
722 * may be changing. So we disable preemption and
723 * a) in the case that t_bound_cpu changes while we
724 * have preemption disabled kprunrun will be set
725 * asynchronously, and b) if before disabling
726 * preemption we were already on a different cpu to
727 * our t_bound_cpu then we set kprunrun ourselves
728 * to force a trip through the dispatcher when
729 * preemption is enabled.
730 */
731 kpreempt_disable();
732 if (t->t_bound_cpu &&
733 t->t_weakbound_cpu != t->t_bound_cpu)
734 CPU->cpu_kprunrun = 1;
735 t->t_weakbound_cpu = NULL;
736 membar_producer();
737 kpreempt_enable();
738 }
739 }
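
/*
 * Illustrative sketch of the weak-binding pair above (not compiled).
 * thread_nomigrate() keeps the caller on its current cpu without disabling
 * preemption, and thread_allowmigrate() undoes one level of nesting; the
 * protected region may be preempted but must not block:
 *
 *	thread_nomigrate();
 *	(short per-CPU work; may be preempted, must not block)
 *	thread_allowmigrate();
 *
 * Calls may nest; migration becomes possible again only when the outermost
 * thread_allowmigrate() returns.
 */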
740
741 /*
742 * weakbinding_stop can be used to temporarily cause weakbindings made
743 * with thread_nomigrate to be satisfied through the stronger action of
744 * kpreempt_disable. weakbinding_start recommences normal weakbinding.
745 */
746
747 void
748 weakbinding_stop(void)
749 {
750 ASSERT(MUTEX_HELD(&cpu_lock));
751 weakbindingbarrier = 1;
752 membar_producer(); /* make visible before subsequent thread_lock */
753 }
754
755 void
756 weakbinding_start(void)
757 {
758 ASSERT(MUTEX_HELD(&cpu_lock));
759 weakbindingbarrier = 0;
760 }
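
/*
 * Illustrative sketch (not compiled): code such as the cpu offline/move
 * path mentioned earlier raises the barrier under cpu_lock, gives existing
 * weak bindings a short time to drain, and then lowers it again:
 *
 *	mutex_enter(&cpu_lock);
 *	weakbinding_stop();
 *	(retry/delay while existing weak bindings drain)
 *	weakbinding_start();
 *	mutex_exit(&cpu_lock);
 */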
761
762 void
763 null_xcall(void)
764 {
765 }
766
767 /*
768 * This routine is called to place the CPUs in a safe place so that
769 * one of them can be taken off line or placed on line. What we are
770 * trying to do here is prevent a thread from traversing the list
771 * of active CPUs while we are changing it or from getting placed on
772 * the run queue of a CPU that has just gone off line. We do this by
773 * creating a thread with the highest possible prio for each CPU and
774 * having it call this routine. The advantage of this method is that
775 * we can eliminate all checks for CPU_ACTIVE in the disp routines.
776 * This makes disp faster at the expense of making p_online() slower
777 * which is a good trade off.
778 */
779 static void
780 cpu_pause(int index)
781 {
782 int s;
783 struct _cpu_pause_info *cpi = &cpu_pause_info;
784 volatile char *safe = &safe_list[index];
785 long lindex = index;
786
787 ASSERT((curthread->t_bound_cpu != NULL) || (*safe == PAUSE_DIE));
788
789 while (*safe != PAUSE_DIE) {
790 *safe = PAUSE_READY;
791 membar_enter(); /* make sure stores are flushed */
792 sema_v(&cpi->cp_sem); /* signal requesting thread */
793
794 /*
795 * Wait here until all pause threads are running. That
796 * indicates that it's safe to do the spl. Until
797 * cpu_pause_info.cp_go is set, we don't want to spl
798 * because that might block clock interrupts needed
799 * to preempt threads on other CPUs.
800 */
801 while (cpi->cp_go == 0)
802 ;
803 /*
804 * Even though we are at the highest disp prio, we need
805 * to block out all interrupts below LOCK_LEVEL so that
806 * an intr doesn't come in, wake up a thread, and call
807 * setbackdq/setfrontdq.
808 */
809 s = splhigh();
810 /*
811 		 * If cp_func has been set, call it using index as the
812 		 * argument; this is currently only used by cpr_suspend_cpus().
813 		 * cp_func provides the code to execute on the "paused" CPUs
814 		 * when a machine comes out of a sleep state in which CPUs
815 		 * were powered off (this could also be used for hot-plugging
816 		 * CPUs).
817 */
818 if (cpi->cp_func != NULL)
819 (*cpi->cp_func)((void *)lindex);
820
821 mach_cpu_pause(safe);
822
823 splx(s);
824 /*
825 * Waiting is at an end. Switch out of cpu_pause
826 * loop and resume useful work.
827 */
828 swtch();
829 }
830
831 mutex_enter(&pause_free_mutex);
832 *safe = PAUSE_DEAD;
833 cv_broadcast(&pause_free_cv);
834 mutex_exit(&pause_free_mutex);
835 }
836
837 /*
838 * Allow the cpus to start running again.
839 */
840 void
841 start_cpus()
842 {
843 int i;
844
845 ASSERT(MUTEX_HELD(&cpu_lock));
846 ASSERT(cpu_pause_info.cp_paused);
847 cpu_pause_info.cp_paused = NULL;
848 for (i = 0; i < NCPU; i++)
849 safe_list[i] = PAUSE_IDLE;
850 membar_enter(); /* make sure stores are flushed */
851 affinity_clear();
852 splx(cpu_pause_info.cp_spl);
853 kpreempt_enable();
854 }
855
856 /*
857 * Allocate a pause thread for a CPU.
858 */
859 static void
860 cpu_pause_alloc(cpu_t *cp)
861 {
862 kthread_id_t t;
863 long cpun = cp->cpu_id;
864
865 /*
866 * Note, v.v_nglobpris will not change value as long as I hold
867 * cpu_lock.
868 */
869 t = thread_create(NULL, 0, cpu_pause, (void *)cpun,
870 0, &p0, TS_STOPPED, v.v_nglobpris - 1);
871 thread_lock(t);
872 t->t_bound_cpu = cp;
873 t->t_disp_queue = cp->cpu_disp;
874 t->t_affinitycnt = 1;
875 t->t_preempt = 1;
876 thread_unlock(t);
877 cp->cpu_pause_thread = t;
878 /*
879 * Registering a thread in the callback table is usually done
880 * in the initialization code of the thread. In this
881 * case, we do it right after thread creation because the
882 * thread itself may never run, and we need to register the
883 * fact that it is safe for cpr suspend.
884 */
885 CALLB_CPR_INIT_SAFE(t, "cpu_pause");
886 }
887
888 /*
889 * Free a pause thread for a CPU.
890 */
891 static void
892 cpu_pause_free(cpu_t *cp)
893 {
894 kthread_id_t t;
895 int cpun = cp->cpu_id;
896
897 ASSERT(MUTEX_HELD(&cpu_lock));
898 /*
899 * We have to get the thread and tell it to die.
900 */
901 if ((t = cp->cpu_pause_thread) == NULL) {
902 ASSERT(safe_list[cpun] == PAUSE_IDLE);
903 return;
904 }
905 thread_lock(t);
906 t->t_cpu = CPU; /* disp gets upset if last cpu is quiesced. */
907 t->t_bound_cpu = NULL; /* Must un-bind; cpu may not be running. */
908 t->t_pri = v.v_nglobpris - 1;
909 ASSERT(safe_list[cpun] == PAUSE_IDLE);
910 safe_list[cpun] = PAUSE_DIE;
911 THREAD_TRANSITION(t);
912 setbackdq(t);
913 thread_unlock_nopreempt(t);
914
915 /*
916 * If we don't wait for the thread to actually die, it may try to
917 * run on the wrong cpu as part of an actual call to pause_cpus().
918 */
919 mutex_enter(&pause_free_mutex);
920 while (safe_list[cpun] != PAUSE_DEAD) {
921 cv_wait(&pause_free_cv, &pause_free_mutex);
922 }
923 mutex_exit(&pause_free_mutex);
924 safe_list[cpun] = PAUSE_IDLE;
925
926 cp->cpu_pause_thread = NULL;
927 }
928
929 /*
930 * Initialize basic structures for pausing CPUs.
931 */
932 void
933 cpu_pause_init()
934 {
935 sema_init(&cpu_pause_info.cp_sem, 0, NULL, SEMA_DEFAULT, NULL);
936 /*
937 * Create initial CPU pause thread.
938 */
939 cpu_pause_alloc(CPU);
940 }
941
942 /*
943 * Start the threads used to pause another CPU.
944 */
945 static int
946 cpu_pause_start(processorid_t cpu_id)
947 {
948 int i;
949 int cpu_count = 0;
950
951 for (i = 0; i < NCPU; i++) {
952 cpu_t *cp;
953 kthread_id_t t;
954
955 cp = cpu[i];
956 if (!CPU_IN_SET(cpu_available, i) || (i == cpu_id)) {
957 safe_list[i] = PAUSE_WAIT;
958 continue;
959 }
960
961 /*
962 * Skip CPU if it is quiesced or not yet started.
963 */
964 if ((cp->cpu_flags & (CPU_QUIESCED | CPU_READY)) != CPU_READY) {
965 safe_list[i] = PAUSE_WAIT;
966 continue;
967 }
968
969 /*
970 * Start this CPU's pause thread.
971 */
972 t = cp->cpu_pause_thread;
973 thread_lock(t);
974 /*
975 * Reset the priority, since nglobpris may have
976 * changed since the thread was created, if someone
977 * has loaded the RT (or some other) scheduling
978 * class.
979 */
980 t->t_pri = v.v_nglobpris - 1;
981 THREAD_TRANSITION(t);
982 setbackdq(t);
983 thread_unlock_nopreempt(t);
984 ++cpu_count;
985 }
986 return (cpu_count);
987 }
988
989
990 /*
991 * Pause all of the CPUs except the one we are on by creating a high
992 * priority thread bound to those CPUs.
993 *
994 * Note that one must be extremely careful regarding code
995 * executed while CPUs are paused. Since a CPU may be paused
996 * while a thread scheduling on that CPU is holding an adaptive
997 * lock, code executed with CPUs paused must not acquire adaptive
998 * (or low-level spin) locks. Also, such code must not block,
999 * since the thread that is supposed to initiate the wakeup may
1000 * never run.
1001 *
1002 * With a few exceptions, the restrictions on code executed with CPUs
1003 * paused match those for code executed at high-level interrupt
1004 * context.
1005 */
1006 void
1007 pause_cpus(cpu_t *off_cp, void *(*func)(void *))
1008 {
1009 processorid_t cpu_id;
1010 int i;
1011 struct _cpu_pause_info *cpi = &cpu_pause_info;
1012
1013 ASSERT(MUTEX_HELD(&cpu_lock));
1014 ASSERT(cpi->cp_paused == NULL);
1015 cpi->cp_count = 0;
1016 cpi->cp_go = 0;
1017 for (i = 0; i < NCPU; i++)
1018 safe_list[i] = PAUSE_IDLE;
1019 kpreempt_disable();
1020
1021 cpi->cp_func = func;
1022
1023 /*
1024 * If running on the cpu that is going offline, get off it.
1025 * This is so that it won't be necessary to rechoose a CPU
1026 * when done.
1027 */
1028 if (CPU == off_cp)
1029 cpu_id = off_cp->cpu_next_part->cpu_id;
1030 else
1031 cpu_id = CPU->cpu_id;
1032 affinity_set(cpu_id);
1033
1034 /*
1035 * Start the pause threads and record how many were started
1036 */
1037 cpi->cp_count = cpu_pause_start(cpu_id);
1038
1039 /*
1040 * Now wait for all CPUs to be running the pause thread.
1041 */
1042 while (cpi->cp_count > 0) {
1043 /*
1044 * Spin reading the count without grabbing the disp
1045 * lock to make sure we don't prevent the pause
1046 * threads from getting the lock.
1047 */
1048 while (sema_held(&cpi->cp_sem))
1049 ;
1050 if (sema_tryp(&cpi->cp_sem))
1051 --cpi->cp_count;
1052 }
1053 cpi->cp_go = 1; /* all have reached cpu_pause */
1054
1055 /*
1056 * Now wait for all CPUs to spl. (Transition from PAUSE_READY
1057 * to PAUSE_WAIT.)
1058 */
1059 for (i = 0; i < NCPU; i++) {
1060 while (safe_list[i] != PAUSE_WAIT)
1061 ;
1062 }
1063 cpi->cp_spl = splhigh(); /* block dispatcher on this CPU */
1064 cpi->cp_paused = curthread;
1065 }
1066
1067 /*
1068 * Check whether the current thread has CPUs paused
1069 */
1070 int
1071 cpus_paused(void)
1072 {
1073 if (cpu_pause_info.cp_paused != NULL) {
1074 ASSERT(cpu_pause_info.cp_paused == curthread);
1075 return (1);
1076 }
1077 return (0);
1078 }
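
/*
 * Illustrative sketch of the pause protocol used throughout this file (not
 * compiled).  The caller holds cpu_lock, pauses the other CPUs, does brief
 * work that neither blocks nor acquires adaptive locks, and then undoes
 * everything in reverse order:
 *
 *	mutex_enter(&cpu_lock);
 *	pause_cpus(NULL, NULL);		(no off-going CPU, no cp_func callback)
 *	(brief, non-blocking update of CPU lists or dispatch structures)
 *	start_cpus();
 *	mutex_exit(&cpu_lock);
 */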
1079
1080 static cpu_t *
1081 cpu_get_all(processorid_t cpun)
1082 {
1083 ASSERT(MUTEX_HELD(&cpu_lock));
1084
1085 if (cpun >= NCPU || cpun < 0 || !CPU_IN_SET(cpu_available, cpun))
1086 return (NULL);
1087 return (cpu[cpun]);
1088 }
1089
1090 /*
1091 * Check whether cpun is a valid processor id and whether it should be
1092 * visible from the current zone. If it is, return a pointer to the
1093 * associated CPU structure.
1094 */
1095 cpu_t *
1096 cpu_get(processorid_t cpun)
1097 {
1098 cpu_t *c;
1099
1100 ASSERT(MUTEX_HELD(&cpu_lock));
1101 c = cpu_get_all(cpun);
1102 if (c != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
1103 zone_pset_get(curproc->p_zone) != cpupart_query_cpu(c))
1104 return (NULL);
1105 return (c);
1106 }
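
/*
 * Illustrative sketch (not compiled): looking up a CPU by processor id with
 * cpu_get().  cpu_lock must be held; a NULL return means the id is out of
 * range, the CPU is not available, or it is not visible from the caller's
 * zone:
 *
 *	mutex_enter(&cpu_lock);
 *	if ((cp = cpu_get(cpun)) != NULL) {
 *		(operate on cp while cpu_lock is held)
 *	}
 *	mutex_exit(&cpu_lock);
 */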
1107
1108 /*
1109 * The following functions should be used to check CPU states in the kernel.
1110 * They should be invoked with cpu_lock held. Kernel subsystems interested
1111 * in CPU states should *not* use cpu_get_state() and various P_ONLINE/etc
1112 * states. Those are for user-land (and system call) use only.
1113 */
1114
1115 /*
1116 * Determine whether the CPU is online and handling interrupts.
1117 */
1118 int
1119 cpu_is_online(cpu_t *cpu)
1120 {
1121 ASSERT(MUTEX_HELD(&cpu_lock));
1122 return (cpu_flagged_online(cpu->cpu_flags));
1123 }
1124
1125 /*
1126 * Determine whether the CPU is offline (this includes spare and faulted).
1127 */
1128 int
1129 cpu_is_offline(cpu_t *cpu)
1130 {
1131 ASSERT(MUTEX_HELD(&cpu_lock));
1132 return (cpu_flagged_offline(cpu->cpu_flags));
1133 }
1134
1135 /*
1136 * Determine whether the CPU is powered off.
1137 */
1138 int
1139 cpu_is_poweredoff(cpu_t *cpu)
1140 {
1141 ASSERT(MUTEX_HELD(&cpu_lock));
1142 return (cpu_flagged_poweredoff(cpu->cpu_flags));
1143 }
1144
1145 /*
1146 * Determine whether the CPU is handling interrupts.
1147 */
1148 int
1149 cpu_is_nointr(cpu_t *cpu)
1150 {
1151 ASSERT(MUTEX_HELD(&cpu_lock));
1152 return (cpu_flagged_nointr(cpu->cpu_flags));
1153 }
1154
1155 /*
1156 * Determine whether the CPU is active (scheduling threads).
1157 */
1158 int
1159 cpu_is_active(cpu_t *cpu)
1160 {
1161 ASSERT(MUTEX_HELD(&cpu_lock));
1162 return (cpu_flagged_active(cpu->cpu_flags));
1163 }
1164
1165 /*
1166 * Same as above, but these require cpu_flags instead of cpu_t pointers.
1167 */
1168 int
1169 cpu_flagged_online(cpu_flag_t cpu_flags)
1170 {
1171 return (cpu_flagged_active(cpu_flags) &&
1172 (cpu_flags & CPU_ENABLE));
1173 }
1174
1175 int
1176 cpu_flagged_offline(cpu_flag_t cpu_flags)
1177 {
1178 return (((cpu_flags & CPU_POWEROFF) == 0) &&
1179 ((cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY));
1180 }
1181
1182 int
1183 cpu_flagged_poweredoff(cpu_flag_t cpu_flags)
1184 {
1185 return ((cpu_flags & CPU_POWEROFF) == CPU_POWEROFF);
1186 }
1187
1188 int
1189 cpu_flagged_nointr(cpu_flag_t cpu_flags)
1190 {
1191 return (cpu_flagged_active(cpu_flags) &&
1192 (cpu_flags & CPU_ENABLE) == 0);
1193 }
1194
1195 int
1196 cpu_flagged_active(cpu_flag_t cpu_flags)
1197 {
1198 return (((cpu_flags & (CPU_POWEROFF | CPU_FAULTED | CPU_SPARE)) == 0) &&
1199 ((cpu_flags & (CPU_READY | CPU_OFFLINE)) == CPU_READY));
1200 }
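
/*
 * Illustrative sketch (not compiled): kernel code tests CPU state with the
 * predicates above, under cpu_lock, rather than with the user-level
 * P_ONLINE-style states:
 *
 *	mutex_enter(&cpu_lock);
 *	if (cpu_is_active(cp)) {
 *		(cp is scheduling threads)
 *	}
 *	mutex_exit(&cpu_lock);
 *
 * The cpu_flagged_*() forms take a snapshot of cpu_flags rather than a
 * cpu_t pointer, for callers that have already copied the flags.
 */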
1201
1202 /*
1203 * Bring the indicated CPU online.
1204 */
1205 int
1206 cpu_online(cpu_t *cp)
1207 {
1208 int error = 0;
1209
1210 /*
1211 * Handle on-line request.
1212 * This code must put the new CPU on the active list before
1213 * starting it because it will not be paused, and will start
1214 * using the active list immediately. The real start occurs
1215 * when the CPU_QUIESCED flag is turned off.
1216 */
1217
1218 ASSERT(MUTEX_HELD(&cpu_lock));
1219
1220 /*
1221 * Put all the cpus into a known safe place.
1222 * No mutexes can be entered while CPUs are paused.
1223 */
1224 error = mp_cpu_start(cp); /* arch-dep hook */
1225 if (error == 0) {
1226 pg_cpupart_in(cp, cp->cpu_part);
1227 pause_cpus(NULL, NULL);
1228 cpu_add_active_internal(cp);
1229 if (cp->cpu_flags & CPU_FAULTED) {
1230 cp->cpu_flags &= ~CPU_FAULTED;
1231 mp_cpu_faulted_exit(cp);
1232 }
1233 cp->cpu_flags &= ~(CPU_QUIESCED | CPU_OFFLINE | CPU_FROZEN |
1234 CPU_SPARE);
1235 CPU_NEW_GENERATION(cp);
1236 start_cpus();
1237 cpu_stats_kstat_create(cp);
1238 cpu_create_intrstat(cp);
1239 lgrp_kstat_create(cp);
1240 cpu_state_change_notify(cp->cpu_id, CPU_ON);
1241 cpu_intr_enable(cp); /* arch-dep hook */
1242 cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON);
1243 cpu_set_state(cp);
1244 cyclic_online(cp);
1245 /*
1246 * This has to be called only after cyclic_online(). This
1247 * function uses cyclics.
1248 */
1249 callout_cpu_online(cp);
1250 poke_cpu(cp->cpu_id);
1251 }
1252
1253 return (error);
1254 }
1255
1256 /*
1257 * Take the indicated CPU offline.
1258 */
1259 int
1260 cpu_offline(cpu_t *cp, int flags)
1261 {
1262 cpupart_t *pp;
1263 int error = 0;
1264 cpu_t *ncp;
1265 int intr_enable;
1266 int cyclic_off = 0;
1267 int callout_off = 0;
1268 int loop_count;
1269 int no_quiesce = 0;
1270 int (*bound_func)(struct cpu *, int);
1271 kthread_t *t;
1272 lpl_t *cpu_lpl;
1273 proc_t *p;
1274 int lgrp_diff_lpl;
1275 boolean_t unbind_all_threads = (flags & CPU_FORCED) != 0;
1276
1277 ASSERT(MUTEX_HELD(&cpu_lock));
1278
1279 /*
1280 * If we're going from faulted or spare to offline, just
1281 * clear these flags and update CPU state.
1282 */
1283 if (cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) {
1284 if (cp->cpu_flags & CPU_FAULTED) {
1285 cp->cpu_flags &= ~CPU_FAULTED;
1286 mp_cpu_faulted_exit(cp);
1287 }
1288 cp->cpu_flags &= ~CPU_SPARE;
1289 cpu_set_state(cp);
1290 return (0);
1291 }
1292
1293 /*
1294 * Handle off-line request.
1295 */
1296 pp = cp->cpu_part;
1297 /*
1298 * Don't offline last online CPU in partition
1299 */
1300 if (ncpus_online <= 1 || pp->cp_ncpus <= 1 || cpu_intr_count(cp) < 2)
1301 return (EBUSY);
1302 /*
1303 * Unbind all soft-bound threads bound to our CPU and hard bound threads
1304 * if we were asked to.
1305 */
1306 error = cpu_unbind(cp->cpu_id, unbind_all_threads);
1307 if (error != 0)
1308 return (error);
1309 /*
1310 * We shouldn't be bound to this CPU ourselves.
1311 */
1312 if (curthread->t_bound_cpu == cp)
1313 return (EBUSY);
1314
1315 /*
1316 * Tell interested parties that this CPU is going offline.
1317 */
1318 CPU_NEW_GENERATION(cp);
1319 cpu_state_change_notify(cp->cpu_id, CPU_OFF);
1320
1321 /*
1322 * Tell the PG subsystem that the CPU is leaving the partition
1323 */
1324 pg_cpupart_out(cp, pp);
1325
1326 /*
1327 * Take the CPU out of interrupt participation so we won't find
1328 * bound kernel threads. If the architecture cannot completely
1329 * shut off interrupts on the CPU, don't quiesce it, but don't
1330 	 * run anything but interrupt threads... this is indicated by
1331 	 * the CPU_OFFLINE flag being set but the CPU_QUIESCED flag being
1332 	 * clear.
1333 */
1334 intr_enable = cp->cpu_flags & CPU_ENABLE;
1335 if (intr_enable)
1336 no_quiesce = cpu_intr_disable(cp);
1337
1338 /*
1339 * Record that we are aiming to offline this cpu. This acts as
1340 * a barrier to further weak binding requests in thread_nomigrate
1341 * and also causes cpu_choose, disp_lowpri_cpu and setfrontdq to
1342 * lean away from this cpu. Further strong bindings are already
1343 * avoided since we hold cpu_lock. Since threads that are set
1344 * runnable around now and others coming off the target cpu are
1345 * directed away from the target, existing strong and weak bindings
1346 * (especially the latter) to the target cpu stand maximum chance of
1347 * being able to unbind during the short delay loop below (if other
1348 	 * unbound threads compete they may not see the cpu in time to unbind
1349 	 * even if they would do so immediately).
1350 */
1351 cpu_inmotion = cp;
1352 membar_enter();
1353
1354 /*
1355 * Check for kernel threads (strong or weak) bound to that CPU.
1356 * Strongly bound threads may not unbind, and we'll have to return
1357 * EBUSY. Weakly bound threads should always disappear - we've
1358 * stopped more weak binding with cpu_inmotion and existing
1359 * bindings will drain imminently (they may not block). Nonetheless
1360 * we will wait for a fixed period for all bound threads to disappear.
1361 * Inactive interrupt threads are OK (they'll be in TS_FREE
1362 * state). If test finds some bound threads, wait a few ticks
1363 * to give short-lived threads (such as interrupts) chance to
1364 * complete. Note that if no_quiesce is set, i.e. this cpu
1365 * is required to service interrupts, then we take the route
1366 * that permits interrupt threads to be active (or bypassed).
1367 */
1368 bound_func = no_quiesce ? disp_bound_threads : disp_bound_anythreads;
1369
1370 again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
1371 if (loop_count >= 5) {
1372 error = EBUSY; /* some threads still bound */
1373 break;
1374 }
1375
1376 /*
1377 * If some threads were assigned, give them
1378 * a chance to complete or move.
1379 *
1380 * This assumes that the clock_thread is not bound
1381 * to any CPU, because the clock_thread is needed to
1382 * do the delay(hz/100).
1383 *
1384 * Note: we still hold the cpu_lock while waiting for
1385 * the next clock tick. This is OK since it isn't
1386 * needed for anything else except processor_bind(2),
1387 * and system initialization. If we drop the lock,
1388 * we would risk another p_online disabling the last
1389 * processor.
1390 */
1391 delay(hz/100);
1392 }
1393
1394 if (error == 0 && callout_off == 0) {
1395 callout_cpu_offline(cp);
1396 callout_off = 1;
1397 }
1398
1399 if (error == 0 && cyclic_off == 0) {
1400 if (!cyclic_offline(cp)) {
1401 /*
1402 * We must have bound cyclics...
1403 */
1404 error = EBUSY;
1405 goto out;
1406 }
1407 cyclic_off = 1;
1408 }
1409
1410 /*
1411 * Call mp_cpu_stop() to perform any special operations
1412 * needed for this machine architecture to offline a CPU.
1413 */
1414 if (error == 0)
1415 error = mp_cpu_stop(cp); /* arch-dep hook */
1416
1417 /*
1418 * If that all worked, take the CPU offline and decrement
1419 * ncpus_online.
1420 */
1421 if (error == 0) {
1422 /*
1423 * Put all the cpus into a known safe place.
1424 * No mutexes can be entered while CPUs are paused.
1425 */
1426 pause_cpus(cp, NULL);
1427 /*
1428 * Repeat the operation, if necessary, to make sure that
1429 * all outstanding low-level interrupts run to completion
1430 * before we set the CPU_QUIESCED flag. It's also possible
1431 		 * that a thread has weakbound itself to the cpu despite our raising
1432 * cpu_inmotion above since it may have loaded that
1433 * value before the barrier became visible (this would have
1434 * to be the thread that was on the target cpu at the time
1435 * we raised the barrier).
1436 */
1437 if ((!no_quiesce && cp->cpu_intr_actv != 0) ||
1438 (*bound_func)(cp, 1)) {
1439 start_cpus();
1440 (void) mp_cpu_start(cp);
1441 goto again;
1442 }
1443 ncp = cp->cpu_next_part;
1444 cpu_lpl = cp->cpu_lpl;
1445 ASSERT(cpu_lpl != NULL);
1446
1447 /*
1448 * Remove the CPU from the list of active CPUs.
1449 */
1450 cpu_remove_active(cp);
1451
1452 /*
1453 		 * Walk the active process list and look for threads whose
1454 		 * home lgroup needs to be updated, or whose last-run CPU
1455 		 * is the one being offlined now.
1456 */
1457
1458 ASSERT(curthread->t_cpu != cp);
1459 for (p = practive; p != NULL; p = p->p_next) {
1460
1461 t = p->p_tlist;
1462
1463 if (t == NULL)
1464 continue;
1465
1466 lgrp_diff_lpl = 0;
1467
1468 do {
1469 ASSERT(t->t_lpl != NULL);
1470 /*
1471 * Taking last CPU in lpl offline
1472 * Rehome thread if it is in this lpl
1473 * Otherwise, update the count of how many
1474 * threads are in this CPU's lgroup but have
1475 * a different lpl.
1476 */
1477
1478 if (cpu_lpl->lpl_ncpu == 0) {
1479 if (t->t_lpl == cpu_lpl)
1480 lgrp_move_thread(t,
1481 lgrp_choose(t,
1482 t->t_cpupart), 0);
1483 else if (t->t_lpl->lpl_lgrpid ==
1484 cpu_lpl->lpl_lgrpid)
1485 lgrp_diff_lpl++;
1486 }
1487 ASSERT(t->t_lpl->lpl_ncpu > 0);
1488
1489 /*
1490 * Update CPU last ran on if it was this CPU
1491 */
1492 if (t->t_cpu == cp && t->t_bound_cpu != cp)
1493 t->t_cpu = disp_lowpri_cpu(ncp,
1494 t->t_lpl, t->t_pri, NULL);
1495 ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
1496 t->t_weakbound_cpu == cp);
1497
1498 t = t->t_forw;
1499 } while (t != p->p_tlist);
1500
1501 /*
1502 * Didn't find any threads in the same lgroup as this
1503 * CPU with a different lpl, so remove the lgroup from
1504 * the process lgroup bitmask.
1505 */
1506
1507 if (lgrp_diff_lpl == 0)
1508 klgrpset_del(p->p_lgrpset, cpu_lpl->lpl_lgrpid);
1509 }
1510
1511 /*
1512 * Walk thread list looking for threads that need to be
1513 * rehomed, since there are some threads that are not in
1514 * their process's p_tlist.
1515 */
1516
1517 t = curthread;
1518 do {
1519 ASSERT(t != NULL && t->t_lpl != NULL);
1520
1521 /*
1522 * Rehome threads with same lpl as this CPU when this
1523 * is the last CPU in the lpl.
1524 */
1525
1526 if ((cpu_lpl->lpl_ncpu == 0) && (t->t_lpl == cpu_lpl))
1527 lgrp_move_thread(t,
1528 lgrp_choose(t, t->t_cpupart), 1);
1529
1530 ASSERT(t->t_lpl->lpl_ncpu > 0);
1531
1532 /*
1533 * Update CPU last ran on if it was this CPU
1534 */
1535
1536 if (t->t_cpu == cp && t->t_bound_cpu != cp) {
1537 t->t_cpu = disp_lowpri_cpu(ncp,
1538 t->t_lpl, t->t_pri, NULL);
1539 }
1540 ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
1541 t->t_weakbound_cpu == cp);
1542 t = t->t_next;
1543
1544 } while (t != curthread);
1545 ASSERT((cp->cpu_flags & (CPU_FAULTED | CPU_SPARE)) == 0);
1546 cp->cpu_flags |= CPU_OFFLINE;
1547 disp_cpu_inactive(cp);
1548 if (!no_quiesce)
1549 cp->cpu_flags |= CPU_QUIESCED;
1550 ncpus_online--;
1551 cpu_set_state(cp);
1552 cpu_inmotion = NULL;
1553 start_cpus();
1554 cpu_stats_kstat_destroy(cp);
1555 cpu_delete_intrstat(cp);
1556 lgrp_kstat_destroy(cp);
1557 }
1558
1559 out:
1560 cpu_inmotion = NULL;
1561
1562 /*
1563 * If we failed, re-enable interrupts.
1564 * Do this even if cpu_intr_disable returned an error, because
1565 * it may have partially disabled interrupts.
1566 */
1567 if (error && intr_enable)
1568 cpu_intr_enable(cp);
1569
1570 /*
1571 * If we failed, but managed to offline the cyclic subsystem on this
1572 * CPU, bring it back online.
1573 */
1574 if (error && cyclic_off)
1575 cyclic_online(cp);
1576
1577 /*
1578 * If we failed, but managed to offline callouts on this CPU,
1579 * bring it back online.
1580 */
1581 if (error && callout_off)
1582 callout_cpu_online(cp);
1583
1584 /*
1585 * If we failed, tell the PG subsystem that the CPU is back
1586 */
1587 pg_cpupart_in(cp, pp);
1588
1589 /*
1590 * If we failed, we need to notify everyone that this CPU is back on.
1591 */
1592 if (error != 0) {
1593 CPU_NEW_GENERATION(cp);
1594 cpu_state_change_notify(cp->cpu_id, CPU_ON);
1595 cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON);
1596 }
1597
1598 return (error);
1599 }
1600
1601 /*
1602 * Mark the indicated CPU as faulted, taking it offline.
1603 */
1604 int
1605 cpu_faulted(cpu_t *cp, int flags)
1606 {
1607 int error = 0;
1608
1609 ASSERT(MUTEX_HELD(&cpu_lock));
1610 ASSERT(!cpu_is_poweredoff(cp));
1611
1612 if (cpu_is_offline(cp)) {
1613 cp->cpu_flags &= ~CPU_SPARE;
1614 cp->cpu_flags |= CPU_FAULTED;
1615 mp_cpu_faulted_enter(cp);
1616 cpu_set_state(cp);
1617 return (0);
1618 }
1619
1620 if ((error = cpu_offline(cp, flags)) == 0) {
1621 cp->cpu_flags |= CPU_FAULTED;
1622 mp_cpu_faulted_enter(cp);
1623 cpu_set_state(cp);
1624 }
1625
1626 return (error);
1627 }
1628
1629 /*
1630 * Mark the indicated CPU as a spare, taking it offline.
1631 */
1632 int
1633 cpu_spare(cpu_t *cp, int flags)
1634 {
1635 int error = 0;
1636
1637 ASSERT(MUTEX_HELD(&cpu_lock));
1638 ASSERT(!cpu_is_poweredoff(cp));
1639
1640 if (cpu_is_offline(cp)) {
1641 if (cp->cpu_flags & CPU_FAULTED) {
1642 cp->cpu_flags &= ~CPU_FAULTED;
1643 mp_cpu_faulted_exit(cp);
1644 }
1645 cp->cpu_flags |= CPU_SPARE;
1646 cpu_set_state(cp);
1647 return (0);
1648 }
1649
1650 if ((error = cpu_offline(cp, flags)) == 0) {
1651 cp->cpu_flags |= CPU_SPARE;
1652 cpu_set_state(cp);
1653 }
1654
1655 return (error);
1656 }
1657
1658 /*
1659 * Take the indicated CPU from poweroff to offline.
1660 */
1661 int
1662 cpu_poweron(cpu_t *cp)
1663 {
1664 int error = ENOTSUP;
1665
1666 ASSERT(MUTEX_HELD(&cpu_lock));
1667 ASSERT(cpu_is_poweredoff(cp));
1668
1669 error = mp_cpu_poweron(cp); /* arch-dep hook */
1670 if (error == 0)
1671 cpu_set_state(cp);
1672
1673 return (error);
1674 }
1675
1676 /*
1677 * Take the indicated CPU from any inactive state to powered off.
1678 */
1679 int
1680 cpu_poweroff(cpu_t *cp)
1681 {
1682 int error = ENOTSUP;
1683
1684 ASSERT(MUTEX_HELD(&cpu_lock));
1685 ASSERT(cpu_is_offline(cp));
1686
1687 if (!(cp->cpu_flags & CPU_QUIESCED))
1688 return (EBUSY); /* not completely idle */
1689
1690 error = mp_cpu_poweroff(cp); /* arch-dep hook */
1691 if (error == 0)
1692 cpu_set_state(cp);
1693
1694 return (error);
1695 }
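
/*
 * Illustrative sketch (not compiled) of driving a CPU through the state
 * changes implemented above; every call requires cpu_lock:
 *
 *	mutex_enter(&cpu_lock);
 *	error = cpu_poweron(cp);		(powered-off -> offline)
 *	if (error == 0)
 *		error = cpu_online(cp);		(offline -> online)
 *	(... later ...)
 *	error = cpu_offline(cp, 0);		(online -> offline; pass
 *						 CPU_FORCED to unbind
 *						 hard-bound threads too)
 *	if (error == 0)
 *		error = cpu_poweroff(cp);	(offline -> powered-off)
 *	mutex_exit(&cpu_lock);
 */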
1696
1697 /*
1698 * Initialize the Sequential CPU id lookup table
1699 */
1700 void
1701 cpu_seq_tbl_init()
1702 {
1703 cpu_t **tbl;
1704
1705 tbl = kmem_zalloc(sizeof (struct cpu *) * max_ncpus, KM_SLEEP);
1706 tbl[0] = CPU;
1707
1708 cpu_seq = tbl;
1709 }
1710
1711 /*
1712 * Initialize the CPU lists for the first CPU.
1713 */
1714 void
1715 cpu_list_init(cpu_t *cp)
1716 {
1717 cp->cpu_next = cp;
1718 cp->cpu_prev = cp;
1719 cpu_list = cp;
1720 clock_cpu_list = cp;
1721
1722 cp->cpu_next_onln = cp;
1723 cp->cpu_prev_onln = cp;
1724 cpu_active = cp;
1725
1726 cp->cpu_seqid = 0;
1727 CPUSET_ADD(cpu_seqid_inuse, 0);
1728
1729 /*
1730 * Bootstrap cpu_seq using cpu_list
1731 * The cpu_seq[] table will be dynamically allocated
1732 * when kmem later becomes available (but before going MP)
1733 */
1734 cpu_seq = &cpu_list;
1735
1736 cp->cpu_cache_offset = KMEM_CPU_CACHE_OFFSET(cp->cpu_seqid);
1737 cp_default.cp_cpulist = cp;
1738 cp_default.cp_ncpus = 1;
1739 cp->cpu_next_part = cp;
1740 cp->cpu_prev_part = cp;
1741 cp->cpu_part = &cp_default;
1742
1743 CPUSET_ADD(cpu_available, cp->cpu_id);
1744 CPUSET_ADD(cpu_active_set, cp->cpu_id);
1745 }
1746
1747 /*
1748 * Insert a CPU into the list of available CPUs.
1749 */
1750 void
1751 cpu_add_unit(cpu_t *cp)
1752 {
1753 int seqid;
1754
1755 ASSERT(MUTEX_HELD(&cpu_lock));
1756 ASSERT(cpu_list != NULL); /* list started in cpu_list_init */
1757
1758 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)cp, 0);
1759
1760 /*
1761 	 * Note: most users of the cpu_list will grab the
1762 	 * cpu_lock to ensure that it isn't modified. However,
1763 	 * certain users can't or won't do that. To allow this
1764 	 * we pause the other cpus. Users who walk the list
1765 	 * without cpu_lock must disable kernel preemption
1766 	 * to ensure that the list isn't modified underneath
1767 	 * them. Also, any cached pointers to cpu structures
1768 	 * must be revalidated by checking that the cpu_next
1769 	 * pointer is not NULL. This check must
1770 	 * be done with the cpu_lock held or kernel preemption
1771 	 * disabled. This check relies upon the fact that
1772 	 * old cpu structures are not freed or cleared after
1773 	 * they are removed from the cpu_list.
1774 *
1775 * Note that the clock code walks the cpu list dereferencing
1776 * the cpu_part pointer, so we need to initialize it before
1777 * adding the cpu to the list.
1778 */
1779 cp->cpu_part = &cp_default;
1780 pause_cpus(NULL, NULL);
1781 cp->cpu_next = cpu_list;
1782 cp->cpu_prev = cpu_list->cpu_prev;
1783 cpu_list->cpu_prev->cpu_next = cp;
1784 cpu_list->cpu_prev = cp;
1785 start_cpus();
1786
1787 for (seqid = 0; CPU_IN_SET(cpu_seqid_inuse, seqid); seqid++)
1788 continue;
1789 CPUSET_ADD(cpu_seqid_inuse, seqid);
1790 cp->cpu_seqid = seqid;
1791
1792 if (seqid > max_cpu_seqid_ever)
1793 max_cpu_seqid_ever = seqid;
1794
1795 ASSERT(ncpus < max_ncpus);
1796 ncpus++;
1797 cp->cpu_cache_offset = KMEM_CPU_CACHE_OFFSET(cp->cpu_seqid);
1798 cpu[cp->cpu_id] = cp;
1799 CPUSET_ADD(cpu_available, cp->cpu_id);
1800 cpu_seq[cp->cpu_seqid] = cp;
1801
1802 /*
1803 	 * Allocate a pause thread for this CPU.
1804 */
1805 cpu_pause_alloc(cp);
1806
1807 /*
1808 * So that new CPUs won't have NULL prev_onln and next_onln pointers,
1809 * link them into a list of just that CPU.
1810 * This is so that disp_lowpri_cpu will work for thread_create in
1811 * pause_cpus() when called from the startup thread in a new CPU.
1812 */
1813 cp->cpu_next_onln = cp;
1814 cp->cpu_prev_onln = cp;
1815 cpu_info_kstat_create(cp);
1816 cp->cpu_next_part = cp;
1817 cp->cpu_prev_part = cp;
1818
1819 init_cpu_mstate(cp, CMS_SYSTEM);
1820
1821 pool_pset_mod = gethrtime();
1822 }
1823
1824 /*
1825 * Do the opposite of cpu_add_unit().
1826 */
1827 void
1828 cpu_del_unit(int cpuid)
1829 {
1830 struct cpu *cp, *cpnext;
1831
1832 ASSERT(MUTEX_HELD(&cpu_lock));
1833 cp = cpu[cpuid];
1834 ASSERT(cp != NULL);
1835
1836 ASSERT(cp->cpu_next_onln == cp);
1837 ASSERT(cp->cpu_prev_onln == cp);
1838 ASSERT(cp->cpu_next_part == cp);
1839 ASSERT(cp->cpu_prev_part == cp);
1840
1841 /*
1842 * Tear down the CPU's physical ID cache, and update any
1843 * processor groups
1844 */
1845 pg_cpu_fini(cp, NULL);
1846 pghw_physid_destroy(cp);
1847
1848 /*
1849 * Destroy kstat stuff.
1850 */
1851 cpu_info_kstat_destroy(cp);
1852 term_cpu_mstate(cp);
1853 /*
1854 * Free up pause thread.
1855 */
1856 cpu_pause_free(cp);
1857 CPUSET_DEL(cpu_available, cp->cpu_id);
1858 cpu[cp->cpu_id] = NULL;
1859 cpu_seq[cp->cpu_seqid] = NULL;
1860
/*
 * The clock thread and mutex_vector_enter cannot hold the
 * cpu_lock while traversing the cpu list; therefore we pause
 * all other threads by pausing the other cpus. These, and any
 * other routines holding cpu pointers while possibly sleeping,
 * must call kpreempt_disable before processing the list and
 * must check that the cpu has not been deleted after any sleeps
 * (check cp->cpu_next != NULL). We guarantee to keep the deleted
 * cpu structure around.
 *
 * Note that this MUST be done AFTER cpu_available
 * has been updated so that we don't waste time
 * trying to pause the cpu we're trying to delete.
 */
1875 pause_cpus(NULL, NULL);
1876
1877 cpnext = cp->cpu_next;
1878 cp->cpu_prev->cpu_next = cp->cpu_next;
1879 cp->cpu_next->cpu_prev = cp->cpu_prev;
1880 if (cp == cpu_list)
1881 cpu_list = cpnext;
1882
1883 /*
1884 * Signals that the cpu has been deleted (see above).
1885 */
1886 cp->cpu_next = NULL;
1887 cp->cpu_prev = NULL;
1888
1889 start_cpus();
1890
1891 CPUSET_DEL(cpu_seqid_inuse, cp->cpu_seqid);
1892 ncpus--;
1893 lgrp_config(LGRP_CONFIG_CPU_DEL, (uintptr_t)cp, 0);
1894
1895 pool_pset_mod = gethrtime();
1896 }
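
/*
 * Illustrative sketch (not part of the original source) of the traversal
 * protocol described in cpu_add_unit() and cpu_del_unit(). A walker that
 * cannot take cpu_lock disables kernel preemption, which prevents
 * pause_cpus() from completing and so keeps the list from being relinked
 * underneath it; a cpu pointer cached across a possible sleep is
 * revalidated by checking cpu_next against NULL. do_something() and
 * cached_cp are hypothetical names used only for this sketch.
 *
 *	cpu_t *cp;
 *
 *	kpreempt_disable();
 *	cp = cpu_list;
 *	do {
 *		do_something(cp);
 *	} while ((cp = cp->cpu_next) != cpu_list);
 *	kpreempt_enable();
 *
 *	kpreempt_disable();
 *	if (cached_cp->cpu_next != NULL)	// still on the cpu_list
 *		do_something(cached_cp);
 *	kpreempt_enable();
 */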
1897
1898 /*
1899 * Add a CPU to the list of active CPUs.
1900 * This routine must not get any locks, because other CPUs are paused.
1901 */
1902 static void
1903 cpu_add_active_internal(cpu_t *cp)
1904 {
1905 cpupart_t *pp = cp->cpu_part;
1906
1907 ASSERT(MUTEX_HELD(&cpu_lock));
1908 ASSERT(cpu_list != NULL); /* list started in cpu_list_init */
1909
1910 ncpus_online++;
1911 cpu_set_state(cp);
1912 cp->cpu_next_onln = cpu_active;
1913 cp->cpu_prev_onln = cpu_active->cpu_prev_onln;
1914 cpu_active->cpu_prev_onln->cpu_next_onln = cp;
1915 cpu_active->cpu_prev_onln = cp;
1916 CPUSET_ADD(cpu_active_set, cp->cpu_id);
1917
1918 if (pp->cp_cpulist) {
1919 cp->cpu_next_part = pp->cp_cpulist;
1920 cp->cpu_prev_part = pp->cp_cpulist->cpu_prev_part;
1921 pp->cp_cpulist->cpu_prev_part->cpu_next_part = cp;
1922 pp->cp_cpulist->cpu_prev_part = cp;
1923 } else {
1924 ASSERT(pp->cp_ncpus == 0);
1925 pp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
1926 }
1927 pp->cp_ncpus++;
1928 if (pp->cp_ncpus == 1) {
1929 cp_numparts_nonempty++;
1930 ASSERT(cp_numparts_nonempty != 0);
1931 }
1932
1933 pg_cpu_active(cp);
1934 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)cp, 0);
1935
1936 bzero(&cp->cpu_loadavg, sizeof (cp->cpu_loadavg));
1937 }
1938
1939 /*
1940 * Add a CPU to the list of active CPUs.
1941 * This is called from machine-dependent layers when a new CPU is started.
1942 */
1943 void
1944 cpu_add_active(cpu_t *cp)
1945 {
1946 pg_cpupart_in(cp, cp->cpu_part);
1947
1948 pause_cpus(NULL, NULL);
1949 cpu_add_active_internal(cp);
1950 start_cpus();
1951
1952 cpu_stats_kstat_create(cp);
1953 cpu_create_intrstat(cp);
1954 lgrp_kstat_create(cp);
1955 cpu_state_change_notify(cp->cpu_id, CPU_INIT);
1956 }
1957
1958
1959 /*
1960 * Remove a CPU from the list of active CPUs.
1961 * This routine must not get any locks, because other CPUs are paused.
1962 */
1963 /* ARGSUSED */
1964 static void
1965 cpu_remove_active(cpu_t *cp)
1966 {
1967 cpupart_t *pp = cp->cpu_part;
1968
1969 ASSERT(MUTEX_HELD(&cpu_lock));
1970 ASSERT(cp->cpu_next_onln != cp); /* not the last one */
1971 ASSERT(cp->cpu_prev_onln != cp); /* not the last one */
1972
1973 pg_cpu_inactive(cp);
1974
1975 lgrp_config(LGRP_CONFIG_CPU_OFFLINE, (uintptr_t)cp, 0);
1976
1977 if (cp == clock_cpu_list)
1978 clock_cpu_list = cp->cpu_next_onln;
1979
1980 cp->cpu_prev_onln->cpu_next_onln = cp->cpu_next_onln;
1981 cp->cpu_next_onln->cpu_prev_onln = cp->cpu_prev_onln;
1982 if (cpu_active == cp) {
1983 cpu_active = cp->cpu_next_onln;
1984 }
1985 cp->cpu_next_onln = cp;
1986 cp->cpu_prev_onln = cp;
1987 CPUSET_DEL(cpu_active_set, cp->cpu_id);
1988
1989 cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
1990 cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
1991 if (pp->cp_cpulist == cp) {
1992 pp->cp_cpulist = cp->cpu_next_part;
1993 ASSERT(pp->cp_cpulist != cp);
1994 }
1995 cp->cpu_next_part = cp;
1996 cp->cpu_prev_part = cp;
1997 pp->cp_ncpus--;
1998 if (pp->cp_ncpus == 0) {
1999 cp_numparts_nonempty--;
2000 ASSERT(cp_numparts_nonempty != 0);
2001 }
2002 }
2003
/*
 * Routine used to set up a newly inserted CPU in preparation for starting
 * it running code.
 */
2008 int
2009 cpu_configure(int cpuid)
2010 {
2011 int retval = 0;
2012
2013 ASSERT(MUTEX_HELD(&cpu_lock));
2014
2015 /*
2016 * Some structures are statically allocated based upon
2017 * the maximum number of cpus the system supports. Do not
2018 * try to add anything beyond this limit.
2019 */
2020 if (cpuid < 0 || cpuid >= NCPU) {
2021 return (EINVAL);
2022 }
2023
2024 if ((cpu[cpuid] != NULL) && (cpu[cpuid]->cpu_flags != 0)) {
2025 return (EALREADY);
2026 }
2027
2028 if ((retval = mp_cpu_configure(cpuid)) != 0) {
2029 return (retval);
2030 }
2031
2032 cpu[cpuid]->cpu_flags = CPU_QUIESCED | CPU_OFFLINE | CPU_POWEROFF;
2033 cpu_set_state(cpu[cpuid]);
2034 retval = cpu_state_change_hooks(cpuid, CPU_CONFIG, CPU_UNCONFIG);
2035 if (retval != 0)
2036 (void) mp_cpu_unconfigure(cpuid);
2037
2038 return (retval);
2039 }
2040
/*
 * Routine used to clean up a CPU that has been powered off. This will
 * destroy all per-cpu information related to this cpu.
 */
2045 int
2046 cpu_unconfigure(int cpuid)
2047 {
2048 int error;
2049
2050 ASSERT(MUTEX_HELD(&cpu_lock));
2051
2052 if (cpu[cpuid] == NULL) {
2053 return (ENODEV);
2054 }
2055
2056 if (cpu[cpuid]->cpu_flags == 0) {
2057 return (EALREADY);
2058 }
2059
2060 if ((cpu[cpuid]->cpu_flags & CPU_POWEROFF) == 0) {
2061 return (EBUSY);
2062 }
2063
2064 if (cpu[cpuid]->cpu_props != NULL) {
2065 (void) nvlist_free(cpu[cpuid]->cpu_props);
2066 cpu[cpuid]->cpu_props = NULL;
2067 }
2068
2069 error = cpu_state_change_hooks(cpuid, CPU_UNCONFIG, CPU_CONFIG);
2070
2071 if (error != 0)
2072 return (error);
2073
2074 return (mp_cpu_unconfigure(cpuid));
2075 }
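
/*
 * Illustrative sketch (not part of the original source): a dynamic
 * reconfiguration path bringing a new CPU into the system would, roughly,
 * call these routines under cpu_lock. err is a hypothetical local.
 *
 *	int err;
 *
 *	mutex_enter(&cpu_lock);
 *	if ((err = cpu_configure(cpuid)) == 0) {
 *		// the cpu is now known to the system, still powered off
 *	}
 *	mutex_exit(&cpu_lock);
 *
 * cpu_unconfigure() reverses this once the cpu has been powered off again.
 */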
2076
2077 /*
2078 * Routines for registering and de-registering cpu_setup callback functions.
2079 *
2080 * Caller's context
2081 * These routines must not be called from a driver's attach(9E) or
2082 * detach(9E) entry point.
2083 *
2084 * NOTE: CPU callbacks should not block. They are called with cpu_lock held.
2085 */
2086
/*
 * Ideally, these would be dynamically allocated and put into a linked
 * list; however, that is not feasible because the registration routine
 * has to be available before the kmem allocator is working (in fact,
 * it is called by the kmem allocator init code). In any case, there
 * are quite a few extra entries for future users.
 */
2094 #define NCPU_SETUPS 20
2095
2096 struct cpu_setup {
2097 cpu_setup_func_t *func;
2098 void *arg;
2099 } cpu_setups[NCPU_SETUPS];
2100
2101 void
2102 register_cpu_setup_func(cpu_setup_func_t *func, void *arg)
2103 {
2104 int i;
2105
2106 ASSERT(MUTEX_HELD(&cpu_lock));
2107
2108 for (i = 0; i < NCPU_SETUPS; i++)
2109 if (cpu_setups[i].func == NULL)
2110 break;
2111 if (i >= NCPU_SETUPS)
2112 cmn_err(CE_PANIC, "Ran out of cpu_setup callback entries");
2113
2114 cpu_setups[i].func = func;
2115 cpu_setups[i].arg = arg;
2116 }
2117
2118 void
2119 unregister_cpu_setup_func(cpu_setup_func_t *func, void *arg)
2120 {
2121 int i;
2122
2123 ASSERT(MUTEX_HELD(&cpu_lock));
2124
2125 for (i = 0; i < NCPU_SETUPS; i++)
2126 if ((cpu_setups[i].func == func) &&
2127 (cpu_setups[i].arg == arg))
2128 break;
2129 if (i >= NCPU_SETUPS)
2130 cmn_err(CE_PANIC, "Could not find cpu_setup callback to "
2131 "deregister");
2132
2133 cpu_setups[i].func = NULL;
2134 cpu_setups[i].arg = 0;
2135 }
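
/*
 * Illustrative sketch (not part of the original source): a subsystem that
 * needs to track CPU state changes registers a callback while holding
 * cpu_lock. The callback must not block, since it is invoked with cpu_lock
 * held. my_cpu_setup and my_arg are hypothetical names.
 *
 *	static int
 *	my_cpu_setup(cpu_setup_t what, int id, void *arg)
 *	{
 *		switch (what) {
 *		case CPU_CONFIG:
 *			// prepare per-cpu state for cpu "id"; must not block
 *			break;
 *		default:
 *			break;
 *		}
 *		return (0);	// non-zero causes the change to be undone
 *	}
 *
 *	mutex_enter(&cpu_lock);
 *	register_cpu_setup_func(my_cpu_setup, my_arg);
 *	mutex_exit(&cpu_lock);
 */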
2136
/*
 * Call any state change hooks for this CPU, ignoring any errors.
 */
2140 void
2141 cpu_state_change_notify(int id, cpu_setup_t what)
2142 {
2143 int i;
2144
2145 ASSERT(MUTEX_HELD(&cpu_lock));
2146
2147 for (i = 0; i < NCPU_SETUPS; i++) {
2148 if (cpu_setups[i].func != NULL) {
2149 cpu_setups[i].func(what, id, cpu_setups[i].arg);
2150 }
2151 }
2152 }
2153
/*
 * Call any state change hooks for this CPU, undoing them if an error is
 * found.
 */
2157 static int
2158 cpu_state_change_hooks(int id, cpu_setup_t what, cpu_setup_t undo)
2159 {
2160 int i;
2161 int retval = 0;
2162
2163 ASSERT(MUTEX_HELD(&cpu_lock));
2164
2165 for (i = 0; i < NCPU_SETUPS; i++) {
2166 if (cpu_setups[i].func != NULL) {
2167 retval = cpu_setups[i].func(what, id,
2168 cpu_setups[i].arg);
2169 if (retval) {
2170 for (i--; i >= 0; i--) {
2171 if (cpu_setups[i].func != NULL)
2172 cpu_setups[i].func(undo,
2173 id, cpu_setups[i].arg);
2174 }
2175 break;
2176 }
2177 }
2178 }
2179 return (retval);
2180 }
2181
2182 /*
2183 * Export information about this CPU via the kstat mechanism.
2184 */
2185 static struct {
2186 kstat_named_t ci_state;
2187 kstat_named_t ci_state_begin;
2188 kstat_named_t ci_cpu_type;
2189 kstat_named_t ci_fpu_type;
2190 kstat_named_t ci_clock_MHz;
2191 kstat_named_t ci_chip_id;
2192 kstat_named_t ci_implementation;
2193 kstat_named_t ci_brandstr;
2194 kstat_named_t ci_core_id;
2195 kstat_named_t ci_curr_clock_Hz;
2196 kstat_named_t ci_supp_freq_Hz;
2197 kstat_named_t ci_pg_id;
2198 #if defined(__sparcv9)
2199 kstat_named_t ci_device_ID;
2200 kstat_named_t ci_cpu_fru;
2201 #endif
2202 #if defined(__x86)
2203 kstat_named_t ci_vendorstr;
2204 kstat_named_t ci_family;
2205 kstat_named_t ci_model;
2206 kstat_named_t ci_step;
2207 kstat_named_t ci_clogid;
2208 kstat_named_t ci_pkg_core_id;
2209 kstat_named_t ci_ncpuperchip;
2210 kstat_named_t ci_ncoreperchip;
2211 kstat_named_t ci_max_cstates;
2212 kstat_named_t ci_curr_cstate;
2213 kstat_named_t ci_cacheid;
2214 kstat_named_t ci_sktstr;
2215 #endif
2216 } cpu_info_template = {
2217 { "state", KSTAT_DATA_CHAR },
2218 { "state_begin", KSTAT_DATA_LONG },
2219 { "cpu_type", KSTAT_DATA_CHAR },
2220 { "fpu_type", KSTAT_DATA_CHAR },
2221 { "clock_MHz", KSTAT_DATA_LONG },
2222 { "chip_id", KSTAT_DATA_LONG },
2223 { "implementation", KSTAT_DATA_STRING },
2224 { "brand", KSTAT_DATA_STRING },
2225 { "core_id", KSTAT_DATA_LONG },
2226 { "current_clock_Hz", KSTAT_DATA_UINT64 },
2227 { "supported_frequencies_Hz", KSTAT_DATA_STRING },
2228 { "pg_id", KSTAT_DATA_LONG },
2229 #if defined(__sparcv9)
2230 { "device_ID", KSTAT_DATA_UINT64 },
2231 { "cpu_fru", KSTAT_DATA_STRING },
2232 #endif
2233 #if defined(__x86)
2234 { "vendor_id", KSTAT_DATA_STRING },
2235 { "family", KSTAT_DATA_INT32 },
2236 { "model", KSTAT_DATA_INT32 },
2237 { "stepping", KSTAT_DATA_INT32 },
2238 { "clog_id", KSTAT_DATA_INT32 },
2239 { "pkg_core_id", KSTAT_DATA_LONG },
2240 { "ncpu_per_chip", KSTAT_DATA_INT32 },
2241 { "ncore_per_chip", KSTAT_DATA_INT32 },
2242 { "supported_max_cstates", KSTAT_DATA_INT32 },
2243 { "current_cstate", KSTAT_DATA_INT32 },
2244 { "cache_id", KSTAT_DATA_INT32 },
2245 { "socket_type", KSTAT_DATA_STRING },
2246 #endif
2247 };
2248
2249 static kmutex_t cpu_info_template_lock;
2250
2251 static int
2252 cpu_info_kstat_update(kstat_t *ksp, int rw)
2253 {
2254 cpu_t *cp = ksp->ks_private;
2255 const char *pi_state;
2256
2257 if (rw == KSTAT_WRITE)
2258 return (EACCES);
2259
2260 #if defined(__x86)
2261 /* Is the cpu still initialising itself? */
2262 if (cpuid_checkpass(cp, 1) == 0)
2263 return (ENXIO);
2264 #endif
2265 switch (cp->cpu_type_info.pi_state) {
2266 case P_ONLINE:
2267 pi_state = PS_ONLINE;
2268 break;
2269 case P_POWEROFF:
2270 pi_state = PS_POWEROFF;
2271 break;
2272 case P_NOINTR:
2273 pi_state = PS_NOINTR;
2274 break;
2275 case P_FAULTED:
2276 pi_state = PS_FAULTED;
2277 break;
2278 case P_SPARE:
2279 pi_state = PS_SPARE;
2280 break;
2281 case P_OFFLINE:
2282 pi_state = PS_OFFLINE;
2283 break;
2284 default:
2285 pi_state = "unknown";
2286 }
2287 (void) strcpy(cpu_info_template.ci_state.value.c, pi_state);
2288 cpu_info_template.ci_state_begin.value.l = cp->cpu_state_begin;
2289 (void) strncpy(cpu_info_template.ci_cpu_type.value.c,
2290 cp->cpu_type_info.pi_processor_type, 15);
2291 (void) strncpy(cpu_info_template.ci_fpu_type.value.c,
2292 cp->cpu_type_info.pi_fputypes, 15);
2293 cpu_info_template.ci_clock_MHz.value.l = cp->cpu_type_info.pi_clock;
2294 cpu_info_template.ci_chip_id.value.l =
2295 pg_plat_hw_instance_id(cp, PGHW_CHIP);
2296 kstat_named_setstr(&cpu_info_template.ci_implementation,
2297 cp->cpu_idstr);
2298 kstat_named_setstr(&cpu_info_template.ci_brandstr, cp->cpu_brandstr);
2299 cpu_info_template.ci_core_id.value.l = pg_plat_get_core_id(cp);
2300 cpu_info_template.ci_curr_clock_Hz.value.ui64 =
2301 cp->cpu_curr_clock;
2302 cpu_info_template.ci_pg_id.value.l =
2303 cp->cpu_pg && cp->cpu_pg->cmt_lineage ?
2304 cp->cpu_pg->cmt_lineage->pg_id : -1;
2305 kstat_named_setstr(&cpu_info_template.ci_supp_freq_Hz,
2306 cp->cpu_supp_freqs);
2307 #if defined(__sparcv9)
2308 cpu_info_template.ci_device_ID.value.ui64 =
2309 cpunodes[cp->cpu_id].device_id;
2310 kstat_named_setstr(&cpu_info_template.ci_cpu_fru, cpu_fru_fmri(cp));
2311 #endif
2312 #if defined(__x86)
2313 kstat_named_setstr(&cpu_info_template.ci_vendorstr,
2314 cpuid_getvendorstr(cp));
2315 cpu_info_template.ci_family.value.l = cpuid_getfamily(cp);
2316 cpu_info_template.ci_model.value.l = cpuid_getmodel(cp);
2317 cpu_info_template.ci_step.value.l = cpuid_getstep(cp);
2318 cpu_info_template.ci_clogid.value.l = cpuid_get_clogid(cp);
2319 cpu_info_template.ci_ncpuperchip.value.l = cpuid_get_ncpu_per_chip(cp);
2320 cpu_info_template.ci_ncoreperchip.value.l =
2321 cpuid_get_ncore_per_chip(cp);
2322 cpu_info_template.ci_pkg_core_id.value.l = cpuid_get_pkgcoreid(cp);
2323 cpu_info_template.ci_max_cstates.value.l = cp->cpu_m.max_cstates;
2324 cpu_info_template.ci_curr_cstate.value.l = cpu_idle_get_cpu_state(cp);
2325 cpu_info_template.ci_cacheid.value.i32 = cpuid_get_cacheid(cp);
2326 kstat_named_setstr(&cpu_info_template.ci_sktstr,
2327 cpuid_getsocketstr(cp));
2328 #endif
2329
2330 return (0);
2331 }
2332
2333 static void
2334 cpu_info_kstat_create(cpu_t *cp)
2335 {
2336 zoneid_t zoneid;
2337
2338 ASSERT(MUTEX_HELD(&cpu_lock));
2339
2340 if (pool_pset_enabled())
2341 zoneid = GLOBAL_ZONEID;
2342 else
2343 zoneid = ALL_ZONES;
2344 if ((cp->cpu_info_kstat = kstat_create_zone("cpu_info", cp->cpu_id,
2345 NULL, "misc", KSTAT_TYPE_NAMED,
2346 sizeof (cpu_info_template) / sizeof (kstat_named_t),
2347 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE, zoneid)) != NULL) {
2348 cp->cpu_info_kstat->ks_data_size += 2 * CPU_IDSTRLEN;
2349 #if defined(__sparcv9)
2350 cp->cpu_info_kstat->ks_data_size +=
2351 strlen(cpu_fru_fmri(cp)) + 1;
2352 #endif
2353 #if defined(__x86)
2354 cp->cpu_info_kstat->ks_data_size += X86_VENDOR_STRLEN;
2355 #endif
2356 if (cp->cpu_supp_freqs != NULL)
2357 cp->cpu_info_kstat->ks_data_size +=
2358 strlen(cp->cpu_supp_freqs) + 1;
2359 cp->cpu_info_kstat->ks_lock = &cpu_info_template_lock;
2360 cp->cpu_info_kstat->ks_data = &cpu_info_template;
2361 cp->cpu_info_kstat->ks_private = cp;
2362 cp->cpu_info_kstat->ks_update = cpu_info_kstat_update;
2363 kstat_install(cp->cpu_info_kstat);
2364 }
2365 }
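
/*
 * For reference (not part of the original source): once installed, the data
 * exported above is visible from userland via kstat(1M), e.g.
 * "kstat -m cpu_info -i 0" for the boot CPU, or programmatically through
 * kstat_lookup(3KSTAT) on module "cpu_info".
 */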
2366
2367 static void
2368 cpu_info_kstat_destroy(cpu_t *cp)
2369 {
2370 ASSERT(MUTEX_HELD(&cpu_lock));
2371
2372 kstat_delete(cp->cpu_info_kstat);
2373 cp->cpu_info_kstat = NULL;
2374 }
2375
2376 /*
2377 * Create and install kstats for the boot CPU.
2378 */
2379 void
2380 cpu_kstat_init(cpu_t *cp)
2381 {
2382 mutex_enter(&cpu_lock);
2383 cpu_info_kstat_create(cp);
2384 cpu_stats_kstat_create(cp);
2385 cpu_create_intrstat(cp);
2386 cpu_set_state(cp);
2387 mutex_exit(&cpu_lock);
2388 }
2389
2390 /*
2391 * Make visible to the zone that subset of the cpu information that would be
2392 * initialized when a cpu is configured (but still offline).
2393 */
2394 void
2395 cpu_visibility_configure(cpu_t *cp, zone_t *zone)
2396 {
2397 zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
2398
2399 ASSERT(MUTEX_HELD(&cpu_lock));
2400 ASSERT(pool_pset_enabled());
2401 ASSERT(cp != NULL);
2402
2403 if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) {
2404 zone->zone_ncpus++;
2405 ASSERT(zone->zone_ncpus <= ncpus);
2406 }
2407 if (cp->cpu_info_kstat != NULL)
2408 kstat_zone_add(cp->cpu_info_kstat, zoneid);
2409 }
2410
2411 /*
2412 * Make visible to the zone that subset of the cpu information that would be
2413 * initialized when a previously configured cpu is onlined.
2414 */
2415 void
2416 cpu_visibility_online(cpu_t *cp, zone_t *zone)
2417 {
2418 kstat_t *ksp;
2419 char name[sizeof ("cpu_stat") + 10]; /* enough for 32-bit cpuids */
2420 zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
2421 processorid_t cpun;
2422
2423 ASSERT(MUTEX_HELD(&cpu_lock));
2424 ASSERT(pool_pset_enabled());
2425 ASSERT(cp != NULL);
2426 ASSERT(cpu_is_active(cp));
2427
2428 cpun = cp->cpu_id;
2429 if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) {
2430 zone->zone_ncpus_online++;
2431 ASSERT(zone->zone_ncpus_online <= ncpus_online);
2432 }
2433 (void) snprintf(name, sizeof (name), "cpu_stat%d", cpun);
2434 if ((ksp = kstat_hold_byname("cpu_stat", cpun, name, ALL_ZONES))
2435 != NULL) {
2436 kstat_zone_add(ksp, zoneid);
2437 kstat_rele(ksp);
2438 }
2439 if ((ksp = kstat_hold_byname("cpu", cpun, "sys", ALL_ZONES)) != NULL) {
2440 kstat_zone_add(ksp, zoneid);
2441 kstat_rele(ksp);
2442 }
2443 if ((ksp = kstat_hold_byname("cpu", cpun, "vm", ALL_ZONES)) != NULL) {
2444 kstat_zone_add(ksp, zoneid);
2445 kstat_rele(ksp);
2446 }
2447 if ((ksp = kstat_hold_byname("cpu", cpun, "intrstat", ALL_ZONES)) !=
2448 NULL) {
2449 kstat_zone_add(ksp, zoneid);
2450 kstat_rele(ksp);
2451 }
2452 }
2453
/*
 * Update relevant kstats such that the cpu is now visible to processes
 * executing in the specified zone.
 */
2458 void
2459 cpu_visibility_add(cpu_t *cp, zone_t *zone)
2460 {
2461 cpu_visibility_configure(cp, zone);
2462 if (cpu_is_active(cp))
2463 cpu_visibility_online(cp, zone);
2464 }
2465
2466 /*
2467 * Make invisible to the zone that subset of the cpu information that would be
2468 * torn down when a previously offlined cpu is unconfigured.
2469 */
2470 void
2471 cpu_visibility_unconfigure(cpu_t *cp, zone_t *zone)
2472 {
2473 zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
2474
2475 ASSERT(MUTEX_HELD(&cpu_lock));
2476 ASSERT(pool_pset_enabled());
2477 ASSERT(cp != NULL);
2478
2479 if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) {
2480 ASSERT(zone->zone_ncpus != 0);
2481 zone->zone_ncpus--;
2482 }
2483 if (cp->cpu_info_kstat)
2484 kstat_zone_remove(cp->cpu_info_kstat, zoneid);
2485 }
2486
2487 /*
2488 * Make invisible to the zone that subset of the cpu information that would be
2489 * torn down when a cpu is offlined (but still configured).
2490 */
2491 void
2492 cpu_visibility_offline(cpu_t *cp, zone_t *zone)
2493 {
2494 kstat_t *ksp;
2495 char name[sizeof ("cpu_stat") + 10]; /* enough for 32-bit cpuids */
2496 zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
2497 processorid_t cpun;
2498
2499 ASSERT(MUTEX_HELD(&cpu_lock));
2500 ASSERT(pool_pset_enabled());
2501 ASSERT(cp != NULL);
2502 ASSERT(cpu_is_active(cp));
2503
2504 cpun = cp->cpu_id;
2505 if (zoneid != ALL_ZONES && zoneid != GLOBAL_ZONEID) {
2506 ASSERT(zone->zone_ncpus_online != 0);
2507 zone->zone_ncpus_online--;
2508 }
2509
2510 if ((ksp = kstat_hold_byname("cpu", cpun, "intrstat", ALL_ZONES)) !=
2511 NULL) {
2512 kstat_zone_remove(ksp, zoneid);
2513 kstat_rele(ksp);
2514 }
2515 if ((ksp = kstat_hold_byname("cpu", cpun, "vm", ALL_ZONES)) != NULL) {
2516 kstat_zone_remove(ksp, zoneid);
2517 kstat_rele(ksp);
2518 }
2519 if ((ksp = kstat_hold_byname("cpu", cpun, "sys", ALL_ZONES)) != NULL) {
2520 kstat_zone_remove(ksp, zoneid);
2521 kstat_rele(ksp);
2522 }
2523 (void) snprintf(name, sizeof (name), "cpu_stat%d", cpun);
2524 if ((ksp = kstat_hold_byname("cpu_stat", cpun, name, ALL_ZONES))
2525 != NULL) {
2526 kstat_zone_remove(ksp, zoneid);
2527 kstat_rele(ksp);
2528 }
2529 }
2530
/*
 * Update relevant kstats such that the cpu is no longer visible to processes
 * executing in the specified zone.
 */
2535 void
2536 cpu_visibility_remove(cpu_t *cp, zone_t *zone)
2537 {
2538 if (cpu_is_active(cp))
2539 cpu_visibility_offline(cp, zone);
2540 cpu_visibility_unconfigure(cp, zone);
2541 }
2542
2543 /*
2544 * Bind a thread to a CPU as requested.
2545 */
2546 int
2547 cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind,
2548 int *error)
2549 {
2550 processorid_t binding;
2551 cpu_t *cp = NULL;
2552
2553 ASSERT(MUTEX_HELD(&cpu_lock));
2554 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
2555
2556 thread_lock(tp);
2557
2558 /*
2559 * Record old binding, but change the obind, which was initialized
2560 * to PBIND_NONE, only if this thread has a binding. This avoids
2561 * reporting PBIND_NONE for a process when some LWPs are bound.
2562 */
2563 binding = tp->t_bind_cpu;
2564 if (binding != PBIND_NONE)
2565 *obind = binding; /* record old binding */
2566
2567 switch (bind) {
2568 case PBIND_QUERY:
2569 /* Just return the old binding */
2570 thread_unlock(tp);
2571 return (0);
2572
2573 case PBIND_QUERY_TYPE:
2574 /* Return the binding type */
2575 *obind = TB_CPU_IS_SOFT(tp) ? PBIND_SOFT : PBIND_HARD;
2576 thread_unlock(tp);
2577 return (0);
2578
2579 case PBIND_SOFT:
2580 /*
2581 * Set soft binding for this thread and return the actual
2582 * binding
2583 */
2584 TB_CPU_SOFT_SET(tp);
2585 thread_unlock(tp);
2586 return (0);
2587
2588 case PBIND_HARD:
2589 /*
2590 * Set hard binding for this thread and return the actual
2591 * binding
2592 */
2593 TB_CPU_HARD_SET(tp);
2594 thread_unlock(tp);
2595 return (0);
2596
2597 default:
2598 break;
2599 }
2600
2601 /*
2602 * If this thread/LWP cannot be bound because of permission
2603 * problems, just note that and return success so that the
2604 * other threads/LWPs will be bound. This is the way
2605 * processor_bind() is defined to work.
2606 *
2607 * Binding will get EPERM if the thread is of system class
2608 * or hasprocperm() fails.
2609 */
2610 if (tp->t_cid == 0 || !hasprocperm(tp->t_cred, CRED())) {
2611 *error = EPERM;
2612 thread_unlock(tp);
2613 return (0);
2614 }
2615
2616 binding = bind;
2617 if (binding != PBIND_NONE) {
2618 cp = cpu_get((processorid_t)binding);
/*
 * Make sure the binding is valid and is in the right partition.
 */
2622 if (cp == NULL || tp->t_cpupart != cp->cpu_part) {
2623 *error = EINVAL;
2624 thread_unlock(tp);
2625 return (0);
2626 }
2627 }
2628 tp->t_bind_cpu = binding; /* set new binding */
2629
2630 /*
2631 * If there is no system-set reason for affinity, set
2632 * the t_bound_cpu field to reflect the binding.
2633 */
2634 if (tp->t_affinitycnt == 0) {
2635 if (binding == PBIND_NONE) {
2636 /*
2637 * We may need to adjust disp_max_unbound_pri
2638 * since we're becoming unbound.
2639 */
2640 disp_adjust_unbound_pri(tp);
2641
2642 tp->t_bound_cpu = NULL; /* set new binding */
2643
2644 /*
2645 * Move thread to lgroup with strongest affinity
2646 * after unbinding
2647 */
2648 if (tp->t_lgrp_affinity)
2649 lgrp_move_thread(tp,
2650 lgrp_choose(tp, tp->t_cpupart), 1);
2651
2652 if (tp->t_state == TS_ONPROC &&
2653 tp->t_cpu->cpu_part != tp->t_cpupart)
2654 cpu_surrender(tp);
2655 } else {
2656 lpl_t *lpl;
2657
2658 tp->t_bound_cpu = cp;
2659 ASSERT(cp->cpu_lpl != NULL);
2660
/*
 * Set the thread's home to the lgroup with the most affinity
 * that contains the CPU to which the thread is being bound, or
 * to the CPU's minimum bounding lgroup if no affinities are set.
 */
2666 if (tp->t_lgrp_affinity)
2667 lpl = lgrp_affinity_best(tp, tp->t_cpupart,
2668 LGRP_NONE, B_FALSE);
2669 else
2670 lpl = cp->cpu_lpl;
2671
2672 if (tp->t_lpl != lpl) {
2673 /* can't grab cpu_lock */
2674 lgrp_move_thread(tp, lpl, 1);
2675 }
2676
2677 /*
2678 * Make the thread switch to the bound CPU.
2679 * If the thread is runnable, we need to
2680 * requeue it even if t_cpu is already set
2681 * to the right CPU, since it may be on a
2682 * kpreempt queue and need to move to a local
2683 * queue. We could check t_disp_queue to
2684 * avoid unnecessary overhead if it's already
2685 * on the right queue, but since this isn't
2686 * a performance-critical operation it doesn't
2687 * seem worth the extra code and complexity.
2688 *
2689 * If the thread is weakbound to the cpu then it will
2690 * resist the new binding request until the weak
2691 * binding drops. The cpu_surrender or requeueing
2692 * below could be skipped in such cases (since it
2693 * will have no effect), but that would require
2694 * thread_allowmigrate to acquire thread_lock so
2695 * we'll take the very occasional hit here instead.
2696 */
2697 if (tp->t_state == TS_ONPROC) {
2698 cpu_surrender(tp);
2699 } else if (tp->t_state == TS_RUN) {
2700 cpu_t *ocp = tp->t_cpu;
2701
2702 (void) dispdeq(tp);
2703 setbackdq(tp);
2704 /*
2705 * Either on the bound CPU's disp queue now,
2706 * or swapped out or on the swap queue.
2707 */
2708 ASSERT(tp->t_disp_queue == cp->cpu_disp ||
2709 tp->t_weakbound_cpu == ocp ||
2710 (tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ))
2711 != TS_LOAD);
2712 }
2713 }
2714 }
2715
2716 /*
2717 * Our binding has changed; set TP_CHANGEBIND.
2718 */
2719 tp->t_proc_flag |= TP_CHANGEBIND;
2720 aston(tp);
2721
2722 thread_unlock(tp);
2723
2724 return (0);
2725 }
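
/*
 * Illustrative sketch (not part of the original source): from userland this
 * path is normally reached through processor_bind(2), e.g. to hard-bind the
 * calling LWP to processor 1 while retrieving the previous binding:
 *
 *	processorid_t obind;
 *
 *	if (processor_bind(P_LWPID, P_MYID, 1, &obind) != 0)
 *		perror("processor_bind");
 */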
2726
2727
2728 cpuset_t *
2729 cpuset_alloc(int kmflags)
2730 {
2731 return (kmem_alloc(sizeof (cpuset_t), kmflags));
2732 }
2733
2734 void
2735 cpuset_free(cpuset_t *s)
2736 {
2737 kmem_free(s, sizeof (cpuset_t));
2738 }
2739
2740 void
2741 cpuset_all(cpuset_t *s)
2742 {
2743 int i;
2744
2745 for (i = 0; i < CPUSET_WORDS; i++)
2746 s->cpub[i] = ~0UL;
2747 }
2748
2749 void
2750 cpuset_all_but(cpuset_t *s, const uint_t cpu)
2751 {
2752 cpuset_all(s);
2753 CPUSET_DEL(*s, cpu);
2754 }
2755
2756 void
2757 cpuset_only(cpuset_t *s, const uint_t cpu)
2758 {
2759 CPUSET_ZERO(*s);
2760 CPUSET_ADD(*s, cpu);
2761 }
2762
2763 long
2764 cpu_in_set(const cpuset_t *s, const uint_t cpu)
2765 {
2766 VERIFY(cpu < NCPU);
2767 return (BT_TEST(s->cpub, cpu));
2768 }
2769
2770 void
2771 cpuset_add(cpuset_t *s, const uint_t cpu)
2772 {
2773 VERIFY(cpu < NCPU);
2774 BT_SET(s->cpub, cpu);
2775 }
2776
2777 void
2778 cpuset_del(cpuset_t *s, const uint_t cpu)
2779 {
2780 VERIFY(cpu < NCPU);
2781 BT_CLEAR(s->cpub, cpu);
2782 }
2783
2784 int
2785 cpuset_isnull(const cpuset_t *s)
2786 {
2787 int i;
2788
2789 for (i = 0; i < CPUSET_WORDS; i++) {
2790 if (s->cpub[i] != 0)
2791 return (0);
2792 }
2793 return (1);
2794 }
2795
2796 int
2797 cpuset_isequal(const cpuset_t *s1, const cpuset_t *s2)
2798 {
2799 int i;
2800
2801 for (i = 0; i < CPUSET_WORDS; i++) {
2802 if (s1->cpub[i] != s2->cpub[i])
2803 return (0);
2804 }
2805 return (1);
2806 }
2807
2808 uint_t
2809 cpuset_find(const cpuset_t *s)
2810 {
2811
2812 uint_t i;
2813 uint_t cpu = (uint_t)-1;
2814
2815 /*
2816 * Find a cpu in the cpuset
2817 */
2818 for (i = 0; i < CPUSET_WORDS; i++) {
2819 cpu = (uint_t)(lowbit(s->cpub[i]) - 1);
2820 if (cpu != (uint_t)-1) {
2821 cpu += i * BT_NBIPUL;
2822 break;
2823 }
2824 }
2825 return (cpu);
2826 }
2827
2828 void
2829 cpuset_bounds(const cpuset_t *s, uint_t *smallestid, uint_t *largestid)
2830 {
2831 int i, j;
2832 uint_t bit;
2833
2834 /*
2835 * First, find the smallest cpu id in the set.
2836 */
2837 for (i = 0; i < CPUSET_WORDS; i++) {
2838 if (s->cpub[i] != 0) {
2839 bit = (uint_t)(lowbit(s->cpub[i]) - 1);
2840 ASSERT(bit != (uint_t)-1);
2841 *smallestid = bit + (i * BT_NBIPUL);
2842
2843 /*
2844 * Now find the largest cpu id in
2845 * the set and return immediately.
2846 * Done in an inner loop to avoid
2847 * having to break out of the first
2848 * loop.
2849 */
2850 for (j = CPUSET_WORDS - 1; j >= i; j--) {
2851 if (s->cpub[j] != 0) {
2852 bit = (uint_t)(highbit(s->cpub[j]) - 1);
2853 ASSERT(bit != (uint_t)-1);
2854 *largestid = bit + (j * BT_NBIPUL);
2855 ASSERT(*largestid >= *smallestid);
2856 return;
2857 }
2858 }
2859
2860 /*
2861 * If this code is reached, a
2862 * smallestid was found, but not a
2863 * largestid. The cpuset must have
2864 * been changed during the course
2865 * of this function call.
2866 */
2867 ASSERT(0);
2868 }
2869 }
2870 *smallestid = *largestid = CPUSET_NOTINSET;
2871 }
2872
2873 void
2874 cpuset_atomic_del(cpuset_t *s, const uint_t cpu)
2875 {
2876 VERIFY(cpu < NCPU);
2877 BT_ATOMIC_CLEAR(s->cpub, (cpu))
2878 }
2879
2880 void
2881 cpuset_atomic_add(cpuset_t *s, const uint_t cpu)
2882 {
2883 VERIFY(cpu < NCPU);
2884 BT_ATOMIC_SET(s->cpub, (cpu))
2885 }
2886
2887 long
2888 cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu)
2889 {
2890 long res;
2891
2892 VERIFY(cpu < NCPU);
2893 BT_ATOMIC_SET_EXCL(s->cpub, cpu, res);
2894 return (res);
2895 }
2896
2897 long
2898 cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu)
2899 {
2900 long res;
2901
2902 VERIFY(cpu < NCPU);
2903 BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res);
2904 return (res);
2905 }
2906
2907 void
2908 cpuset_or(cpuset_t *dst, cpuset_t *src)
2909 {
2910 for (int i = 0; i < CPUSET_WORDS; i++) {
2911 dst->cpub[i] |= src->cpub[i];
2912 }
2913 }
2914
2915 void
2916 cpuset_xor(cpuset_t *dst, cpuset_t *src)
2917 {
2918 for (int i = 0; i < CPUSET_WORDS; i++) {
2919 dst->cpub[i] ^= src->cpub[i];
2920 }
2921 }
2922
2923 void
2924 cpuset_and(cpuset_t *dst, cpuset_t *src)
2925 {
2926 for (int i = 0; i < CPUSET_WORDS; i++) {
2927 dst->cpub[i] &= src->cpub[i];
2928 }
2929 }
2930
2931 void
2932 cpuset_zero(cpuset_t *dst)
2933 {
2934 for (int i = 0; i < CPUSET_WORDS; i++) {
2935 dst->cpub[i] = 0;
2936 }
2937 }
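
/*
 * Illustrative sketch (not part of the original source) of the cpuset
 * accessors above: build a set containing a single CPU, test membership,
 * and free it. The body of the if is a placeholder for per-cpu work.
 *
 *	cpuset_t *set = cpuset_alloc(KM_SLEEP);
 *
 *	cpuset_only(set, CPU->cpu_id);
 *	if (cpu_in_set(set, CPU->cpu_id)) {
 *		// the current CPU is (trivially) a member of the set
 *	}
 *	cpuset_free(set);
 */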
2938
2939
2940 /*
2941 * Unbind threads bound to specified CPU.
2942 *
2943 * If `unbind_all_threads' is true, unbind all user threads bound to a given
2944 * CPU. Otherwise unbind all soft-bound user threads.
2945 */
2946 int
2947 cpu_unbind(processorid_t cpu, boolean_t unbind_all_threads)
2948 {
2949 processorid_t obind;
2950 kthread_t *tp;
2951 int ret = 0;
2952 proc_t *pp;
2953 int err, berr = 0;
2954
2955 ASSERT(MUTEX_HELD(&cpu_lock));
2956
2957 mutex_enter(&pidlock);
2958 for (pp = practive; pp != NULL; pp = pp->p_next) {
2959 mutex_enter(&pp->p_lock);
2960 tp = pp->p_tlist;
2961 /*
2962 * Skip zombies, kernel processes, and processes in
2963 * other zones, if called from a non-global zone.
2964 */
2965 if (tp == NULL || (pp->p_flag & SSYS) ||
2966 !HASZONEACCESS(curproc, pp->p_zone->zone_id)) {
2967 mutex_exit(&pp->p_lock);
2968 continue;
2969 }
2970 do {
2971 if (tp->t_bind_cpu != cpu)
2972 continue;
2973 /*
2974 * Skip threads with hard binding when
2975 * `unbind_all_threads' is not specified.
2976 */
2977 if (!unbind_all_threads && TB_CPU_IS_HARD(tp))
2978 continue;
2979 err = cpu_bind_thread(tp, PBIND_NONE, &obind, &berr);
2980 if (ret == 0)
2981 ret = err;
2982 } while ((tp = tp->t_forw) != pp->p_tlist);
2983 mutex_exit(&pp->p_lock);
2984 }
2985 mutex_exit(&pidlock);
2986 if (ret == 0)
2987 ret = berr;
2988 return (ret);
2989 }
2990
2991
2992 /*
2993 * Destroy all remaining bound threads on a cpu.
2994 */
2995 void
2996 cpu_destroy_bound_threads(cpu_t *cp)
2997 {
2998 extern id_t syscid;
2999 register kthread_id_t t, tlist, tnext;
3000
/*
 * Destroy all remaining bound threads on the cpu. This
 * should include both the interrupt threads and the idle thread.
 * This requires some care, since we need to traverse the
 * thread list with the pidlock mutex locked, but thread_free
 * also locks the pidlock mutex. So, we collect the threads
 * we're going to reap in a list headed by "tlist", then we
 * unlock the pidlock mutex and traverse the tlist list,
 * calling thread_free on each thread. Simple, n'est-ce pas?
 * Also, this depends on thread_free not mucking with the
 * t_next and t_prev links of the thread.
 */
3013
3014 if ((t = curthread) != NULL) {
3015
3016 tlist = NULL;
3017 mutex_enter(&pidlock);
3018 do {
3019 tnext = t->t_next;
3020 if (t->t_bound_cpu == cp) {
3021
3022 /*
3023 * We've found a bound thread, carefully unlink
3024 * it out of the thread list, and add it to
3025 * our "tlist". We "know" we don't have to
3026 * worry about unlinking curthread (the thread
3027 * that is executing this code).
3028 */
3029 t->t_next->t_prev = t->t_prev;
3030 t->t_prev->t_next = t->t_next;
3031 t->t_next = tlist;
3032 tlist = t;
3033 ASSERT(t->t_cid == syscid);
3034 /* wake up anyone blocked in thread_join */
3035 cv_broadcast(&t->t_joincv);
3036 /*
3037 * t_lwp set by interrupt threads and not
3038 * cleared.
3039 */
3040 t->t_lwp = NULL;
3041 /*
3042 * Pause and idle threads always have
3043 * t_state set to TS_ONPROC.
3044 */
3045 t->t_state = TS_FREE;
3046 t->t_prev = NULL; /* Just in case */
3047 }
3048
3049 } while ((t = tnext) != curthread);
3050
3051 mutex_exit(&pidlock);
3052
3053 mutex_sync();
3054 for (t = tlist; t != NULL; t = tnext) {
3055 tnext = t->t_next;
3056 thread_free(t);
3057 }
3058 }
3059 }
3060
3061 /*
3062 * Update the cpu_supp_freqs of this cpu. This information is returned
3063 * as part of cpu_info kstats. If the cpu_info_kstat exists already, then
3064 * maintain the kstat data size.
3065 */
3066 void
3067 cpu_set_supp_freqs(cpu_t *cp, const char *freqs)
3068 {
3069 char clkstr[sizeof ("18446744073709551615") + 1]; /* ui64 MAX */
3070 const char *lfreqs = clkstr;
3071 boolean_t kstat_exists = B_FALSE;
3072 kstat_t *ksp;
3073 size_t len;
3074
3075 /*
3076 * A NULL pointer means we only support one speed.
3077 */
3078 if (freqs == NULL)
3079 (void) snprintf(clkstr, sizeof (clkstr), "%"PRIu64,
3080 cp->cpu_curr_clock);
3081 else
3082 lfreqs = freqs;
3083
3084 /*
3085 * Make sure the frequency doesn't change while a snapshot is
3086 * going on. Of course, we only need to worry about this if
3087 * the kstat exists.
3088 */
3089 if ((ksp = cp->cpu_info_kstat) != NULL) {
3090 mutex_enter(ksp->ks_lock);
3091 kstat_exists = B_TRUE;
3092 }
3093
3094 /*
3095 * Free any previously allocated string and if the kstat
3096 * already exists, then update its data size.
3097 */
3098 if (cp->cpu_supp_freqs != NULL) {
3099 len = strlen(cp->cpu_supp_freqs) + 1;
3100 kmem_free(cp->cpu_supp_freqs, len);
3101 if (kstat_exists)
3102 ksp->ks_data_size -= len;
3103 }
3104
3105 /*
3106 * Allocate the new string and set the pointer.
3107 */
3108 len = strlen(lfreqs) + 1;
3109 cp->cpu_supp_freqs = kmem_alloc(len, KM_SLEEP);
3110 (void) strcpy(cp->cpu_supp_freqs, lfreqs);
3111
3112 /*
3113 * If the kstat already exists then update the data size and
3114 * free the lock.
3115 */
3116 if (kstat_exists) {
3117 ksp->ks_data_size += len;
3118 mutex_exit(ksp->ks_lock);
3119 }
3120 }
3121
3122 /*
* Indicate the current CPU's clock frequency (in Hz).
3124 * The calling context must be such that CPU references are safe.
3125 */
3126 void
3127 cpu_set_curr_clock(uint64_t new_clk)
3128 {
3129 uint64_t old_clk;
3130
3131 old_clk = CPU->cpu_curr_clock;
3132 CPU->cpu_curr_clock = new_clk;
3133
3134 /*
3135 * The cpu-change-speed DTrace probe exports the frequency in Hz
3136 */
3137 DTRACE_PROBE3(cpu__change__speed, processorid_t, CPU->cpu_id,
3138 uint64_t, old_clk, uint64_t, new_clk);
3139 }
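
/*
 * Illustrative sketch (not part of the original source): the SDT probe
 * emitted above can be observed from userland with dtrace(1M), where arg0
 * is the cpu id and arg1/arg2 are the old and new frequencies in Hz:
 *
 *	dtrace -n 'sdt:::cpu-change-speed
 *	    { printf("cpu %d: %d -> %d Hz", arg0, arg1, arg2); }'
 */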
3140
/*
 * processor_info(2) and p_online(2) status support functions
 * The constants returned by cpu_get_state() and cpu_get_state_str() are
 * for use in communicating processor state information to userland. Kernel
 * subsystems should only be using the cpu_flags value directly. Subsystems
 * modifying cpu_flags should record the state change via a call to
 * cpu_set_state().
 */
3149
3150 /*
3151 * Update the pi_state of this CPU. This function provides the CPU status for
3152 * the information returned by processor_info(2).
3153 */
3154 void
3155 cpu_set_state(cpu_t *cpu)
3156 {
3157 ASSERT(MUTEX_HELD(&cpu_lock));
3158 cpu->cpu_type_info.pi_state = cpu_get_state(cpu);
3159 cpu->cpu_state_begin = gethrestime_sec();
3160 pool_cpu_mod = gethrtime();
3161 }
3162
3163 /*
3164 * Return offline/online/other status for the indicated CPU. Use only for
3165 * communication with user applications; cpu_flags provides the in-kernel
3166 * interface.
3167 */
3168 int
3169 cpu_get_state(cpu_t *cpu)
3170 {
3171 ASSERT(MUTEX_HELD(&cpu_lock));
3172 if (cpu->cpu_flags & CPU_POWEROFF)
3173 return (P_POWEROFF);
3174 else if (cpu->cpu_flags & CPU_FAULTED)
3175 return (P_FAULTED);
3176 else if (cpu->cpu_flags & CPU_SPARE)
3177 return (P_SPARE);
3178 else if ((cpu->cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY)
3179 return (P_OFFLINE);
3180 else if (cpu->cpu_flags & CPU_ENABLE)
3181 return (P_ONLINE);
3182 else
3183 return (P_NOINTR);
3184 }
3185
3186 /*
3187 * Return processor_info(2) state as a string.
3188 */
3189 const char *
3190 cpu_get_state_str(cpu_t *cpu)
3191 {
3192 const char *string;
3193
3194 switch (cpu_get_state(cpu)) {
3195 case P_ONLINE:
3196 string = PS_ONLINE;
3197 break;
3198 case P_POWEROFF:
3199 string = PS_POWEROFF;
3200 break;
3201 case P_NOINTR:
3202 string = PS_NOINTR;
3203 break;
3204 case P_SPARE:
3205 string = PS_SPARE;
3206 break;
3207 case P_FAULTED:
3208 string = PS_FAULTED;
3209 break;
3210 case P_OFFLINE:
3211 string = PS_OFFLINE;
3212 break;
3213 default:
3214 string = "unknown";
3215 break;
3216 }
3217 return (string);
3218 }
3219
3220 /*
3221 * Export this CPU's statistics (cpu_stat_t and cpu_stats_t) as raw and named
3222 * kstats, respectively. This is done when a CPU is initialized or placed
3223 * online via p_online(2).
3224 */
3225 static void
3226 cpu_stats_kstat_create(cpu_t *cp)
3227 {
3228 int instance = cp->cpu_id;
3229 char *module = "cpu";
3230 char *class = "misc";
3231 kstat_t *ksp;
3232 zoneid_t zoneid;
3233
3234 ASSERT(MUTEX_HELD(&cpu_lock));
3235
3236 if (pool_pset_enabled())
3237 zoneid = GLOBAL_ZONEID;
3238 else
3239 zoneid = ALL_ZONES;
3240 /*
3241 * Create named kstats
3242 */
3243 #define CPU_STATS_KS_CREATE(name, tsize, update_func) \
3244 ksp = kstat_create_zone(module, instance, (name), class, \
3245 KSTAT_TYPE_NAMED, (tsize) / sizeof (kstat_named_t), 0, \
3246 zoneid); \
3247 if (ksp != NULL) { \
3248 ksp->ks_private = cp; \
3249 ksp->ks_update = (update_func); \
3250 kstat_install(ksp); \
3251 } else \
3252 cmn_err(CE_WARN, "cpu: unable to create %s:%d:%s kstat", \
3253 module, instance, (name));
3254
3255 CPU_STATS_KS_CREATE("sys", sizeof (cpu_sys_stats_ks_data_template),
3256 cpu_sys_stats_ks_update);
3257 CPU_STATS_KS_CREATE("vm", sizeof (cpu_vm_stats_ks_data_template),
3258 cpu_vm_stats_ks_update);
3259
3260 /*
3261 * Export the familiar cpu_stat_t KSTAT_TYPE_RAW kstat.
3262 */
3263 ksp = kstat_create_zone("cpu_stat", cp->cpu_id, NULL,
3264 "misc", KSTAT_TYPE_RAW, sizeof (cpu_stat_t), 0, zoneid);
3265 if (ksp != NULL) {
3266 ksp->ks_update = cpu_stat_ks_update;
3267 ksp->ks_private = cp;
3268 kstat_install(ksp);
3269 }
3270 }
3271
3272 static void
3273 cpu_stats_kstat_destroy(cpu_t *cp)
3274 {
3275 char ks_name[KSTAT_STRLEN];
3276
3277 (void) sprintf(ks_name, "cpu_stat%d", cp->cpu_id);
3278 kstat_delete_byname("cpu_stat", cp->cpu_id, ks_name);
3279
3280 kstat_delete_byname("cpu", cp->cpu_id, "sys");
3281 kstat_delete_byname("cpu", cp->cpu_id, "vm");
3282 }
3283
3284 static int
3285 cpu_sys_stats_ks_update(kstat_t *ksp, int rw)
3286 {
3287 cpu_t *cp = (cpu_t *)ksp->ks_private;
3288 struct cpu_sys_stats_ks_data *csskd;
3289 cpu_sys_stats_t *css;
3290 hrtime_t msnsecs[NCMSTATES];
3291 int i;
3292
3293 if (rw == KSTAT_WRITE)
3294 return (EACCES);
3295
3296 csskd = ksp->ks_data;
3297 css = &cp->cpu_stats.sys;
3298
3299 /*
3300 * Read CPU mstate, but compare with the last values we
3301 * received to make sure that the returned kstats never
3302 * decrease.
3303 */
3304
3305 get_cpu_mstate(cp, msnsecs);
3306 if (csskd->cpu_nsec_idle.value.ui64 > msnsecs[CMS_IDLE])
3307 msnsecs[CMS_IDLE] = csskd->cpu_nsec_idle.value.ui64;
3308 if (csskd->cpu_nsec_user.value.ui64 > msnsecs[CMS_USER])
3309 msnsecs[CMS_USER] = csskd->cpu_nsec_user.value.ui64;
3310 if (csskd->cpu_nsec_kernel.value.ui64 > msnsecs[CMS_SYSTEM])
3311 msnsecs[CMS_SYSTEM] = csskd->cpu_nsec_kernel.value.ui64;
3312
3313 bcopy(&cpu_sys_stats_ks_data_template, ksp->ks_data,
3314 sizeof (cpu_sys_stats_ks_data_template));
3315
3316 csskd->cpu_ticks_wait.value.ui64 = 0;
3317 csskd->wait_ticks_io.value.ui64 = 0;
3318
3319 csskd->cpu_nsec_idle.value.ui64 = msnsecs[CMS_IDLE];
3320 csskd->cpu_nsec_user.value.ui64 = msnsecs[CMS_USER];
3321 csskd->cpu_nsec_kernel.value.ui64 = msnsecs[CMS_SYSTEM];
3322 csskd->cpu_ticks_idle.value.ui64 =
3323 NSEC_TO_TICK(csskd->cpu_nsec_idle.value.ui64);
3324 csskd->cpu_ticks_user.value.ui64 =
3325 NSEC_TO_TICK(csskd->cpu_nsec_user.value.ui64);
3326 csskd->cpu_ticks_kernel.value.ui64 =
3327 NSEC_TO_TICK(csskd->cpu_nsec_kernel.value.ui64);
3328 csskd->cpu_nsec_dtrace.value.ui64 = cp->cpu_dtrace_nsec;
3329 csskd->dtrace_probes.value.ui64 = cp->cpu_dtrace_probes;
3330 csskd->cpu_nsec_intr.value.ui64 = cp->cpu_intrlast;
3331 csskd->cpu_load_intr.value.ui64 = cp->cpu_intrload;
3332 csskd->bread.value.ui64 = css->bread;
3333 csskd->bwrite.value.ui64 = css->bwrite;
3334 csskd->lread.value.ui64 = css->lread;
3335 csskd->lwrite.value.ui64 = css->lwrite;
3336 csskd->phread.value.ui64 = css->phread;
3337 csskd->phwrite.value.ui64 = css->phwrite;
3338 csskd->pswitch.value.ui64 = css->pswitch;
3339 csskd->trap.value.ui64 = css->trap;
3340 csskd->intr.value.ui64 = 0;
3341 for (i = 0; i < PIL_MAX; i++)
3342 csskd->intr.value.ui64 += css->intr[i];
3343 csskd->syscall.value.ui64 = css->syscall;
3344 csskd->sysread.value.ui64 = css->sysread;
3345 csskd->syswrite.value.ui64 = css->syswrite;
3346 csskd->sysfork.value.ui64 = css->sysfork;
3347 csskd->sysvfork.value.ui64 = css->sysvfork;
3348 csskd->sysexec.value.ui64 = css->sysexec;
3349 csskd->readch.value.ui64 = css->readch;
3350 csskd->writech.value.ui64 = css->writech;
3351 csskd->rcvint.value.ui64 = css->rcvint;
3352 csskd->xmtint.value.ui64 = css->xmtint;
3353 csskd->mdmint.value.ui64 = css->mdmint;
3354 csskd->rawch.value.ui64 = css->rawch;
3355 csskd->canch.value.ui64 = css->canch;
3356 csskd->outch.value.ui64 = css->outch;
3357 csskd->msg.value.ui64 = css->msg;
3358 csskd->sema.value.ui64 = css->sema;
3359 csskd->namei.value.ui64 = css->namei;
3360 csskd->ufsiget.value.ui64 = css->ufsiget;
3361 csskd->ufsdirblk.value.ui64 = css->ufsdirblk;
3362 csskd->ufsipage.value.ui64 = css->ufsipage;
3363 csskd->ufsinopage.value.ui64 = css->ufsinopage;
3364 csskd->procovf.value.ui64 = css->procovf;
3365 csskd->intrthread.value.ui64 = 0;
3366 for (i = 0; i < LOCK_LEVEL - 1; i++)
3367 csskd->intrthread.value.ui64 += css->intr[i];
3368 csskd->intrblk.value.ui64 = css->intrblk;
3369 csskd->intrunpin.value.ui64 = css->intrunpin;
3370 csskd->idlethread.value.ui64 = css->idlethread;
3371 csskd->inv_swtch.value.ui64 = css->inv_swtch;
3372 csskd->nthreads.value.ui64 = css->nthreads;
3373 csskd->cpumigrate.value.ui64 = css->cpumigrate;
3374 csskd->xcalls.value.ui64 = css->xcalls;
3375 csskd->mutex_adenters.value.ui64 = css->mutex_adenters;
3376 csskd->rw_rdfails.value.ui64 = css->rw_rdfails;
3377 csskd->rw_wrfails.value.ui64 = css->rw_wrfails;
3378 csskd->modload.value.ui64 = css->modload;
3379 csskd->modunload.value.ui64 = css->modunload;
3380 csskd->bawrite.value.ui64 = css->bawrite;
3381 csskd->iowait.value.ui64 = css->iowait;
3382
3383 return (0);
3384 }
3385
3386 static int
3387 cpu_vm_stats_ks_update(kstat_t *ksp, int rw)
3388 {
3389 cpu_t *cp = (cpu_t *)ksp->ks_private;
3390 struct cpu_vm_stats_ks_data *cvskd;
3391 cpu_vm_stats_t *cvs;
3392
3393 if (rw == KSTAT_WRITE)
3394 return (EACCES);
3395
3396 cvs = &cp->cpu_stats.vm;
3397 cvskd = ksp->ks_data;
3398
3399 bcopy(&cpu_vm_stats_ks_data_template, ksp->ks_data,
3400 sizeof (cpu_vm_stats_ks_data_template));
3401 cvskd->pgrec.value.ui64 = cvs->pgrec;
3402 cvskd->pgfrec.value.ui64 = cvs->pgfrec;
3403 cvskd->pgin.value.ui64 = cvs->pgin;
3404 cvskd->pgpgin.value.ui64 = cvs->pgpgin;
3405 cvskd->pgout.value.ui64 = cvs->pgout;
3406 cvskd->pgpgout.value.ui64 = cvs->pgpgout;
3407 cvskd->swapin.value.ui64 = cvs->swapin;
3408 cvskd->pgswapin.value.ui64 = cvs->pgswapin;
3409 cvskd->swapout.value.ui64 = cvs->swapout;
3410 cvskd->pgswapout.value.ui64 = cvs->pgswapout;
3411 cvskd->zfod.value.ui64 = cvs->zfod;
3412 cvskd->dfree.value.ui64 = cvs->dfree;
3413 cvskd->scan.value.ui64 = cvs->scan;
3414 cvskd->rev.value.ui64 = cvs->rev;
3415 cvskd->hat_fault.value.ui64 = cvs->hat_fault;
3416 cvskd->as_fault.value.ui64 = cvs->as_fault;
3417 cvskd->maj_fault.value.ui64 = cvs->maj_fault;
3418 cvskd->cow_fault.value.ui64 = cvs->cow_fault;
3419 cvskd->prot_fault.value.ui64 = cvs->prot_fault;
3420 cvskd->softlock.value.ui64 = cvs->softlock;
3421 cvskd->kernel_asflt.value.ui64 = cvs->kernel_asflt;
3422 cvskd->pgrrun.value.ui64 = cvs->pgrrun;
3423 cvskd->execpgin.value.ui64 = cvs->execpgin;
3424 cvskd->execpgout.value.ui64 = cvs->execpgout;
3425 cvskd->execfree.value.ui64 = cvs->execfree;
3426 cvskd->anonpgin.value.ui64 = cvs->anonpgin;
3427 cvskd->anonpgout.value.ui64 = cvs->anonpgout;
3428 cvskd->anonfree.value.ui64 = cvs->anonfree;
3429 cvskd->fspgin.value.ui64 = cvs->fspgin;
3430 cvskd->fspgout.value.ui64 = cvs->fspgout;
3431 cvskd->fsfree.value.ui64 = cvs->fsfree;
3432
3433 return (0);
3434 }
3435
3436 static int
3437 cpu_stat_ks_update(kstat_t *ksp, int rw)
3438 {
3439 cpu_stat_t *cso;
3440 cpu_t *cp;
3441 int i;
3442 hrtime_t msnsecs[NCMSTATES];
3443
3444 cso = (cpu_stat_t *)ksp->ks_data;
3445 cp = (cpu_t *)ksp->ks_private;
3446
3447 if (rw == KSTAT_WRITE)
3448 return (EACCES);
3449
3450 /*
3451 * Read CPU mstate, but compare with the last values we
3452 * received to make sure that the returned kstats never
3453 * decrease.
3454 */
3455
3456 get_cpu_mstate(cp, msnsecs);
3457 msnsecs[CMS_IDLE] = NSEC_TO_TICK(msnsecs[CMS_IDLE]);
3458 msnsecs[CMS_USER] = NSEC_TO_TICK(msnsecs[CMS_USER]);
3459 msnsecs[CMS_SYSTEM] = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
3460 if (cso->cpu_sysinfo.cpu[CPU_IDLE] < msnsecs[CMS_IDLE])
3461 cso->cpu_sysinfo.cpu[CPU_IDLE] = msnsecs[CMS_IDLE];
3462 if (cso->cpu_sysinfo.cpu[CPU_USER] < msnsecs[CMS_USER])
3463 cso->cpu_sysinfo.cpu[CPU_USER] = msnsecs[CMS_USER];
3464 if (cso->cpu_sysinfo.cpu[CPU_KERNEL] < msnsecs[CMS_SYSTEM])
3465 cso->cpu_sysinfo.cpu[CPU_KERNEL] = msnsecs[CMS_SYSTEM];
3466 cso->cpu_sysinfo.cpu[CPU_WAIT] = 0;
3467 cso->cpu_sysinfo.wait[W_IO] = 0;
3468 cso->cpu_sysinfo.wait[W_SWAP] = 0;
3469 cso->cpu_sysinfo.wait[W_PIO] = 0;
3470 cso->cpu_sysinfo.bread = CPU_STATS(cp, sys.bread);
3471 cso->cpu_sysinfo.bwrite = CPU_STATS(cp, sys.bwrite);
3472 cso->cpu_sysinfo.lread = CPU_STATS(cp, sys.lread);
3473 cso->cpu_sysinfo.lwrite = CPU_STATS(cp, sys.lwrite);
3474 cso->cpu_sysinfo.phread = CPU_STATS(cp, sys.phread);
3475 cso->cpu_sysinfo.phwrite = CPU_STATS(cp, sys.phwrite);
3476 cso->cpu_sysinfo.pswitch = CPU_STATS(cp, sys.pswitch);
3477 cso->cpu_sysinfo.trap = CPU_STATS(cp, sys.trap);
3478 cso->cpu_sysinfo.intr = 0;
3479 for (i = 0; i < PIL_MAX; i++)
3480 cso->cpu_sysinfo.intr += CPU_STATS(cp, sys.intr[i]);
3481 cso->cpu_sysinfo.syscall = CPU_STATS(cp, sys.syscall);
3482 cso->cpu_sysinfo.sysread = CPU_STATS(cp, sys.sysread);
3483 cso->cpu_sysinfo.syswrite = CPU_STATS(cp, sys.syswrite);
3484 cso->cpu_sysinfo.sysfork = CPU_STATS(cp, sys.sysfork);
3485 cso->cpu_sysinfo.sysvfork = CPU_STATS(cp, sys.sysvfork);
3486 cso->cpu_sysinfo.sysexec = CPU_STATS(cp, sys.sysexec);
3487 cso->cpu_sysinfo.readch = CPU_STATS(cp, sys.readch);
3488 cso->cpu_sysinfo.writech = CPU_STATS(cp, sys.writech);
3489 cso->cpu_sysinfo.rcvint = CPU_STATS(cp, sys.rcvint);
3490 cso->cpu_sysinfo.xmtint = CPU_STATS(cp, sys.xmtint);
3491 cso->cpu_sysinfo.mdmint = CPU_STATS(cp, sys.mdmint);
3492 cso->cpu_sysinfo.rawch = CPU_STATS(cp, sys.rawch);
3493 cso->cpu_sysinfo.canch = CPU_STATS(cp, sys.canch);
3494 cso->cpu_sysinfo.outch = CPU_STATS(cp, sys.outch);
3495 cso->cpu_sysinfo.msg = CPU_STATS(cp, sys.msg);
3496 cso->cpu_sysinfo.sema = CPU_STATS(cp, sys.sema);
3497 cso->cpu_sysinfo.namei = CPU_STATS(cp, sys.namei);
3498 cso->cpu_sysinfo.ufsiget = CPU_STATS(cp, sys.ufsiget);
3499 cso->cpu_sysinfo.ufsdirblk = CPU_STATS(cp, sys.ufsdirblk);
3500 cso->cpu_sysinfo.ufsipage = CPU_STATS(cp, sys.ufsipage);
3501 cso->cpu_sysinfo.ufsinopage = CPU_STATS(cp, sys.ufsinopage);
3502 cso->cpu_sysinfo.inodeovf = 0;
3503 cso->cpu_sysinfo.fileovf = 0;
3504 cso->cpu_sysinfo.procovf = CPU_STATS(cp, sys.procovf);
3505 cso->cpu_sysinfo.intrthread = 0;
3506 for (i = 0; i < LOCK_LEVEL - 1; i++)
3507 cso->cpu_sysinfo.intrthread += CPU_STATS(cp, sys.intr[i]);
3508 cso->cpu_sysinfo.intrblk = CPU_STATS(cp, sys.intrblk);
3509 cso->cpu_sysinfo.idlethread = CPU_STATS(cp, sys.idlethread);
3510 cso->cpu_sysinfo.inv_swtch = CPU_STATS(cp, sys.inv_swtch);
3511 cso->cpu_sysinfo.nthreads = CPU_STATS(cp, sys.nthreads);
3512 cso->cpu_sysinfo.cpumigrate = CPU_STATS(cp, sys.cpumigrate);
3513 cso->cpu_sysinfo.xcalls = CPU_STATS(cp, sys.xcalls);
3514 cso->cpu_sysinfo.mutex_adenters = CPU_STATS(cp, sys.mutex_adenters);
3515 cso->cpu_sysinfo.rw_rdfails = CPU_STATS(cp, sys.rw_rdfails);
3516 cso->cpu_sysinfo.rw_wrfails = CPU_STATS(cp, sys.rw_wrfails);
3517 cso->cpu_sysinfo.modload = CPU_STATS(cp, sys.modload);
3518 cso->cpu_sysinfo.modunload = CPU_STATS(cp, sys.modunload);
3519 cso->cpu_sysinfo.bawrite = CPU_STATS(cp, sys.bawrite);
3520 cso->cpu_sysinfo.rw_enters = 0;
3521 cso->cpu_sysinfo.win_uo_cnt = 0;
3522 cso->cpu_sysinfo.win_uu_cnt = 0;
3523 cso->cpu_sysinfo.win_so_cnt = 0;
3524 cso->cpu_sysinfo.win_su_cnt = 0;
3525 cso->cpu_sysinfo.win_suo_cnt = 0;
3526
3527 cso->cpu_syswait.iowait = CPU_STATS(cp, sys.iowait);
3528 cso->cpu_syswait.swap = 0;
3529 cso->cpu_syswait.physio = 0;
3530
3531 cso->cpu_vminfo.pgrec = CPU_STATS(cp, vm.pgrec);
3532 cso->cpu_vminfo.pgfrec = CPU_STATS(cp, vm.pgfrec);
3533 cso->cpu_vminfo.pgin = CPU_STATS(cp, vm.pgin);
3534 cso->cpu_vminfo.pgpgin = CPU_STATS(cp, vm.pgpgin);
3535 cso->cpu_vminfo.pgout = CPU_STATS(cp, vm.pgout);
3536 cso->cpu_vminfo.pgpgout = CPU_STATS(cp, vm.pgpgout);
3537 cso->cpu_vminfo.swapin = CPU_STATS(cp, vm.swapin);
3538 cso->cpu_vminfo.pgswapin = CPU_STATS(cp, vm.pgswapin);
3539 cso->cpu_vminfo.swapout = CPU_STATS(cp, vm.swapout);
3540 cso->cpu_vminfo.pgswapout = CPU_STATS(cp, vm.pgswapout);
3541 cso->cpu_vminfo.zfod = CPU_STATS(cp, vm.zfod);
3542 cso->cpu_vminfo.dfree = CPU_STATS(cp, vm.dfree);
3543 cso->cpu_vminfo.scan = CPU_STATS(cp, vm.scan);
3544 cso->cpu_vminfo.rev = CPU_STATS(cp, vm.rev);
3545 cso->cpu_vminfo.hat_fault = CPU_STATS(cp, vm.hat_fault);
3546 cso->cpu_vminfo.as_fault = CPU_STATS(cp, vm.as_fault);
3547 cso->cpu_vminfo.maj_fault = CPU_STATS(cp, vm.maj_fault);
3548 cso->cpu_vminfo.cow_fault = CPU_STATS(cp, vm.cow_fault);
3549 cso->cpu_vminfo.prot_fault = CPU_STATS(cp, vm.prot_fault);
3550 cso->cpu_vminfo.softlock = CPU_STATS(cp, vm.softlock);
3551 cso->cpu_vminfo.kernel_asflt = CPU_STATS(cp, vm.kernel_asflt);
3552 cso->cpu_vminfo.pgrrun = CPU_STATS(cp, vm.pgrrun);
3553 cso->cpu_vminfo.execpgin = CPU_STATS(cp, vm.execpgin);
3554 cso->cpu_vminfo.execpgout = CPU_STATS(cp, vm.execpgout);
3555 cso->cpu_vminfo.execfree = CPU_STATS(cp, vm.execfree);
3556 cso->cpu_vminfo.anonpgin = CPU_STATS(cp, vm.anonpgin);
3557 cso->cpu_vminfo.anonpgout = CPU_STATS(cp, vm.anonpgout);
3558 cso->cpu_vminfo.anonfree = CPU_STATS(cp, vm.anonfree);
3559 cso->cpu_vminfo.fspgin = CPU_STATS(cp, vm.fspgin);
3560 cso->cpu_vminfo.fspgout = CPU_STATS(cp, vm.fspgout);
3561 cso->cpu_vminfo.fsfree = CPU_STATS(cp, vm.fsfree);
3562
3563 return (0);
3564 }