1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2013 David Hoeppner. All rights reserved.
14 */
15
16 /*
17 * Interrupt Load Balancer.
18 *
19 * The interrupt load balancer reassigns interrupts from one cpu
20 * to another, if the interrupt load
21 */
22
23 #include <sys/param.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/callb.h>
27 #include <sys/cpuvar.h>
28 #include <sys/proc.h>
29 #include <sys/processor.h>
30 #include <sys/sdt.h>
31 #include <sys/sysmacros.h>
32 #include <sys/time.h>
33 #include <sys/cmn_err.h>
34 #include <sys/zone.h>
35 #include <sys/lgrp.h>
36
37 extern proc_t *proc_intrd;
38
#define	IB_NAME	"intrd"		/* thread/process name */

/*
 * Sleep intervals (seconds) between balancing passes: the normal
 * interval, a longer one for an idle system, and a very long one when
 * only a single CPU is available (nothing to balance).
 */
#define	IB_NORMAL_SLEEPTIME	10
#define	IB_IDLE_SLEEPTIME	45
#define	IB_ONECPU_SLEEPTIME	(60 * 15)

/* ib_lock protects the balancer state; ib_cv wakes the thread early. */
static kmutex_t ib_lock;
static kcondvar_t ib_cv;

/*
 * System tuneable.
 *
 * List of interrupts to exclude from balancing.
 * NOTE(review): currently only logged, never parsed -- see
 * interrupt_balancer().
 */
static char *ib_exclude = NULL;
57
58 typedef struct _ib_exclude {
59 list_node_t *ix_next;
60 processorid_t ix_cpu_id;
61 } ib_exclude_list_t;
62
63 static list_t ib_exclude_list;
64
65
66
/*
 * Per-CPU state tracked by the balancer.
 */
typedef struct _ib_cpu {
	list_node_t	ic_next;	/* link */
	processorid_t	ic_cpu_id;	/* processor id */
	boolean_t	ic_offline;	/* CPU may not take interrupts */
	list_t		ic_ivec_list;	/* list of interrupt vectors */
	uint32_t	ic_num_ivecs;	/* number of interrupt vectors */
	hrtime_t	ic_tot;		/* total time (idle+user+system) */
	int64_t		ic_intrs;	/* nsec spent in interrupts; name
					   suggests a count, but
					   ib_delta_generate() sums times */
	int		ic_intr_load;	/* interrupts / total time */
	int64_t		ic_big_intrs;	/* NOTE(review): only ever zeroed */
	int64_t		ic_bigintr;	/* largest interrupt on cpu */
	lgrp_t		*ic_lgrp;	/* locality group of this cpu */
} ib_cpu_t;

/*
 * Interrupt vector.
 */
typedef struct _ib_ivec {
	list_node_t	ii_next;	/* link */
	uint64_t	ii_ihs;		/* handler count (starts at 1) */
	uint64_t	ii_ino;		/* interrupt number */
	uint64_t	ii_num_ino;	/* inos sharing the vector -- TODO confirm */
	uint64_t	ii_pil;		/* priority level */
	uint64_t	ii_time;	/* nsec consumed (from kstat "time") */
	char		*ii_buspath;	/* bus path (not copied yet, XXX) */
	char		*ii_name;
	processorid_t	ii_orig_cpu;	/* current CPU */
	processorid_t	ii_now_cpu;	/* new to be assigned CPU */
	uint64_t	ii_inum;	/* NOTE(review): never assigned in
					   this file, yet compared against */
	boolean_t	ii_goal;	/* keep on source CPU? */
} ib_ivec_t;
101
/*
 * MSI device entry: a device name plus the list of inos it owns.
 */
typedef struct _ib_msi {
	list_node_t	im_next;	/* link */
	const char	*im_name;	/* device name (not copied) */
	list_t		im_ino_list;	/* list of ib_msi_ino_t */
} ib_msi_t;

typedef struct _ib_msi_ino {
	list_node_t	imi_next;	/* link */
	uint64_t	imi_ino;	/* interrupt number */
	ib_ivec_t	*imi_ivec;	/* back pointer to the vector */
} ib_msi_ino_t;

/*
 * Snapshot: the state of all CPUs and their interrupt vectors at one
 * instant, as collected by ib_get_statistics().
 */
typedef struct _ib_snapshot {
	list_node_t	is_next;	/* link */
	list_t		is_cpu_list;	/* ib_cpu_t copies */
	processorid_t	is_num_cpus;	/* number of list entries */
} ib_snapshot_t;
125
/*
 * Snapshot delta structure: the difference between two consecutive
 * snapshots.  id_missing is set when the snapshots are not comparable
 * (CPU or vector configuration changed); consumers must check it.
 */
typedef struct _ib_delta {
	list_node_t	id_next;	/* link */
	list_t		id_cpu_list;	/* per-CPU deltas (ib_cpu_t) */
	boolean_t	id_missing;	/* snapshots not comparable */
	int		id_avgintrload;	/* interrupts / total time */
	uint64_t	id_avgintrnsec;	/* avg interrupt nsec per CPU */
	int		id_goodness;	/* NOTE(review): read by
					   ib_do_reconfig() but never
					   written anywhere -- verify */
} ib_delta_t;

static list_t ib_cpu_list;		/* List of all OS CPUs */

static long ib_sleeptime = IB_NORMAL_SLEEPTIME;	/* seconds between passes */
static processorid_t ib_num_cpus;	/* entries on ib_cpu_list */

/* Tunables: load considered unsafe, minimum goodness change of note. */
static int goodness_unsafe_load = 90;
static int goodness_mindelta = 10;
145
/*
 * Function prototypes (file-local; see the definitions below for the
 * individual contracts).
 */
static void ib_cpu_register(processorid_t);
static int ib_cpu_setup(cpu_setup_t, int, void *);
static boolean_t ib_cpu_exclude(processorid_t);
static ib_cpu_t *ib_cpu_create(void);
static ib_cpu_t *ib_cpu_find(list_t, processorid_t);
static void ib_cpu_destroy(ib_cpu_t *);

static int ib_goodness(ib_delta_t *);
static int ib_do_reconfig(ib_delta_t *);
static int ib_imbalanced(int, int);
static int ib_interrupt_do_move(ib_ivec_t *, processorid_t);
static void ib_interrupt_move_check(ib_delta_t *, processorid_t, processorid_t);

static ib_snapshot_t *ib_get_statistics(void);
static ib_delta_t *ib_delta_generate(ib_snapshot_t *, ib_snapshot_t *);
164
/*
 * Helper macros.
 */

/* True when the OS has a cpu structure for this sequential id. */
#define	IS_CPU(cpu_id)	(cpu[cpu_id] != NULL)

/* Iterate an ib_cpu_t list; the list argument must be an lvalue. */
#define	FOREACH_CPU(icpu, icpu_list)	\
	for (icpu = list_head(&icpu_list); icpu != NULL;	\
	    icpu = list_next(&icpu_list, icpu))

/* Iterate an ib_ivec_t list; the list argument must be an lvalue. */
#define	FOREACH_IVEC(ivec, ivec_list)	\
	for (ivec = list_head(&ivec_list); ivec != NULL;	\
	    ivec = list_next(&ivec_list, ivec))

/* Static DTrace probe named __intrd_<name>. */
#define	DTRACE_INTRD(name)	\
	DTRACE_PROBE(__intrd_##name)

/*
 * NOTE(review): DEBUG is unconditionally defined here, so the verbose
 * variants below are always compiled in.  Presumably a development
 * leftover -- confirm before integration.
 */
#define	DEBUG	1
#ifdef DEBUG
#define	IB_APIDBG(args)	cmn_err args
#define	IB_IMPLDBG(args)	cmn_err args
#else
#define	IB_APIDBG(args)
#define	IB_IMPLDBG(args)
#endif

#define	IB_LOG(args)	cmn_err args
191
192 void
193 interrupt_balancer(void)
194 {
195 processorid_t cpu_id;
196 callb_cpr_t cpr;
197 user_t *u = PTOU(curproc);
198 int error;
199
200 boolean_t do_reconfig = B_FALSE;
201 int goodness;
202 int baseline_goodness = 0;
203 list_t ib_delta_list;
204 hrtime_t statslen = 60;
205
206 proc_intrd = ttoproc(curthread);
207 proc_intrd->p_cstime = proc_intrd->p_stime = 0;
208 proc_intrd->p_cutime = proc_intrd->p_utime = 0;
209
210 (void) strncpy(u->u_psargs, IB_NAME, sizeof(u->u_psargs));
211 (void) strncpy(u->u_comm, IB_NAME, sizeof(u->u_comm));
212
213 /* Initialize global mutex lock */
214 mutex_init(&ib_lock, NULL, MUTEX_DEFAULT, NULL);
215
216 /* Initialize CPU list */
217 list_create(&ib_cpu_list, sizeof (ib_cpu_t),
218 offsetof(ib_cpu_t, ic_next));
219
220 /* Initialize delta list */
221 list_create(&ib_delta_list, sizeof (ib_delta_t),
222 offsetof(ib_delta_t, id_next));
223
224 /* Initialize interrupt exclude list */
225 list_create(&ib_exclude_list, sizeof (ib_exclude_list_t),
226 offsetof(ib_exclude_list_t, ix_next));
227
228 /*
229 * Parse list of interrupts to exclude.
230 *
231 * XXX: move interrupts to active processors.
232 */
233 if (ib_exclude != NULL) {
234 processorid_t rval;
235
236 IB_LOG((CE_CONT, "XXX %s XXX", ib_exclude));
237 }
238
239 /*
240 * Build a list of all CPUs available for interrupt handling.
241 */
242 for (cpu_id = 0; cpu_id <= max_cpu_seqid_ever; cpu_id++) {
243 if (IS_CPU(cpu_id))
244 ib_cpu_register(cpu_id);
245 }
246
247 /*
248 * Locality group information.
249 */
250 int i;
251 for (i = 0; i < lgrp_plat_max_lgrps(); i++) {
252 lgrp_t *lgrp;
253
254 lgrp = lgrp_table[i];
255 if (lgrp->lgrp_cpu != NULL) {
256 cpu_t *lgrp_cpu;
257
258 for (lgrp_cpu = lgrp->lgrp_cpu; lgrp_cpu != NULL;
259 lgrp_cpu =lgrp_cpu->cpu_next_lgrp) {
260 ib_cpu_t *icpu;
261
262 icpu = ib_cpu_find(ib_cpu_list, lgrp_cpu->cpu_id);
263
264 /*
265 * Assign locality group if we found a CPU.
266 */
267 if (icpu != NULL)
268 icpu->ic_lgrp = lgrp;
269 }
270 }
271 }
272
273 /*
274 * Register a callback if a CPU goes offline or comes online.
275 */
276 mutex_enter(&cpu_lock);
277 register_cpu_setup_func(ib_cpu_setup, NULL);
278 mutex_exit(&cpu_lock);
279
280 CALLB_CPR_INIT(&cpr, &ib_lock, callb_generic_cpr, IB_NAME);
281
282 ib_snapshot_t *snapshot = NULL;
283 ib_snapshot_t *new_snapshot = NULL;
284 hrtime_t delta_time;
285 hrtime_t deltas_tottime = 0;
286 boolean_t below_statslen;
287
288 snapshot = ib_get_statistics();
289
290 mutex_enter(&ib_lock);
291 for (;;) {
292 ib_delta_t *delta;
293
294 DTRACE_INTRD(get_stats);
295 new_snapshot = ib_get_statistics();
296
297 delta = ib_delta_generate(snapshot, new_snapshot);
298
299 below_statslen = (deltas_tottime < statslen);
300 deltas_tottime += delta_time;
301 do_reconfig = (below_statslen && deltas_tottime >= statslen);
302
303 list_insert_tail(&ib_delta_list, delta);
304
305 /*
306 * Calculate the goodness of the current configuration.
307 */
308 goodness = ib_goodness(delta);
309
310 if (ib_imbalanced(goodness, baseline_goodness))
311 do_reconfig = B_TRUE;
312
313 /*
314 * Reconfigure interrupt distribution.
315 */
316 if (do_reconfig) {
317 error = ib_do_reconfig(delta);
318
319 if (error != 0) {
320 if (error == -1)
321 IB_LOG((CE_CONT, "ib_do_reconfig failed!"));
322 } else {
323 IB_LOG((CE_CONT, "setting new baseline of %d", goodness));
324 baseline_goodness = goodness;
325 }
326 }
327
328 /*
329 * Wait for timeout or CPU reconfiguration.
330 */
331 CALLB_CPR_SAFE_BEGIN(&cpr);
332 cv_timedwait(&ib_cv, &ib_lock, ddi_get_lbolt() +
333 SEC_TO_TICK(ib_sleeptime));
334 CALLB_CPR_SAFE_END(&cpr, &ib_lock);
335 }
336
337 CALLB_CPR_EXIT(&cpr);
338
339 /*
340 * Unregister CPU callback.
341 */
342 mutex_enter(&cpu_lock);
343 unregister_cpu_setup_func(ib_cpu_setup, NULL);
344 mutex_exit(&cpu_lock);
345
346 list_destroy(&ib_exclude_list);
347 list_destroy(&ib_delta_list);
348 list_destroy(&ib_cpu_list);
349
350 }
351
352 /*
353 * Register a new CPU in the global list of CPUs.
354 */
355 static void
356 ib_cpu_register(processorid_t cpu_id)
357 {
358 cpu_t *cp = cpu[cpu_id];
359 ib_cpu_t *new_cpu;
360
361 /*
362 * Is this CPU baned from interrupt handling?
363 */
364 if (ib_cpu_exclude(cpu_id))
365 return;
366
367 new_cpu = ib_cpu_create();
368 new_cpu->ic_cpu_id = cpu_id;
369
370 /* Initialize list for interrupt vectors */
371 list_create(&new_cpu->ic_ivec_list, sizeof (ib_ivec_t),
372 offsetof(ib_ivec_t, ii_next));
373
374 list_link_init(&new_cpu->ic_next);
375
376 /* Check if this CPU can handle interrupts */
377 mutex_enter(&cpu_lock);
378 if (cpu_is_nointr(cp))
379 new_cpu->ic_offline = B_TRUE;
380 mutex_exit(&cpu_lock);
381
382 /* Add CPU to list of CPUs */
383 list_insert_tail(&ib_cpu_list, new_cpu);
384
385 ib_num_cpus++;
386
387 IB_IMPLDBG((CE_CONT, "ib_cpu_register: cpu=0x%x", cpu_id));
388 }
389
390 /*
391 * Unregister CPU from the global list of CPUs.
392 */
393 static void
394 ib_cpu_unregister(processorid_t cpu_id)
395 {
396 ib_cpu_t *icpu;
397
398 mutex_enter(&ib_lock);
399 FOREACH_CPU(icpu, ib_cpu_list) {
400 if (icpu->ic_cpu_id == cpu_id) {
401 /* Remove CPU from global list */
402 list_remove(&ib_cpu_list, icpu);
403
404 /* Free CPU structure */
405 ib_cpu_destroy(icpu);
406
407 /* XXX or just offline CPU; statistics? */
408 break;
409 }
410 }
411 mutex_exit(&ib_lock);
412
413 ib_num_cpus--;
414
415 IB_IMPLDBG((CE_CONT, "ib_cpu_unregister: cpu=0x%x",
416 cpu_id));
417 }
418
419 /*
420 * Hook for CPU changes.
421 */
422 static int
423 ib_cpu_setup(cpu_setup_t what, int cpu_id, void *arg)
424 {
425
426 switch (what) {
427 case CPU_UNCONFIG:
428 case CPU_CPUPART_OUT:
429 case CPU_OFF:
430 ib_cpu_unregister(cpu_id);
431 cv_signal(&ib_cv);
432 break;
433
434 case CPU_INTR_ON:
435 ib_cpu_register(cpu_id);
436 cv_signal(&ib_cv);
437 break;
438
439 default:
440 break;
441 }
442
443 return (0);
444 }
445
446 static ib_cpu_t *
447 ib_cpu_create(void)
448 {
449 ib_cpu_t *new_cpu;
450
451 new_cpu = kmem_alloc(sizeof (ib_cpu_t), KM_SLEEP);
452 new_cpu->ic_offline = B_FALSE;
453
454 return (new_cpu);
455 }
456
457 static void
458 ib_cpu_destroy(ib_cpu_t *old_cpu)
459 {
460 ib_ivec_t *ivec;
461
462 FOREACH_IVEC(ivec, old_cpu->ic_ivec_list) {
463 kmem_free(ivec, sizeof (ib_ivec_t));
464 }
465
466 kmem_free(old_cpu, sizeof (ib_cpu_t));
467 }
468
/*
 * Find a CPU in the global list of CPUs by processor id.
 * Returns NULL when the id is not on the list.
 *
 * NOTE(review): cpu_list is passed by value, so FOREACH_CPU takes the
 * address of this local copy as the list head.  With the kernel list
 * implementation the iteration sentinel is the list structure's own
 * address, so iterating a *copied* list_t is suspect -- consider
 * changing the interface to take a list_t * (all callers pass
 * file-local lists).
 */
static ib_cpu_t *
ib_cpu_find(list_t cpu_list, processorid_t cpu_id)
{
	ib_cpu_t *icpu;

	IB_APIDBG((CE_CONT, "ib_cpu_find: API cpu = %d", cpu_id));

	FOREACH_CPU(icpu, cpu_list) {
		if (icpu->ic_cpu_id == cpu_id)
			return (icpu);
	}

	return (NULL);
}
486
487 /*
488 * Find a interrupt vector for a specific CPU.
489 */
490 static ib_ivec_t *
491 ib_cpu_find_ivec(list_t cpu_list, processorid_t cpu_id, char *buspath,
492 uint64_t ino)
493 {
494 ib_cpu_t *icpu;
495 ib_ivec_t *ivec;
496
497 icpu = ib_cpu_find(cpu_list, cpu_id);
498 if (icpu == NULL)
499 return (NULL);
500
501 for (ivec = list_head(&icpu->ic_ivec_list); ivec != NULL;
502 ivec = list_next(&icpu->ic_ivec_list, ivec)) {
503 if (ivec->ii_ino == ino)
504 return (ivec);
505 }
506
507 return (NULL);
508 }
509
510 /*
511 * Search exclude lists.
512 */
513 static boolean_t
514 ib_cpu_exclude(processorid_t cpu_id)
515 {
516 ib_exclude_list_t *excluded_cpu;
517
518 /*
519 * Search global list of CPUs excluded from interrupt handling.
520 */
521 for (excluded_cpu = list_head(&ib_exclude_list); excluded_cpu != NULL;
522 excluded_cpu = list_next(&ib_exclude_list, excluded_cpu)) {
523 if (excluded_cpu->ix_cpu_id == cpu_id)
524 return (B_TRUE);
525 }
526
527 return (B_FALSE);
528 }
529
530 /*
531 * Total times spend.
532 */
533 static void
534 ib_cpu_statistics(ib_cpu_t *icpu)
535 {
536 cpu_t *cp;
537 hrtime_t msnsecs[NCMSTATES];
538 hrtime_t new_tot;
539
540 cp = cpu[icpu->ic_cpu_id];
541 get_cpu_mstate(cp, msnsecs);
542
543 icpu->ic_tot = msnsecs[CMS_IDLE] + msnsecs[CMS_USER] +
544 msnsecs[CMS_SYSTEM];
545
546 }
547
548 /*
549 * Create a new interrupt vector.
550 */
551 static ib_ivec_t *
552 ib_ivec_create(const char *buspath, uint64_t ino)
553 {
554 ib_ivec_t *ivec;
555
556 ivec = (ib_ivec_t *)kmem_alloc(sizeof (ib_ivec_t), KM_SLEEP);
557
558 list_link_init(&ivec->ii_next);
559
560 ivec->ii_buspath = (char *)buspath; /* XXX: strdup */
561 ivec->ii_ino = ino;
562 ivec->ii_ihs = 1;
563
564 return (ivec);
565 }
566
/*
 * Placeholder: intended to register the interrupt vectors of "icpu";
 * not implemented yet (body intentionally empty).
 */
static void
ib_ivec_register(ib_cpu_t *icpu)
{
}
571
572 /*
573 * Find interrupt vector by ino.
574 */
575 static ib_ivec_t *
576 ib_ivec_find_ino(list_t ivec_list, uint64_t ino)
577 {
578 ib_ivec_t *ivec;
579
580 FOREACH_IVEC(ivec, ivec_list) {
581 if (ivec->ii_inum == ino)
582 return (ivec);
583 }
584
585 return (NULL);
586 }
587
/*
 * Delete a interrupt vector from a list.
 *
 * NOTE(review): stub -- the matching vector is found but never removed
 * or freed (see the XXX).  Also ivec_list is passed by value, so a
 * removal would operate on a local copy of the list head; the
 * interface likely needs to take a list_t * before this can be
 * implemented.  Callers (ib_interrupt_move()) currently rely on the
 * vector staying on the list.
 */
static void
ib_ivec_delete_ino(list_t ivec_list, uint64_t ino)
{
	ib_ivec_t *ivec;

	FOREACH_IVEC(ivec, ivec_list) {
		if (ivec->ii_inum == ino) {
			/* XXX: remove from list */
			;
		}
	}
}
603
/*
 * Add a new interrupt vector to a list.
 *
 * NOTE(review): ivec_list is passed by value; list_insert_tail()
 * therefore links the node through this local copy of the list head,
 * which is questionable with the kernel list implementation.
 * Consider passing list_t * instead (interface change; verify all
 * callers).
 */
static void
ib_ivec_add_ino(list_t ivec_list, ib_ivec_t *ivec)
{
	list_insert_tail(&ivec_list, ivec);
}
612
613 static ib_msi_t *
614 ib_msi_create(const char *name)
615 {
616 ib_msi_t *msi;
617
618 msi = (ib_msi_t *)kmem_alloc(sizeof (ib_msi_t), KM_SLEEP);
619
620 msi->im_name = name;
621
622 list_link_init(&msi->im_next);
623 list_create(&msi->im_ino_list, sizeof (ib_msi_ino_t),
624 offsetof(ib_msi_ino_t, imi_next));
625
626 return (msi);
627 }
628
629 /*
630 * Allocate and initialize a new snapshot structure.
631 */
632 static ib_snapshot_t *
633 ib_snapshot_create(void)
634 {
635 ib_snapshot_t *snapshot;
636
637 snapshot = kmem_alloc(sizeof (ib_snapshot_t), KM_SLEEP);
638
639 /* init link */
640
641 /* Initialize CPU list */
642 list_create(&snapshot->is_cpu_list, sizeof (ib_cpu_t),
643 offsetof(ib_cpu_t, ic_next));
644
645 snapshot->is_num_cpus = 0;
646
647 return (snapshot);
648 }
649
650 /*
651 * Destroy a snapshot.
652 */
653 static void
654 ib_snapshot_destroy(ib_snapshot_t *snapshot)
655 {
656 ib_cpu_t *icpu;
657
658 FOREACH_CPU(icpu, snapshot->is_cpu_list) {
659 ib_cpu_destroy(icpu);
660 }
661
662 kmem_free(snapshot, sizeof (ib_snapshot_t));
663 }
664
665 static ib_ivec_t *
666 ib_irq_fill_ivec(kstat_t *ksp)
667 {
668 kstat_named_t *knp;
669 ib_ivec_t *ivec;
670 char *datap;
671 uint64_t time;
672 int i;
673
674 datap = ksp->ks_data;
675 knp = KSTAT_NAMED_PTR(ksp);
676 for (i = 0; i < ksp->ks_ndata; i++, knp++) {
677 IB_IMPLDBG((CE_CONT, "ib_irq_fill_ivec: %s",
678 knp->name));
679
680 if (strcmp(knp->name, "time") == 0) {
681 cmn_err(CE_CONT, "XXX ib time");
682 time = knp->value.ui64;
683 }
684
685 knp += sizeof (kstat_named_t);
686 datap += sizeof (kstat_named_t);
687 }
688
689 /* Allocate a new interrupt vector */
690 ivec = ib_ivec_create("", 0);
691 ivec->ii_time = time;
692
693 return (ivec);
694 }
695
696 /*
697 * XXX: icpu not needed, move out of loop
698 */
699 static void
700 ib_irq_statistics(ib_cpu_t *icpu)
701 {
702 kstat_t *ksp;
703 int instance = 1;
704
705 /*
706 * Read pci interrupts.
707 */
708 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES);
709 while (ksp != NULL) {
710 KSTAT_ENTER(ksp);
711
712 if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
713 ib_cpu_t *icpu;
714 ib_ivec_t *ivec;
715 kstat_named_t *knp;
716 kstat_named_t *datap;
717 uint64_t ino;
718 char *buspath;
719 char *namep;
720 processorid_t cpu_id;
721 int i;
722 boolean_t is_enabled = B_TRUE;
723
724 (void) KSTAT_UPDATE(ksp, KSTAT_READ);
725
726 /*
727 * Find the CPU this interrupt vector is on and
728 * if the vector itself is enabled.
729 */
730 datap = ksp->ks_data;
731 namep = KSTAT_NAMED_PTR(ksp)->name;
732 for (i = 0; i < ksp->ks_ndata; i++) {
733 if (strcmp(namep, "cpu") == 0) {
734 cpu_id = datap->value.ui64;
735 } else if (strcmp(namep, "type") == 0) {
736 if (strcmp(datap->value.c, "disabled") == 0) {
737 is_enabled = B_FALSE;
738 break;
739 }
740 }
741
742 namep += sizeof (kstat_named_t);
743 datap += sizeof (kstat_named_t);
744 }
745
746 /*
747 * Skip this interrupt vector if its disabled.
748 */
749 if (!is_enabled)
750 continue;
751
752 /*
753 * Check if CPU is online.
754 */
755 icpu = ib_cpu_find(ib_cpu_list, cpu_id);
756 if (icpu == NULL || icpu->ic_offline)
757 continue;
758
759 /*
760 * Fill information.
761 */
762 ivec = ib_irq_fill_ivec(ksp);
763 if (ivec == NULL)
764 continue;
765
766 list_insert_tail(&icpu->ic_ivec_list, ivec);
767 }
768
769 KSTAT_EXIT(ksp);
770 kstat_rele(ksp);
771
772 instance++;
773 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES);
774 }
775 }
776
777 /*
778 * Collect data from CPUs and interrupt vectors.
779 */
780 static ib_snapshot_t *
781 ib_get_statistics(void)
782 {
783 ib_cpu_t *os_cpu;
784 ib_snapshot_t *snapshot;
785 ib_cpu_t *snapshot_cpu;
786
787 /*
788 * Nothing to balance with one CPU. XXX: right place?
789 */
790 if (ib_num_cpus <= 1) {
791 ib_sleeptime = IB_ONECPU_SLEEPTIME;
792 return (NULL);
793 }
794
795 /*
796 * Store all CPUs and ivecs here.
797 */
798 snapshot = ib_snapshot_create();
799
800 /*
801 * Loop over all active CPUs
802 */
803 FOREACH_CPU(os_cpu, ib_cpu_list) {
804
805 snapshot->is_num_cpus++;
806
807 snapshot_cpu = ib_cpu_create();
808 snapshot_cpu->ic_cpu_id = os_cpu->ic_cpu_id;
809
810 list_insert_tail(&snapshot->is_cpu_list, snapshot_cpu);
811
812 ib_cpu_statistics(snapshot_cpu);
813 ib_irq_statistics(os_cpu);
814 }
815
816 return (snapshot);
817 }
818
819 static ib_delta_t *
820 ib_delta_create(void)
821 {
822 ib_delta_t *delta;
823
824 delta = kmem_alloc(sizeof (ib_delta_t), KM_SLEEP);
825 delta->id_missing = B_FALSE;
826
827 list_create(&delta->id_cpu_list, sizeof (ib_cpu_t),
828 offsetof(ib_cpu_t, ic_next));
829
830 return (delta);
831 }
832
833 /*
834 * Generate the delta of two snapshots.
835 */
836 static ib_delta_t *
837 ib_delta_generate(ib_snapshot_t *old_snapshot, ib_snapshot_t *new_snapshot)
838 {
839 ib_cpu_t *old_cpu, *new_cpu;
840 ib_delta_t *delta;
841 int intrload = 0;
842 int intrnsec = 0;
843 processorid_t cpus = 0;
844
845 /*
846 * Allocate a new delta structure.
847 */
848 delta = ib_delta_create();
849
850 /*
851 * Number of CPUs must be the same.
852 */
853 delta->id_missing = old_snapshot->is_num_cpus !=
854 new_snapshot->is_num_cpus;
855
856 if (delta->id_missing != 0) {
857 IB_LOG((CE_CONT, "ib_delta_generate: number of CPUs changed"));
858 return (delta);
859 }
860
861 /*
862 * Loop over the CPUs in both snapshots.
863 */
864 for (new_cpu = list_head(&new_snapshot->is_cpu_list),
865 old_cpu = list_head(&old_snapshot->is_cpu_list);
866 new_cpu != NULL && old_cpu != NULL;
867 new_cpu = list_next(&new_snapshot->is_cpu_list, new_cpu),
868 old_cpu = list_next(&old_snapshot->is_cpu_list, old_cpu)) {
869 ib_cpu_t *delta_cpu;
870 ib_ivec_t *new_ivec;
871
872 /* XXX: just onlined CPU? */
873
874 /* Allocate a new CPU structure */
875 delta_cpu = ib_cpu_create();
876
877 /* Difference of total time */
878 delta_cpu->ic_tot = new_cpu->ic_tot - old_cpu->ic_tot;
879 if (!(delta_cpu->ic_tot >= 0)) {
880 delta->id_missing = B_TRUE;
881 kmem_free(delta_cpu, sizeof (ib_cpu_t));
882 return (delta);
883 }
884
885 list_insert_tail(&delta->id_cpu_list, delta_cpu);
886
887 /* Avoid division by zero */
888 if (delta_cpu->ic_tot == 0)
889 delta_cpu->ic_tot = 1;
890
891 delta_cpu->ic_intrs = 0;
892 delta_cpu->ic_big_intrs = 0;
893
894 /*
895 * Number of interrupt vectors must be the same.
896 */
897 if (old_cpu->ic_num_ivecs != new_cpu->ic_num_ivecs) {
898 IB_LOG((CE_CONT, "ib_delta_generate: cpu %d has more "
899 "or less interrupts", old_cpu->ic_cpu_id));
900 delta->id_missing = B_TRUE;
901 return (delta);
902 }
903
904 /*
905 * Loop over the interrupt vectors of the new CPU.
906 */
907 for (new_ivec = list_head(&new_cpu->ic_ivec_list);
908 new_ivec != NULL; new_ivec =
909 list_next(&new_cpu->ic_ivec_list, new_ivec)) {
910 ib_ivec_t *ivec;
911 ib_ivec_t *delta_ivec;
912 hrtime_t time;
913
914 if (new_ivec->ii_num_ino == 0)
915 continue;
916
917 /*
918 * If interrupt vector does not exists or XXX crtime
919 * is different, set missing.
920 */
921 ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list,
922 new_ivec->ii_ino);
923 if (ivec == NULL) {
924 delta->id_missing = B_TRUE;
925 return (delta);
926 }
927
928 /* Allocate a new delta interrupt vector */
929 delta_ivec = ib_ivec_create(new_ivec->ii_buspath,
930 new_ivec->ii_ino);
931
932 /*
933 * Time used by this interrupt.
934 */
935 time = new_ivec->ii_time - ivec->ii_time;
936 if (time < 0) {
937 delta->id_missing = B_TRUE;
938 kmem_free(delta_ivec, sizeof (ib_delta_t));
939 return (delta);
940 }
941
942 delta_cpu->ic_intrs += time;
943 delta_ivec->ii_time = time;
944
945 if (time > delta_cpu->ic_bigintr)
946 delta_cpu->ic_bigintr = time;
947
948 /*
949 * Fill in the rest.
950 */
951 delta_ivec->ii_ihs = new_ivec->ii_ihs;
952 delta_ivec->ii_pil = new_ivec->ii_pil;
953 delta_ivec->ii_ino = new_ivec->ii_ino;
954 delta_ivec->ii_num_ino = new_ivec->ii_num_ino;
955 /* XXX: buspath, name */
956 }
957
958 /*
959 * Rounding error
960 */
961 if (delta_cpu->ic_tot < delta_cpu->ic_intrs)
962 delta_cpu->ic_tot = delta_cpu->ic_intrs;
963
964 delta_cpu->ic_intr_load =
965 delta_cpu->ic_intrs / delta_cpu->ic_tot;
966 intrload += delta_cpu->ic_intr_load;
967 intrnsec += delta_cpu->ic_intrs;
968
969 cpus++;
970 }
971
972 if (cpus > 0) {
973 delta->id_avgintrload = intrload / cpus;
974 delta->id_avgintrnsec = intrnsec / cpus;
975 } else {
976 delta->id_avgintrload = 0;
977 delta->id_avgintrnsec = 0;
978 }
979
980 return (delta);
981 }
982
983 /*
984 * Compress deltas.
985 */
986 static ib_delta_t *
987 ib_delta_compress(list_t *deltas)
988 {
989 ib_cpu_t *icpu;
990 ib_ivec_t *ivec;
991 ib_delta_t *new_delta, *delta;
992 processorid_t cpus = 0;
993 int high_intrload = 0;
994 int intrs = 0, tot;
995
996 /* Check if empty list of deltas */
997 if (deltas == NULL || list_is_empty(deltas) != 0) {
998 IB_LOG((CE_CONT, "ib_delta_compress: deltas are empty?"));
999 return (NULL);
1000 }
1001
1002 /* Allocate a new delta structure */
1003 new_delta = ib_delta_create();
1004
1005 /*
1006 * Loop over the deltas in the list.
1007 */
1008 for (delta = list_head(deltas); delta != NULL;
1009 delta = list_next(deltas, delta)) {
1010
1011 /* Compressing bad delta? */
1012 if (delta->id_missing) {
1013 IB_LOG((CE_CONT,
1014 "ib_delta_compress: compressing bad deltas?"));
1015 return (NULL);
1016 }
1017
1018 FOREACH_CPU(icpu, delta->id_cpu_list) {
1019 ib_cpu_t *new_cpu;
1020 ib_ivec_t *new_ivec;
1021
1022 intrs += icpu->ic_intrs;
1023 tot += icpu->ic_tot;
1024 new_cpu = ib_cpu_create();
1025 new_cpu->ic_cpu_id = icpu->ic_cpu_id;
1026 new_cpu->ic_intrs = icpu->ic_intrs;
1027 new_cpu->ic_tot = icpu->ic_tot;
1028
1029 /* XXX: exists ivecs */
1030 FOREACH_IVEC(new_ivec, icpu->ic_ivec_list) {
1031 ib_ivec_t *new_delta_ivec;
1032
1033 new_delta_ivec = ib_ivec_create(
1034 new_ivec->ii_buspath, new_ivec->ii_ino);
1035
1036 }
1037 }
1038 }
1039
1040 FOREACH_CPU(icpu, new_delta->id_cpu_list) {
1041 int bigintr = 0;
1042
1043 cpus++;
1044
1045 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
1046 if (ivec->ii_time > bigintr)
1047 bigintr = ivec->ii_time;
1048 }
1049
1050 icpu->ic_bigintr = bigintr;
1051 icpu->ic_intr_load = icpu->ic_intrs / icpu->ic_tot;
1052
1053 if (high_intrload < icpu->ic_intr_load)
1054 high_intrload = icpu->ic_intr_load;
1055
1056 if (icpu->ic_tot <= 0)
1057 icpu->ic_tot = 100;
1058 }
1059
1060 if (cpus > 0) {
1061 new_delta->id_avgintrload = intrs / tot;
1062 new_delta->id_avgintrnsec = intrs / cpus;
1063 } else {
1064 new_delta->id_avgintrload = 0;
1065 new_delta->id_avgintrnsec = 0;
1066 }
1067
1068 /* XXX: global sleeptime */
1069
1070 return (new_delta);
1071 }
1072
1073 /*
1074 * Decide if the load is out of balance.
1075 */
1076 static int
1077 ib_imbalanced(int goodness, int baseline)
1078 {
1079 if (goodness > 50)
1080 return (100);
1081
1082 /* XXX: abs */
1083 if ((goodness - baseline) > goodness_mindelta)
1084 return (100);
1085
1086 return (0);
1087 }
1088
1089 /*
1090 * Calculate goodness of a CPU.
1091 */
1092 static int
1093 ib_goodness_cpu(ib_cpu_t *icpu, int avg_interrupt_load)
1094 {
1095 int goodness;
1096 int load, load_no_bigintr;
1097
1098 load = icpu->ic_intrs / icpu->ic_tot;
1099 if (load < avg_interrupt_load)
1100 return (0);
1101
1102 load_no_bigintr = (icpu->ic_intrs - icpu->ic_bigintr) / icpu->ic_tot;
1103
1104 if ((load > goodness_unsafe_load) && (icpu->ic_num_ivecs > 1))
1105 return (1);
1106
1107 goodness = load - avg_interrupt_load;
1108 if (goodness > load_no_bigintr)
1109 goodness = load_no_bigintr;
1110
1111 return (goodness);
1112 }
1113
1114 /*
1115 * Calculate goodness.
1116 */
1117 static int
1118 ib_goodness(ib_delta_t *delta)
1119 {
1120 ib_cpu_t *icpu;
1121 int goodness, high_goodness = 0;
1122
1123 if (delta->id_missing > 0)
1124 return (1);
1125
1126 FOREACH_CPU(icpu, delta->id_cpu_list) {
1127 goodness = ib_goodness_cpu(icpu, delta->id_avgintrload);
1128 if (!(goodness >= 0 && goodness <= 100)) {
1129 IB_LOG((CE_CONT,
1130 "ib_goodness: cpu goodness out of range?"));
1131 return (100);
1132 }
1133
1134 if (goodness == 100)
1135 return (100);
1136
1137 if (goodness > high_goodness)
1138 high_goodness = goodness;
1139 }
1140
1141 return (high_goodness);
1142 }
1143
/*
 * Decide whether a vector set should go "with" (stay on the source
 * CPU) or "without" (move away) relative to the load goal.
 *
 * NOTE(review): this function is unfinished and must not be called
 * yet: "load", "with" and "without" are all read before ever being
 * assigned (undefined behavior), and goals_with/goals_without are
 * declared but never initialized or used.  The XXX markers below are
 * where the recursive selection is meant to go.
 */
static void
ib_do_find_goal(list_t ivecs, list_t loads, int goal, int idx)
{
	list_t goals_with;
	list_t goals_without;
	int with, without;
	int which, load;


	if (goal <= load) {
		with = load;
	} else {
		/* XXX: do_find_goal */
		with += load;
	}

	IB_LOG((CE_CONT, "XXX"));

	if (with >= goal && without < goal) {
		which = 0;
	} else if (with < goal && without >= goal) {
		which = 1;
	} else if (with >= goal && without >= goal) {
		which = without < with;
	} else {
		which = without > with;
	}

	if (which == 1) {
		IB_LOG((CE_CONT, "ib_do_find_goal: going without"));
		/* XXX */
	} else {
		IB_LOG((CE_CONT, "ib_do_find_goal: going with"));
		/* XXX */
	}
}
1180
1181 typedef struct _ib_goal {
1182 list_node_t *ig_link;
1183 int ig_value;
1184 } ib_goal_t;
1185
1186 typedef struct _ib_goal_load {
1187 list_node_t *igl_link;
1188 int igl_value;
1189 } ib_goal_load_t;
1190
1191 static void
1192 ib_find_goal(list_t ivecs, int goal)
1193 {
1194 ib_ivec_t *ivec;
1195 list_t goals;
1196 int load;
1197
1198 if (goal <= 0) {
1199 list_create(&goals, sizeof (ib_goal_t),
1200 offsetof (ib_goal_t, ig_link));
1201 } else {
1202 list_t loads;
1203 hrtime_t tot = 0;
1204
1205 IB_LOG((CE_CONT, "ib_find_goal: finding goal from intrs XXX"));
1206
1207 FOREACH_IVEC(ivec, ivecs) {
1208 tot += ivec->ii_time;
1209 }
1210
1211 list_create(&loads, sizeof (ib_goal_load_t),
1212 offsetof (ib_goal_load_t, igl_link));
1213
1214 FOREACH_IVEC(ivec, ivecs) {
1215 ib_goal_load_t *igl = kmem_alloc(sizeof (ib_goal_load_t), KM_SLEEP);
1216
1217 igl->igl_value = tot;
1218 list_insert_tail(&loads, igl);
1219
1220 tot -= ivec->ii_time;
1221 }
1222 }
1223 }
1224
1225 static void
1226 ib_do_reconfig_cpu2cpu(ib_delta_t *delta, processorid_t src_cpuid,
1227 processorid_t tgt_cpuid, int src_load)
1228 {
1229 ib_cpu_t *src_cpu, *tgt_cpu;
1230 ib_ivec_t *ivec;
1231 list_t ivecs;
1232 int goal, new_load;
1233 int avg_nsec;
1234
1235 if (delta == NULL)
1236 return;
1237
1238 goal = delta->id_avgintrnsec;
1239
1240 src_cpu = ib_cpu_find(delta->id_cpu_list, src_cpuid);
1241 if (src_cpu == NULL)
1242 return;
1243
1244 tgt_cpu = ib_cpu_find(delta->id_cpu_list, tgt_cpuid);
1245 if (tgt_cpu == NULL)
1246 return;
1247
1248 avg_nsec = (src_cpu->ic_intrs + tgt_cpu->ic_intrs) / 2;
1249 if (goal < avg_nsec)
1250 goal = avg_nsec;
1251
1252
1253 /*
1254 * Sort interrupt vectors by time.
1255 */
1256 list_create(&ivecs, sizeof (ib_ivec_t),
1257 offsetof (ib_ivec_t, ii_next));
1258
1259 ivec = list_head(&ivecs);
1260 if (ivec->ii_orig_cpu == src_cpuid) {
1261 IB_LOG((CE_CONT, "Keeping XXX on %d",
1262 src_cpuid)); /* ivec->ii_inum, */
1263 goal -= ivec->ii_time;
1264 /* XXX: shift */
1265 }
1266
1267 IB_LOG((CE_CONT, "ib_reconfig_cpu2cpu: inums should total %d", goal));
1268
1269 ib_find_goal(ivecs, goal);
1270
1271 FOREACH_IVEC(ivec, ivecs) {
1272 if (!(ivec->ii_now_cpu == src_cpuid ||
1273 ivec->ii_now_cpu == tgt_cpuid)) {
1274 IB_LOG((CE_CONT, "ib_do_reconfig_cpu2cpu: "));
1275 }
1276
1277 if (ivec->ii_goal && ivec->ii_now_cpu != src_cpuid) {
1278 ib_interrupt_do_move(ivec, src_cpuid);
1279 } else if (ivec->ii_goal == B_FALSE &&
1280 ivec->ii_now_cpu != tgt_cpuid) {
1281 ib_interrupt_do_move(ivec, tgt_cpuid);
1282 }
1283 }
1284
1285 ib_interrupt_move_check(delta, src_cpuid, tgt_cpuid);
1286
1287 new_load = src_cpu->ic_intrs / src_cpu->ic_tot;
1288
1289 if (!(new_load <= src_load && new_load > delta->id_avgintrload)) {
1290 IB_LOG((CE_CONT, "ib_reconfig_cpu2cpu: %d", new_load));
1291 }
1292 }
1293
/*
 * Rebalance the interrupt load of one CPU against the sorted CPU list.
 *
 * NOTE(review): unfinished stub -- it validates the arguments and
 * reads the average load, but performs no moves yet; avgintrload is
 * computed and then unused, and cpu_sorted_list is never touched.
 */
static void
ib_do_reconfig_cpu(ib_delta_t *delta, list_t *cpu_sorted_list,
    processorid_t old_cpu_id)
{
	ib_cpu_t *icpu;
	int avgintrload;

	if (delta == NULL)
		return;

	icpu = ib_cpu_find(delta->id_cpu_list, old_cpu_id);
	if (icpu == NULL)
		return;

	avgintrload = delta->id_avgintrload;

}
1311
1312 /*
1313 * Reconfigure interrupt distribution among CPUs.
1314 */
1315 static int
1316 ib_do_reconfig(ib_delta_t *delta)
1317 {
1318 ib_cpu_t *icpu;
1319 ib_ivec_t *ivec;
1320 list_t cpu_sorted_list;
1321 int goodness, new_goodness;
1322 int warned = 0;
1323 int rval = 1, ret = 1;
1324
1325 if (delta == NULL)
1326 return (-1);
1327
1328 goodness = delta->id_goodness;
1329 if (goodness < goodness_mindelta) {
1330 IB_LOG((CE_CONT, "ib_do_reconfig: goodness is good enough"));
1331 return (0);
1332 }
1333
1334 IB_LOG((CE_CONT, "ib_do_reconfig: optimizing interrupt assignments"));
1335
1336 if (delta->id_missing != 0) {
1337 IB_LOG((CE_CONT, "ib_do_reconfig: aborted"));
1338 return (-1);
1339 }
1340
1341 FOREACH_CPU(icpu, delta->id_cpu_list) {
1342 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
1343 ivec->ii_orig_cpu = icpu->ic_cpu_id;
1344 ivec->ii_now_cpu = icpu->ic_cpu_id;
1345 /* XXX: inum */
1346 }
1347 }
1348
1349 list_create(&cpu_sorted_list, sizeof (ib_cpu_t),
1350 offsetof(ib_cpu_t, ic_next));
1351
1352 /*
1353 * Have we an improvement?
1354 */
1355 new_goodness = ib_goodness(delta);
1356 if (!(new_goodness <= goodness)) {
1357 IB_LOG((CE_CONT,
1358 "ib_do_reconfig: result has worse goodness"));
1359 }
1360
1361 if ((goodness != 100 || new_goodness == 100) &&
1362 goodness - new_goodness < goodness_mindelta) {
1363 IB_LOG((CE_CONT,
1364 "ib_do_reconfig: goodness already near optimum"));
1365 return (0);
1366 }
1367
1368 /*
1369 * Move interrupts.
1370 */
1371 FOREACH_CPU(icpu, delta->id_cpu_list) {
1372 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
1373 int error;
1374
1375 if (ivec->ii_orig_cpu == icpu->ic_cpu_id)
1376 continue;
1377
1378 error = ib_interrupt_do_move(ivec, icpu->ic_cpu_id);
1379 if (error != 0) {
1380 if (warned++ == 0) {
1381 IB_LOG((CE_CONT, "ib_do_reconfig: "
1382 "unable to move interrupt"));
1383 }
1384
1385 IB_LOG((CE_CONT, "ib_do_reconfig: "
1386 "unable to move buspath"));
1387
1388 ret = -1;
1389 }
1390 }
1391 }
1392
1393 return (rval);
1394 }
1395
1396
1397 /*
1398 * Check if the interrupt load did decrease.
1399 */
1400 static void
1401 ib_interrupt_move_check(ib_delta_t *delta, processorid_t old_cpuid,
1402 processorid_t new_cpuid)
1403 {
1404 ib_cpu_t *old_cpu, *new_cpu;
1405
1406 /*
1407 * Check old CPU.
1408 */
1409 old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid);
1410 if (old_cpu == NULL)
1411 return;
1412 if (!(old_cpu->ic_tot >= old_cpu->ic_intrs)) {
1413 IB_LOG((CE_CONT,
1414 "Moved interrupts left 100+%% load on source CPU"));
1415 }
1416
1417 /*
1418 * Check new CPU.
1419 */
1420 new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid);
1421 if (new_cpu == NULL)
1422 return;
1423 if (!(new_cpu->ic_tot >= new_cpu->ic_intrs)) {
1424 IB_LOG((CE_CONT,
1425 "Moved interrupts left 100+%% load on target CPU"));
1426 }
1427 }
1428
/*
 * Actually reprogram the hardware so that "ivec" is delivered to
 * "cpu_id".
 *
 * NOTE(review): stub.  The PSM call that would retarget the interrupt
 * (PSM_INTR_OP_SET_CPU) is commented out, so this unconditionally
 * reports failure (-1); callers currently treat every move as failed.
 */
static int
ib_interrupt_do_move(ib_ivec_t *ivec, processorid_t cpu_id)
{
	int ret, result;

	struct psm_ops *pops;

	//pops = mach_set[0];

	// ret = (*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_SET_CPU,
	//	&result);

	return (-1);
}
1446
1447 /*
1448 * Move an interrupt to a different CPU.
1449 */
1450 static int
1451 ib_interrupt_move(ib_delta_t *delta, uint64_t inum, processorid_t old_cpuid,
1452 processorid_t new_cpuid)
1453 {
1454 ib_cpu_t *old_cpu, *new_cpu;
1455 ib_ivec_t *ivec;
1456
1457 if (delta == NULL)
1458 return (-1);
1459
1460 /*
1461 * Remove interrupt vector from old CPU.
1462 */
1463 old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid);
1464 if (old_cpu == NULL)
1465 return (-1);
1466
1467 ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list, inum);
1468
1469 old_cpu->ic_intrs -= ivec->ii_time;
1470 old_cpu->ic_intr_load = old_cpu->ic_intrs / old_cpu->ic_tot;
1471 ib_ivec_delete_ino(old_cpu->ic_ivec_list, inum);
1472
1473 /*
1474 * Verify interrupts.
1475 */
1476 if (!(old_cpu->ic_intrs >= 0)) {
1477 IB_LOG((CE_CONT,
1478 "ib_interrupt_move: interrupt time > total time?"));
1479 }
1480
1481 if (!(ivec->ii_time <= old_cpu->ic_bigintr)) {
1482 IB_LOG((CE_CONT,
1483 "ib_interrupt_move: interrupt time > big interrupt?"));
1484 }
1485
1486 if (ivec->ii_time >= old_cpu->ic_bigintr) {
1487 ib_ivec_t *time_ivec;
1488 uint64_t bigtime = 0;
1489
1490 FOREACH_IVEC(time_ivec, old_cpu->ic_ivec_list) {
1491 if (time_ivec->ii_time > bigtime)
1492 bigtime = time_ivec->ii_time;
1493 }
1494 }
1495
1496 /*
1497 * Insert interrupt vector into new CPU.
1498 */
1499 new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid);
1500 if (new_cpu == NULL)
1501 return (-1);
1502
1503 ivec->ii_now_cpu = new_cpuid;
1504 new_cpu->ic_intrs += ivec->ii_time;
1505 new_cpu->ic_intr_load = new_cpu->ic_intrs / new_cpu->ic_tot;
1506 ib_ivec_add_ino(new_cpu->ic_ivec_list, ivec);
1507
1508 if (ivec->ii_time > new_cpu->ic_bigintr)
1509 new_cpu->ic_bigintr = ivec->ii_time;
1510
1511 return (0);
1512 }