1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2013 David Hoeppner. All rights reserved.
14 */
15
16 /*
17 * Interrupt Load Balancer.
18 *
19 * The original balance functions views all CPU as equally.
20 */
21
/*
 * XXX Debugging note (mdb):
 *
 *	ib_cpu_list::walk list | ::print ib_cpu_t
 */
29 #include <sys/param.h>
30 #include <sys/types.h>
31 #include <sys/systm.h>
32 #include <sys/callb.h>
33 #include <sys/cpuvar.h>
34 #include <sys/proc.h>
35 #include <sys/processor.h>
36 #include <sys/sdt.h>
37 #include <sys/sysmacros.h>
38 #include <sys/time.h>
39 #include <sys/cmn_err.h>
40 #include <sys/zone.h>
41 #include <sys/lgrp.h>
42
43 #include <sys/pci_tools.h>
44
45 extern proc_t *proc_intrd;
46
47 #define IB_NAME "intrd"
48
49 #define IS_CPU(cpu_id) (cpu[cpu_id] != NULL)
50
51 #define IB_NORMAL_SLEEPTIME 10
52 #define IB_IDLE_SLEEPTIME 45
53 #define IB_ONECPU_SLEEPTIME (60 * 15)
54
55 #define IB_NUM_SAMPLES 6
56
57
58 static kmutex_t ib_lock;
59 static kcondvar_t ib_cv;
60
61 /*
62 * Interrupt CPU instance.
63 */
64 typedef struct _ib_cpu {
65 list_node_t ic_next;
66 boolean_t ic_offline;
67
68 hrtime_t ic_tot;
69 list_t ic_ivec_list;
70 uint32_t ic_num_ivecs;
71 processorid_t ic_cpu_id; /* XXX duplicate */
72 int64_t ic_intrs;
73 int64_t ic_big_intrs;
74 int64_t ic_bigintr; /* XXX bitintrs */
75
76 int ic_intr_load; /* intrs / tot */
77 } ib_cpu_t;
78
79 /*
80 * Interrupt vector instance.
81 */
82 typedef struct _ib_ivec {
83 list_node_t ii_next; /* link */
84
85 uint64_t ii_ihs;
86 uint64_t ii_ino;
87 uint64_t ii_num_ino;
88 uint64_t ii_pil;
89 uint64_t ii_time;
90 char *ii_buspath;
91 char *ii_name;
92
93 processorid_t ii_orig_cpu;
94 processorid_t ii_now_cpu;
95 uint64_t ii_inum;
96 } ib_ivec_t;
97
98 /*
99 * MSI
100 */
101 typedef struct _ib_msi {
102 list_node_t im_next; /* link */
103 const char *im_name;
104 list_t im_ino_list;
105 } ib_msi_t;
106
107 typedef struct _ib_msi_ino {
108 list_node_t imi_next; /* link */
109 uint64_t imi_ino;
110 ib_ivec_t *imi_ivec;
111 } ib_msi_ino_t;
112
113 /*
114 * Snapshot
115 */
116 typedef struct _ib_snapshot {
117 list_node_t is_next; /* link */
118 list_t is_cpu_list;
119 processorid_t is_num_cpus;
120 } ib_snapshot_t;
121
122 /*
123 * Snapshot delta structure.
124 */
125 typedef struct _ib_delta {
126 list_node_t id_next; /* link */
127 list_t id_cpu_list;
128 boolean_t id_missing;
129 int id_avgintrload; /* interrupts / total time */
130 uint64_t id_avgintrnsec;
131 int id_goodness;
132 } ib_delta_t;
133
134
135 static list_t ib_cpu_list; /* List of all CPU's */
136
137 static uint8_t ib_cs = 0; /* Index of current sample */
138 static long ib_sleeptime = IB_NORMAL_SLEEPTIME;
139 static processorid_t ib_num_cpus;
140
141 static int goodness_unsafe_load = 90;
142 static int goodness_mindelta = 10;
143
144 /*
145 * Function prototypes.
146 */
147 static void ib_cpu_register(processorid_t);
148 static int ib_cpu_setup(cpu_setup_t, int, void *);
149 static int ib_goodness(ib_delta_t *);
150 static int ib_do_reconfig(ib_delta_t *);
151 static int ib_imbalanced(int, int);
152 static int ib_interrupt_do_move(ib_ivec_t *, processorid_t);
153
154 static ib_snapshot_t *ib_get_statistics(void);
155 static ib_delta_t *ib_delta_generate(ib_snapshot_t *, ib_snapshot_t *);
156
157 /*
158 * Helper macros.
159 */
160 #define FOREACH_CPU(icpu, icpu_list) \
161 for (icpu = list_head(&icpu_list); icpu != NULL; \
162 icpu = list_next(&icpu_list, icpu))
163
164 #define FOREACH_IVEC(ivec, ivec_list) \
165 for (ivec = list_head(&ivec_list); ivec != NULL; \
166 ivec = list_next(&ivec_list, ivec))
167
168 #define DTRACE_INTRD(name) \
169 DTRACE_PROBE(__intrd_##name)
170
171 #define DEBUG 1
172 #ifdef DEBUG
173 #define IB_APIDBG(args) cmn_err args
174 #define IB_IMPLDBG(args) cmn_err args
175 #else
176 #define IB_APIDBG(args)
177 #define IB_IMPLDBG(args)
178 #endif
179
180 #define IB_LOG(args) cmn_err args
181
182 void
183 interrupt_balancer(void)
184 {
185 processorid_t cpu_id;
186 callb_cpr_t cpr;
187 user_t *u = PTOU(curproc);
188 int error;
189
190 boolean_t do_reconfig = B_FALSE;
191 int goodness;
192 int baseline_goodness = 0;
193 list_t ib_delta_list;
194 hrtime_t statslen = 60;
195
196 proc_intrd = ttoproc(curthread);
197 proc_intrd->p_cstime = proc_intrd->p_stime = 0;
198 proc_intrd->p_cutime = proc_intrd->p_utime = 0;
199
200 (void) strncpy(u->u_psargs, IB_NAME, sizeof(u->u_psargs));
201 (void) strncpy(u->u_comm, IB_NAME, sizeof(u->u_comm));
202
203 /* Initialize global mutex lock */
204 mutex_init(&ib_lock, NULL, MUTEX_DEFAULT, NULL);
205
206 /* Initialize CPU list */
207 list_create(&ib_cpu_list, sizeof (ib_cpu_t),
208 offsetof(ib_cpu_t, ic_next));
209
210 /* Initialize delta list */
211 list_create(&ib_delta_list, sizeof (ib_delta_t),
212 offsetof(ib_delta_t, id_next));
213
214 /*
215 * Build a list of all CPUs available for interrupt handling.
216 */
217 for (cpu_id = 0; cpu_id <= max_cpu_seqid_ever; cpu_id++) {
218 if (IS_CPU(cpu_id))
219 ib_cpu_register(cpu_id);
220 }
221
222 /*
223 * Locality group information.
224 */
225 int i;
226 for (i = 0; i < lgrp_plat_max_lgrps(); i++) {
227 lgrp_t *lgrp;
228
229 lgrp = lgrp_table[i];
230 }
231
232 /*
233 * Register a callback if a CPU goes offline or comes online.
234 */
235 mutex_enter(&cpu_lock);
236 register_cpu_setup_func(ib_cpu_setup, NULL);
237 mutex_exit(&cpu_lock);
238
239 CALLB_CPR_INIT(&cpr, &ib_lock, callb_generic_cpr, IB_NAME);
240
241 ib_snapshot_t *snapshot = NULL;
242 ib_snapshot_t *new_snapshot = NULL;
243 hrtime_t delta_time;
244 hrtime_t deltas_tottime = 0;
245 boolean_t below_statslen;
246
247 snapshot = ib_get_statistics();
248
249 mutex_enter(&ib_lock);
250 for (;;) {
251 ib_delta_t *delta;
252
253 DTRACE_INTRD(get_stats);
254 new_snapshot = ib_get_statistics();
255
256 delta = ib_delta_generate(snapshot, new_snapshot);
257
258 below_statslen = (deltas_tottime < statslen);
259 deltas_tottime += delta_time;
260 do_reconfig = (below_statslen && deltas_tottime >= statslen);
261
262 list_insert_tail(&ib_delta_list, delta);
263
264 /*
265 * Calculate the goodness of the current configuration.
266 */
267 goodness = ib_goodness(delta);
268
269 if (ib_imbalanced(goodness, baseline_goodness))
270 do_reconfig = B_TRUE;
271
272 /*
273 * Reconfigure interrupt distribution.
274 */
275 if (do_reconfig) {
276 error = ib_do_reconfig(delta);
277
278 if (error != 0) {
279 if (error == -1)
280 IB_LOG((CE_CONT, "ib_do_reconfig failed!"));
281 } else {
282 IB_LOG((CE_CONT, "setting new baseline of %d", goodness));
283 baseline_goodness = goodness;
284 }
285 }
286
287 /*
288 * Wait for timeout or CPU reconfiguration.
289 */
290 CALLB_CPR_SAFE_BEGIN(&cpr);
291 cv_timedwait(&ib_cv, &ib_lock, ddi_get_lbolt() +
292 SEC_TO_TICK(ib_sleeptime));
293 CALLB_CPR_SAFE_END(&cpr, &ib_lock);
294 }
295
296 CALLB_CPR_EXIT(&cpr);
297
298 /*
299 * Unregister CPU callback.
300 */
301 mutex_enter(&cpu_lock);
302 unregister_cpu_setup_func(ib_cpu_setup, NULL);
303 mutex_exit(&cpu_lock);
304 }
305
306 /*
307 * Register a new CPU in the global list of CPUs.
308 */
309 static void
310 ib_cpu_register(processorid_t cpu_id)
311 {
312 cpu_t *cp = cpu[cpu_id];
313 ib_cpu_t *new_cpu;
314
315 new_cpu = kmem_alloc(sizeof (ib_cpu_t), KM_SLEEP);
316 new_cpu->ic_cpu_id = cpu_id;
317
318 /* Initialize list for interrupt vectors */
319 list_create(&new_cpu->ic_ivec_list, sizeof (ib_ivec_t),
320 offsetof(ib_ivec_t, ii_next));
321
322 list_link_init(&new_cpu->ic_next);
323
324 /* Check if this CPU can handle interrupts */
325 mutex_enter(&cpu_lock);
326 if (cpu_is_nointr(cp))
327 new_cpu->ic_offline = B_TRUE;
328 else
329 new_cpu->ic_offline = B_FALSE;
330 mutex_exit(&cpu_lock);
331
332 /* Add CPU to list of CPUs */
333 list_insert_tail(&ib_cpu_list, new_cpu);
334
335 ib_num_cpus++;
336
337 IB_IMPLDBG((CE_CONT, "ib_cpu_register: cpu=0x%x", cpu_id));
338 }
339
340 /*
341 * Unregister CPU from the global list of CPUs.
342 */
343 static void
344 ib_cpu_unregister(processorid_t cpu_id)
345 {
346 ib_cpu_t *icpu;
347
348 mutex_enter(&ib_lock);
349 FOREACH_CPU(icpu, ib_cpu_list) {
350 if (icpu->ic_cpu_id == cpu_id) {
351 list_remove(&ib_cpu_list, icpu);
352 /* XXX or just offline CPU; statistics? */
353 break;
354 }
355 }
356 mutex_exit(&ib_lock);
357
358 ib_num_cpus--;
359
360 IB_IMPLDBG((CE_CONT, "ib_cpu_unregister: cpu=0x%x",
361 cpu_id));
362 }
363
364 /*
365 * Hook for CPU changes.
366 */
367 static int
368 ib_cpu_setup(cpu_setup_t what, int cpu_id, void *arg)
369 {
370
371 switch (what) {
372 /* XXX */
373 case CPU_OFF:
374 ib_cpu_unregister(cpu_id);
375 cv_signal(&ib_cv);
376 break;
377
378 case CPU_INTR_ON:
379 ib_cpu_register(cpu_id);
380 cv_signal(&ib_cv);
381 break;
382
383 default:
384 break;
385 }
386
387 return (0);
388 }
389
390 static ib_cpu_t *
391 ib_cpu_create(void)
392 {
393 ib_cpu_t *icpu;
394
395 icpu = kmem_alloc(sizeof (ib_cpu_t), KM_SLEEP);
396
397 return (icpu);
398 }
399
400 /*
401 * Find a CPU in the global list of CPUs by processor id.
402 */
403 static ib_cpu_t *
404 ib_cpu_find(list_t cpu_list, processorid_t cpu_id)
405 {
406 ib_cpu_t *icpu;
407
408 IB_APIDBG((CE_CONT, "ib_cpu_find: API cpu = %d", cpu_id));
409
410 FOREACH_CPU(icpu, cpu_list) {
411 if (icpu->ic_cpu_id == cpu_id)
412 return (icpu);
413 }
414
415 return (NULL);
416 }
417
418 /*
419 * Find a interrupt vector for a specific CPU.
420 */
421 static ib_ivec_t *
422 ib_cpu_find_ivec(list_t cpu_list, processorid_t cpu_id, char *buspath, uint64_t ino)
423 {
424 ib_cpu_t *icpu;
425 ib_ivec_t *ivec;
426
427 icpu = ib_cpu_find(cpu_list, cpu_id);
428 if (icpu == NULL)
429 return (NULL);
430
431 for (ivec = list_head(&icpu->ic_ivec_list); ivec != NULL;
432 ivec = list_next(&icpu->ic_ivec_list, ivec)) {
433 if (ivec->ii_ino == ino)
434 return (ivec);
435 }
436
437 return (NULL);
438 }
439
440 /*
441 * Total times spend.
442 */
443 static void
444 ib_cpu_statistics(ib_cpu_t *icpu)
445 {
446 cpu_t *cp;
447 hrtime_t msnsecs[NCMSTATES];
448 hrtime_t new_tot;
449
450 cp = cpu[icpu->ic_cpu_id];
451 get_cpu_mstate(cp, msnsecs);
452
453 icpu->ic_tot = msnsecs[CMS_IDLE] + msnsecs[CMS_USER] +
454 msnsecs[CMS_SYSTEM];
455
456 }
457
458 /*
459 * Create a new interrupt vector.
460 */
461 static ib_ivec_t *
462 ib_ivec_create(const char *buspath, uint64_t ino)
463 {
464 ib_ivec_t *ivec;
465
466 ivec = (ib_ivec_t *)kmem_alloc(sizeof (ib_ivec_t), KM_SLEEP);
467
468 list_link_init(&ivec->ii_next);
469
470 ivec->ii_buspath = (char *)buspath; /* XXX: strdup */
471 ivec->ii_ino = ino;
472 ivec->ii_ihs = 1;
473
474 return (ivec);
475 }
476
/*
 * Stub: registering a CPU's interrupt vectors is not implemented yet.
 */
static void
intrd_ivec_register(ib_cpu_t *icpu)
{
}
481
482 /*
483 * Find interrupt vector by ino.
484 */
485 static ib_ivec_t *
486 ib_ivec_find_ino(list_t ivec_list, uint64_t ino)
487 {
488 ib_ivec_t *ivec;
489
490 FOREACH_IVEC(ivec, ivec_list) {
491 if (ivec->ii_inum == ino)
492 return (ivec);
493 }
494
495 return (NULL);
496 }
497
498 /*
499 * Delete a interrupt vector from a list.
500 */
501 static void
502 ib_ivec_delete_ino(list_t ivec_list, uint64_t ino)
503 {
504 ib_ivec_t *ivec;
505
506 FOREACH_IVEC(ivec, ivec_list) {
507 if (ivec->ii_inum == ino) {
508 /* XXX: remove from list */
509 ;
510 }
511 }
512 }
513
514 /*
515 * Add a new interrupt vector to a list.
516 */
517 static void
518 ib_ivec_add_ino(list_t ivec_list, ib_ivec_t *ivec)
519 {
520 list_insert_tail(&ivec_list, ivec);
521 }
522
523 static ib_msi_t *
524 ib_msi_create(const char *name)
525 {
526 ib_msi_t *msi;
527
528 msi = (ib_msi_t *)kmem_alloc(sizeof (ib_msi_t), KM_SLEEP);
529
530 msi->im_name = name;
531
532 list_link_init(&msi->im_next);
533 list_create(&msi->im_ino_list, sizeof (ib_msi_ino_t),
534 offsetof(ib_msi_ino_t, imi_next));
535
536 return (msi);
537 }
538
539 /*
540 * Allocate and initialize a new snapshot structure.
541 */
542 static ib_snapshot_t *
543 ib_snapshot_create(void)
544 {
545 ib_snapshot_t *snapshot;
546
547 snapshot = kmem_alloc(sizeof (ib_snapshot_t), KM_SLEEP);
548
549 /* init link */
550
551 /* Initialize CPU list */
552 list_create(&snapshot->is_cpu_list, sizeof (ib_cpu_t),
553 offsetof(ib_cpu_t, ic_next));
554
555 snapshot->is_num_cpus = 0;
556
557 return (snapshot);
558 }
559
560 static ib_ivec_t *
561 ib_irq_fill_ivec(kstat_t *ksp)
562 {
563 kstat_named_t *knp;
564 ib_ivec_t *ivec;
565 char *datap;
566 uint64_t time;
567 int i;
568
569 datap = ksp->ks_data;
570 knp = KSTAT_NAMED_PTR(ksp);
571 for (i = 0; i < ksp->ks_ndata; i++, knp++) {
572 IB_IMPLDBG((CE_CONT, "ib_irq_fill_ivec: %s",
573 knp->name));
574
575 if (strcmp(knp->name, "time") == 0) {
576 cmn_err(CE_CONT, "XXX ib time");
577 time = knp->value.ui64;
578 }
579
580 knp += sizeof (kstat_named_t);
581 datap += sizeof (kstat_named_t);
582 }
583
584 /* Allocate a new interrupt vector */
585 ivec = ib_ivec_create("", 0);
586 ivec->ii_time = time;
587
588 return (ivec);
589 }
590
591 /*
592 * XXX: icpu not needed, move out of loop
593 */
594 static void
595 ib_irq_statistics(ib_cpu_t *icpu)
596 {
597 kstat_t *ksp;
598 int instance = 1;
599
600 /*
601 * Read pci interrupts.
602 */
603 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES);
604 while (ksp != NULL) {
605 KSTAT_ENTER(ksp);
606
607 if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
608 ib_cpu_t *icpu;
609 ib_ivec_t *ivec;
610 kstat_named_t *knp;
611 kstat_named_t *datap;
612 uint64_t ino;
613 char *buspath;
614 char *namep;
615 processorid_t cpu_id;
616 int i;
617 boolean_t is_enabled = B_TRUE;
618
619 (void) KSTAT_UPDATE(ksp, KSTAT_READ);
620
621 /*
622 * Find the CPU this interrupt vector is on and
623 * if the vector itself is enabled.
624 */
625 datap = ksp->ks_data;
626 namep = KSTAT_NAMED_PTR(ksp)->name;
627 for (i = 0; i < ksp->ks_ndata; i++) {
628 if (strcmp(namep, "cpu") == 0) {
629 cpu_id = datap->value.ui64;
630 } else if (strcmp(namep, "type") == 0) {
631 if (strcmp(datap->value.c, "disabled") == 0) {
632 is_enabled = B_FALSE;
633 break;
634 }
635 }
636
637 namep += sizeof (kstat_named_t);
638 datap += sizeof (kstat_named_t);
639 }
640
641 /*
642 * Skip this interrupt vector if its disabled.
643 */
644 if (!is_enabled)
645 continue;
646
647 /*
648 * Check if CPU is online.
649 */
650 icpu = ib_cpu_find(ib_cpu_list, cpu_id);
651 if (icpu == NULL || icpu->ic_offline)
652 continue;
653
654 /*
655 * Fill information.
656 */
657 ivec = ib_irq_fill_ivec(ksp);
658 if (ivec == NULL)
659 continue;
660
661 list_insert_tail(&icpu->ic_ivec_list, ivec);
662 }
663
664 KSTAT_EXIT(ksp);
665 kstat_rele(ksp);
666
667 instance++;
668 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES);
669 }
670 }
671
672 /*
673 * Collect data from CPUs and interrupt vectors.
674 */
675 static ib_snapshot_t *
676 ib_get_statistics(void)
677 {
678 ib_cpu_t *os_cpu;
679 ib_snapshot_t *snapshot;
680 ib_cpu_t *snapshot_cpu;
681
682 /*
683 * Nothing to balance with one CPU. XXX: right place?
684 */
685 if (ib_num_cpus <= 1) {
686 ib_sleeptime = IB_ONECPU_SLEEPTIME;
687 return (NULL);
688 }
689
690 /*
691 * Store all CPUs and ivecs here.
692 */
693 snapshot = ib_snapshot_create();
694
695 /*
696 * Loop over all active CPUs
697 */
698 FOREACH_CPU(os_cpu, ib_cpu_list) {
699
700 snapshot->is_num_cpus++;
701
702 snapshot_cpu = ib_cpu_create();
703 snapshot_cpu->ic_cpu_id = os_cpu->ic_cpu_id;
704
705 list_insert_tail(&snapshot->is_cpu_list, snapshot_cpu);
706
707 ib_cpu_statistics(snapshot_cpu);
708 ib_irq_statistics(os_cpu);
709 }
710
711 return (snapshot);
712 }
713
714 static ib_delta_t *
715 ib_delta_create(void)
716 {
717 ib_delta_t *delta;
718
719 delta = kmem_alloc(sizeof (ib_delta_t), KM_SLEEP);
720 delta->id_missing = B_FALSE;
721
722 list_create(&delta->id_cpu_list, sizeof (ib_cpu_t),
723 offsetof(ib_cpu_t, ic_next));
724
725 return (delta);
726 }
727
728 /*
729 * Generate the delta of two snapshots.
730 */
731 static ib_delta_t *
732 ib_delta_generate(ib_snapshot_t *old_snapshot, ib_snapshot_t *new_snapshot)
733 {
734 ib_cpu_t *old_cpu, *new_cpu;
735 ib_delta_t *delta;
736 int intrload = 0;
737 int intrnsec = 0;
738 processorid_t cpus = 0;
739
740 /*
741 * Allocate a new delta structure.
742 */
743 delta = ib_delta_create();
744
745 /*
746 * Number of CPUs must be the same.
747 */
748 delta->id_missing = old_snapshot->is_num_cpus !=
749 new_snapshot->is_num_cpus;
750
751 if (delta->id_missing != 0) {
752 IB_LOG((CE_CONT, "ib_delta_generate: number of CPUs changed"));
753 return (delta);
754 }
755
756 /*
757 * Loop over the CPUs in both snapshots.
758 */
759 for (new_cpu = list_head(&new_snapshot->is_cpu_list),
760 old_cpu = list_head(&old_snapshot->is_cpu_list);
761 new_cpu != NULL && old_cpu != NULL;
762 new_cpu = list_next(&new_snapshot->is_cpu_list, new_cpu),
763 old_cpu = list_next(&old_snapshot->is_cpu_list, old_cpu)) {
764 ib_cpu_t *delta_cpu;
765 ib_ivec_t *new_ivec;
766
767 /* XXX: just onlined CPU? */
768
769 /* Allocate a new CPU structure */
770 delta_cpu = ib_cpu_create();
771
772 /* Difference of total time */
773 delta_cpu->ic_tot = new_cpu->ic_tot - old_cpu->ic_tot;
774 if (!(delta_cpu->ic_tot >= 0)) {
775 delta->id_missing = B_TRUE;
776 kmem_free(delta_cpu, sizeof (ib_cpu_t));
777 return (delta);
778 }
779
780 list_insert_tail(&delta->id_cpu_list, delta_cpu);
781
782 /* Avoid division by zero */
783 if (delta_cpu->ic_tot == 0)
784 delta_cpu->ic_tot = 1;
785
786 delta_cpu->ic_intrs = 0;
787 delta_cpu->ic_big_intrs = 0;
788
789 /*
790 * Number of interrupt vectors must be the same.
791 */
792 if (old_cpu->ic_num_ivecs != new_cpu->ic_num_ivecs) {
793 IB_LOG((CE_CONT, "ib_delta_generate: cpu %d has more "
794 "or less interrupts", old_cpu->ic_cpu_id));
795 delta->id_missing = B_TRUE;
796 return (delta);
797 }
798
799 /*
800 * Loop over the interrupt vectors of the new CPU.
801 */
802 for (new_ivec = list_head(&new_cpu->ic_ivec_list);
803 new_ivec != NULL; new_ivec =
804 list_next(&new_cpu->ic_ivec_list, new_ivec)) {
805 ib_ivec_t *ivec;
806 ib_ivec_t *delta_ivec;
807 hrtime_t time;
808
809 if (new_ivec->ii_num_ino == 0)
810 continue;
811
812 /*
813 * If interrupt vector does not exists or XXX crtime
814 * is different, set missing.
815 */
816 ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list,
817 new_ivec->ii_ino);
818 if (ivec == NULL) {
819 delta->id_missing = B_TRUE;
820 return (delta);
821 }
822
823 /* Allocate a new delta interrupt vector */
824 delta_ivec = ib_ivec_create(new_ivec->ii_buspath,
825 new_ivec->ii_ino);
826
827 /*
828 * Time used by this interrupt.
829 */
830 time = new_ivec->ii_time - ivec->ii_time;
831 if (time < 0) {
832 delta->id_missing = B_TRUE;
833 kmem_free(delta_ivec, sizeof (ib_delta_t));
834 return (delta);
835 }
836
837 delta_cpu->ic_intrs += time;
838 delta_ivec->ii_time = time;
839
840 if (time > delta_cpu->ic_bigintr)
841 delta_cpu->ic_bigintr = time;
842
843 /*
844 * Fill in the rest.
845 */
846 delta_ivec->ii_ihs = new_ivec->ii_ihs;
847 delta_ivec->ii_pil = new_ivec->ii_pil;
848 delta_ivec->ii_ino = new_ivec->ii_ino;
849 delta_ivec->ii_num_ino = new_ivec->ii_num_ino;
850 /* XXX: buspath, name */
851 }
852
853 /*
854 * Rounding error
855 */
856 if (delta_cpu->ic_tot < delta_cpu->ic_intrs)
857 delta_cpu->ic_tot = delta_cpu->ic_intrs;
858
859 delta_cpu->ic_intr_load =
860 delta_cpu->ic_intrs / delta_cpu->ic_tot;
861 intrload += delta_cpu->ic_intr_load;
862 intrnsec += delta_cpu->ic_intrs;
863
864 cpus++;
865 }
866
867 if (cpus > 0) {
868 delta->id_avgintrload = intrload / cpus;
869 delta->id_avgintrnsec = intrnsec / cpus;
870 } else {
871 delta->id_avgintrload = 0;
872 delta->id_avgintrnsec = 0;
873 }
874
875 return (delta);
876 }
877
878 /*
879 * Compress deltas.
880 */
881 static ib_delta_t *
882 ib_delta_compress(list_t *deltas)
883 {
884 ib_cpu_t *icpu;
885 ib_ivec_t *ivec;
886 ib_delta_t *new_delta, *delta;
887 processorid_t cpus = 0;
888 int high_intrload = 0;
889 int intrs = 0, tot;
890
891 /* Check if empty list of deltas */
892 if (deltas == NULL || list_is_empty(deltas) != 0) {
893 IB_LOG((CE_CONT, "ib_delta_compress: deltas are empty?"));
894 return (NULL);
895 }
896
897 /* Allocate a new delta structure */
898 new_delta = ib_delta_create();
899
900 /*
901 * Loop over the deltas in the list.
902 */
903 for (delta = list_head(deltas); delta != NULL;
904 delta = list_next(deltas, delta)) {
905
906 /* Compressing bad delta? */
907 if (delta->id_missing) {
908 IB_LOG((CE_CONT,
909 "ib_delta_compress: compressing bad deltas?"));
910 return (NULL);
911 }
912
913 FOREACH_CPU(icpu, delta->id_cpu_list) {
914 ib_cpu_t *new_cpu;
915 ib_ivec_t *new_ivec;
916
917 intrs += icpu->ic_intrs;
918 tot += icpu->ic_tot;
919 new_cpu = ib_cpu_create();
920 new_cpu->ic_cpu_id = icpu->ic_cpu_id;
921 new_cpu->ic_intrs = icpu->ic_intrs;
922 new_cpu->ic_tot = icpu->ic_tot;
923
924 /* XXX: exists ivecs */
925 FOREACH_IVEC(new_ivec, icpu->ic_ivec_list) {
926 ib_ivec_t *new_delta_ivec;
927
928 new_delta_ivec = ib_ivec_create(
929 new_ivec->ii_buspath, new_ivec->ii_ino);
930
931 }
932 }
933 }
934
935 FOREACH_CPU(icpu, new_delta->id_cpu_list) {
936 int bigintr = 0;
937
938 cpus++;
939
940 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
941 if (ivec->ii_time > bigintr)
942 bigintr = ivec->ii_time;
943 }
944
945 icpu->ic_bigintr = bigintr;
946 icpu->ic_intr_load = icpu->ic_intrs / icpu->ic_tot;
947
948 if (high_intrload < icpu->ic_intr_load)
949 high_intrload = icpu->ic_intr_load;
950
951 if (icpu->ic_tot <= 0)
952 icpu->ic_tot = 100;
953 }
954
955 if (cpus > 0) {
956 new_delta->id_avgintrload = intrs / tot;
957 new_delta->id_avgintrnsec = intrs / cpus;
958 } else {
959 new_delta->id_avgintrload = 0;
960 new_delta->id_avgintrnsec = 0;
961 }
962
963 /* XXX: global sleeptime */
964
965 return (new_delta);
966 }
967
968 /*
969 * Decide if the load is out of balance.
970 */
971 static int
972 ib_imbalanced(int goodness, int baseline)
973 {
974 if (goodness > 50)
975 return (100);
976
977 /* XXX: abs */
978 if ((goodness - baseline) > goodness_mindelta)
979 return (100);
980
981 return (0);
982 }
983
984 /*
985 * Calculate goodness of a CPU.
986 */
987 static int
988 ib_goodness_cpu(ib_cpu_t *icpu, int avg_interrupt_load)
989 {
990 int goodness;
991 int load, load_no_bigintr;
992
993 load = icpu->ic_intrs / icpu->ic_tot;
994 if (load < avg_interrupt_load)
995 return (0);
996
997 load_no_bigintr = (icpu->ic_intrs - icpu->ic_bigintr) / icpu->ic_tot;
998
999 if ((load > goodness_unsafe_load) && (icpu->ic_num_ivecs > 1))
1000 return (1);
1001
1002 goodness = load - avg_interrupt_load;
1003 if (goodness > load_no_bigintr)
1004 goodness = load_no_bigintr;
1005
1006 return (goodness);
1007 }
1008
1009 /*
1010 * Calculate goodness.
1011 */
1012 static int
1013 ib_goodness(ib_delta_t *delta)
1014 {
1015 ib_cpu_t *icpu;
1016 int goodness, high_goodness = 0;
1017
1018 if (delta->id_missing > 0)
1019 return (1);
1020
1021 FOREACH_CPU(icpu, delta->id_cpu_list) {
1022 goodness = ib_goodness_cpu(icpu, delta->id_avgintrload);
1023 if (!(goodness >= 0 && goodness <= 100)) {
1024 IB_LOG((CE_CONT,
1025 "ib_goodness: cpu goodness out of range?"));
1026 return (100);
1027 }
1028
1029 if (goodness == 100)
1030 return (100);
1031
1032 if (goodness > high_goodness)
1033 high_goodness = goodness;
1034 }
1035
1036 return (high_goodness);
1037 }
1038
/*
 * Recursive goal-splitting step (unfinished port of intrd's
 * do_find_goal).
 *
 * NOTE(review): this function is not functional yet — 'load', 'with'
 * and 'without' are read without ever being assigned (undefined
 * behavior), and the goals_with/goals_without lists are declared but
 * never populated.  Do not call until the XXX branches are filled in.
 */
static void
ib_do_find_goal(list_t ivecs, list_t loads, int goal, int idx)
{
	list_t goals_with;
	list_t goals_without;
	int with, without;
	int which, load;


	if (goal <= load) {
		with = load;
	} else {
		/* XXX: do_find_goal */
		with += load;
	}

	IB_LOG((CE_CONT, "XXX"));

	/* Pick whichever partition gets closer to the goal. */
	if (with >= goal && without < goal) {
		which = 0;
	} else if (with < goal && without >= goal) {
		which = 1;
	} else if (with >= goal && without >= goal) {
		which = without < with;
	} else {
		which = without > with;
	}

	if (which == 1) {
		IB_LOG((CE_CONT, "ib_do_find_goal: going without"));
		/* XXX */
	} else {
		IB_LOG((CE_CONT, "ib_do_find_goal: going with"));
		/* XXX */
	}
}
1075
1076 typedef struct _ib_goal {
1077 list_node_t *ig_link;
1078 int ig_value;
1079 } ib_goal_t;
1080
1081 typedef struct _ib_goal_load {
1082 list_node_t *igl_link;
1083 int igl_value;
1084 } ib_goal_load_t;
1085
/*
 * Compute the per-vector cumulative load list used to search for a
 * target distribution (port of intrd's find_goal; unfinished).
 *
 * For goal > 0 this sums all vector times and builds a list of
 * "remaining load" values, one per vector, where each entry holds the
 * total minus the vectors already passed.
 *
 * NOTE(review): both the 'goals' and 'loads' lists (and their nodes)
 * are leaked — nothing consumes them yet, and ib_do_find_goal() is
 * never invoked from here.  'ivecs' is passed by value; see the note
 * at ib_cpu_find().
 */
static void
ib_find_goal(list_t ivecs, int goal)
{
	ib_ivec_t *ivec;
	list_t goals;
	int load;

	if (goal <= 0) {
		list_create(&goals, sizeof (ib_goal_t),
		    offsetof (ib_goal_t, ig_link));
	} else {
		list_t loads;
		hrtime_t tot = 0;

		IB_LOG((CE_CONT, "ib_find_goal: finding goal from intrs XXX"));

		/* Total interrupt time over all vectors. */
		FOREACH_IVEC(ivec, ivecs) {
			tot += ivec->ii_time;
		}

		list_create(&loads, sizeof (ib_goal_load_t),
		    offsetof (ib_goal_load_t, igl_link));

		/* Record the load remaining from each vector onwards. */
		FOREACH_IVEC(ivec, ivecs) {
			ib_goal_load_t *igl = kmem_alloc(sizeof (ib_goal_load_t), KM_SLEEP);

			igl->igl_value = tot;
			list_insert_tail(&loads, igl);

			tot -= ivec->ii_time;
		}
	}
}
1119
1120 static void
1121 ib_do_reconfig_cpu2cpu(ib_delta_t *delta, processorid_t src_cpuid,
1122 processorid_t tgt_cpuid, int src_load)
1123 {
1124 ib_cpu_t *src_cpu, *tgt_cpu;
1125 ib_ivec_t *ivec;
1126 list_t ivecs;
1127 int goal;
1128 int avg_nsec;
1129
1130 if (delta == NULL)
1131 return;
1132
1133 goal = delta->id_avgintrnsec;
1134
1135 src_cpu = ib_cpu_find(delta->id_cpu_list, src_cpuid);
1136 if (src_cpu == NULL)
1137 return;
1138
1139 tgt_cpu = ib_cpu_find(delta->id_cpu_list, tgt_cpuid);
1140 if (tgt_cpu == NULL)
1141 return;
1142
1143 avg_nsec = (src_cpu->ic_intrs + tgt_cpu->ic_intrs) / 2;
1144 if (goal < avg_nsec)
1145 goal = avg_nsec;
1146
1147
1148 /*
1149 * Sort interrupt vectors by time.
1150 */
1151 list_create(&ivecs, sizeof (ib_ivec_t),
1152 offsetof (ib_ivec_t, ii_next));
1153
1154 ivec = list_head(&ivecs);
1155 if (ivec->ii_orig_cpu == src_cpuid) {
1156 IB_LOG((CE_CONT, "Keeping XXX on %d",
1157 src_cpuid)); /* ivec->ii_inum, */
1158 goal -= ivec->ii_time;
1159 /* XXX: shift */
1160 }
1161
1162 IB_LOG((CE_CONT, "ib_reconfig_cpu2cpu: inums should total %d", goal));
1163
1164 ib_find_goal(ivecs, goal);
1165 }
1166
/*
 * Rebalance a single overloaded CPU against the rest (port of intrd's
 * reconfig_cpu; unfinished).
 *
 * NOTE(review): this is currently a stub — it only looks up the CPU
 * and reads the average load; 'avgintrload' is computed but unused
 * and no vectors are moved yet.
 */
static void
ib_do_reconfig_cpu(ib_delta_t *delta, list_t *cpu_sorted_list,
    processorid_t old_cpu_id)
{
	ib_cpu_t *icpu;
	int avgintrload;

	if (delta == NULL)
		return;

	icpu = ib_cpu_find(delta->id_cpu_list, old_cpu_id);
	if (icpu == NULL)
		return;

	avgintrload = delta->id_avgintrload;

}
1184
1185 /*
1186 * Reconfigure interrupt distribution among CPUs.
1187 */
1188 static int
1189 ib_do_reconfig(ib_delta_t *delta)
1190 {
1191 ib_cpu_t *icpu;
1192 ib_ivec_t *ivec;
1193 list_t cpu_sorted_list;
1194 int goodness, new_goodness;
1195 int warned = 0;
1196 int rval = 1, ret = 1;
1197
1198 if (delta == NULL)
1199 return (-1);
1200
1201 goodness = delta->id_goodness;
1202 if (goodness < goodness_mindelta) {
1203 IB_LOG((CE_CONT, "ib_do_reconfig: goodness is good enough"));
1204 return (0);
1205 }
1206
1207 IB_LOG((CE_CONT, "ib_do_reconfig: optimizing interrupt assignments"));
1208
1209 if (delta->id_missing != 0) {
1210 IB_LOG((CE_CONT, "ib_do_reconfig: aborted"));
1211 return (-1);
1212 }
1213
1214 FOREACH_CPU(icpu, delta->id_cpu_list) {
1215 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
1216 ivec->ii_orig_cpu = icpu->ic_cpu_id;
1217 ivec->ii_now_cpu = icpu->ic_cpu_id;
1218 /* XXX: inum */
1219 }
1220 }
1221
1222 list_create(&cpu_sorted_list, sizeof (ib_cpu_t),
1223 offsetof(ib_cpu_t, ic_next));
1224
1225 /*
1226 * Have we an improvement?
1227 */
1228 new_goodness = ib_goodness(delta);
1229 if (!(new_goodness <= goodness)) {
1230 IB_LOG((CE_CONT,
1231 "ib_do_reconfig: result has worse goodness"));
1232 }
1233
1234 if ((goodness != 100 || new_goodness == 100) &&
1235 goodness - new_goodness < goodness_mindelta) {
1236 IB_LOG((CE_CONT,
1237 "ib_do_reconfig: goodness already near optimum"));
1238 return (0);
1239 }
1240
1241 /*
1242 * Move interrupts.
1243 */
1244
1245 FOREACH_CPU(icpu, delta->id_cpu_list) {
1246 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
1247 int error;
1248
1249 if (ivec->ii_orig_cpu == icpu->ic_cpu_id)
1250 continue;
1251
1252 error = ib_interrupt_do_move(ivec, icpu->ic_cpu_id);
1253 if (error != 0) {
1254 if (warned++ == 0) {
1255 IB_LOG((CE_CONT, "ib_do_reconfig: "
1256 "unable to move interrupt"));
1257 }
1258
1259 IB_LOG((CE_CONT, "ib_do_reconfig: "
1260 "unable to move buspath"));
1261
1262 ret = -1;
1263 }
1264 }
1265 }
1266
1267 return (rval);
1268 }
1269
1270
1271 /*
1272 * Check if the interrupt load did decrease.
1273 */
1274 static void
1275 ib_interrupt_move_check(ib_delta_t *delta, processorid_t old_cpuid,
1276 processorid_t new_cpuid)
1277 {
1278 ib_cpu_t *old_cpu, *new_cpu;
1279
1280 /*
1281 * Check old CPU.
1282 */
1283 old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid);
1284 if (old_cpu == NULL)
1285 return;
1286 if (!(old_cpu->ic_tot >= old_cpu->ic_intrs)) {
1287 IB_LOG((CE_CONT,
1288 "Moved interrupts left 100+%% load on source CPU"));
1289 }
1290
1291 /*
1292 * Check new CPU.
1293 */
1294 new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid);
1295 if (new_cpu == NULL)
1296 return;
1297 if (!(new_cpu->ic_tot >= new_cpu->ic_intrs)) {
1298 IB_LOG((CE_CONT,
1299 "Moved interrupts left 100+%% load on target CPU"));
1300 }
1301 }
1302
1303 /*
1304 * Actually moving the interrupt.
1305 */
1306 static int
1307 ib_interrupt_do_move(ib_ivec_t *ivec, processorid_t cpu_id)
1308 {
1309 int ret, result;
1310
1311 struct psm_ops *pops;
1312
1313 //pops = mach_set[0];
1314
1315 // ret = (*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_SET_CPU,
1316 // &result);
1317
1318 return (-1);
1319 }
1320
1321 /*
1322 * Move an interrupt to a different CPU.
1323 */
1324 static int
1325 ib_interrupt_move(ib_delta_t *delta, uint64_t inum, processorid_t old_cpuid,
1326 processorid_t new_cpuid)
1327 {
1328 ib_cpu_t *old_cpu, *new_cpu;
1329 ib_ivec_t *ivec;
1330
1331 if (delta == NULL)
1332 return (-1);
1333
1334 /*
1335 * Remove interrupt vector from old CPU.
1336 */
1337 old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid);
1338 if (old_cpu == NULL)
1339 return (-1);
1340
1341 ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list, inum);
1342
1343 old_cpu->ic_intrs -= ivec->ii_time;
1344 old_cpu->ic_intr_load = old_cpu->ic_intrs / old_cpu->ic_tot;
1345 ib_ivec_delete_ino(old_cpu->ic_ivec_list, inum);
1346
1347 /*
1348 * Verify interrupts.
1349 */
1350 if (!(old_cpu->ic_intrs >= 0)) {
1351 IB_LOG((CE_CONT,
1352 "ib_interrupt_move: interrupt time > total time?"));
1353 }
1354
1355 if (!(ivec->ii_time <= old_cpu->ic_bigintr)) {
1356 IB_LOG((CE_CONT,
1357 "ib_interrupt_move: interrupt time > big interrupt?"));
1358 }
1359
1360 if (ivec->ii_time >= old_cpu->ic_bigintr) {
1361 ib_ivec_t *time_ivec;
1362 uint64_t bigtime = 0;
1363
1364 FOREACH_IVEC(time_ivec, old_cpu->ic_ivec_list) {
1365 if (time_ivec->ii_time > bigtime)
1366 bigtime = time_ivec->ii_time;
1367 }
1368 }
1369
1370
1371 /*
1372 * Insert interrupt vector into new CPU.
1373 */
1374 new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid);
1375 if (new_cpu == NULL)
1376 return (-1);
1377
1378 ivec->ii_now_cpu = new_cpuid;
1379 new_cpu->ic_intrs += ivec->ii_time;
1380 new_cpu->ic_intr_load = new_cpu->ic_intrs / new_cpu->ic_tot;
1381 ib_ivec_add_ino(new_cpu->ic_ivec_list, ivec);
1382
1383 if (ivec->ii_time > new_cpu->ic_bigintr)
1384 new_cpu->ic_bigintr = ivec->ii_time;
1385
1386 return (0);
1387 }