1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2013 David Hoeppner. All rights reserved. 14 */ 15 16 /* 17 * Interrupt Load Balancer. 18 * 19 * The interrupt load balancer reassigns interrupts from one cpu 20 * to another, if the interrupt load 21 */ 22 23 #include <sys/param.h> 24 #include <sys/types.h> 25 #include <sys/systm.h> 26 #include <sys/callb.h> 27 #include <sys/cpuvar.h> 28 #include <sys/proc.h> 29 #include <sys/processor.h> 30 #include <sys/sdt.h> 31 #include <sys/sysmacros.h> 32 #include <sys/time.h> 33 #include <sys/cmn_err.h> 34 #include <sys/zone.h> 35 #include <sys/lgrp.h> 36 37 extern proc_t *proc_intrd; 38 39 #define IB_NAME "intrd" 40 41 /* 42 * Various sleeptimes. 43 */ 44 #define IB_NORMAL_SLEEPTIME 10 45 #define IB_IDLE_SLEEPTIME 45 46 #define IB_ONECPU_SLEEPTIME (60 * 15) 47 48 static kmutex_t ib_lock; 49 static kcondvar_t ib_cv; 50 51 /* 52 * System tuneable. 53 * 54 * Exclude interrupts in this list. 55 */ 56 static char *ib_exclude = NULL; 57 58 typedef struct _ib_exclude { 59 list_node_t *ix_next; 60 processorid_t ix_cpu_id; 61 } ib_exclude_list_t; 62 63 static list_t ib_exclude_list; 64 65 66 67 /* 68 * CPU structure. 
 */
/*
 * Per-CPU state.  One instance lives on the global ib_cpu_list for each
 * CPU available for interrupt handling; the same structure type is also
 * reused for snapshot and delta bookkeeping.
 */
typedef struct _ib_cpu {
	list_node_t	ic_next;	/* link on cpu/snapshot/delta lists */
	processorid_t	ic_cpu_id;	/* processor id */
	boolean_t	ic_offline;	/* CPU may not take interrupts */
	list_t		ic_ivec_list;	/* list of ib_ivec_t on this CPU */
	uint32_t	ic_num_ivecs;	/* number of interrupt vectors */
	hrtime_t	ic_tot;		/* total time (idle+user+system), ns */
	int64_t		ic_intrs;	/* in deltas: interrupt time, ns */
	int		ic_intr_load;	/* interrupts / total time */
	int64_t		ic_big_intrs;	/* NOTE(review): no visible writer; overlaps ic_bigintr? */
	int64_t		ic_bigintr;	/* largest interrupt on cpu */
	lgrp_t		*ic_lgrp;	/* locality group of this cpu */
} ib_cpu_t;

/*
 * Interrupt vector, largely mirroring the userland intrd data model.
 * Filled from the "pci_intrs" kstats.
 */
typedef struct _ib_ivec {
	list_node_t	ii_next;	/* link on a CPU's ic_ivec_list */
	uint64_t	ii_ihs;		/* interrupt handlers sharing the ino */
	uint64_t	ii_ino;		/* interrupt number */
	uint64_t	ii_num_ino;	/* inos grouped on this vector */
	uint64_t	ii_pil;		/* priority level */
	uint64_t	ii_time;	/* time consumed (from kstat "time") */
	char		*ii_buspath;	/* device bus path (not owned; XXX strdup) */
	char		*ii_name;	/* driver name */
	processorid_t	ii_orig_cpu;	/* current CPU */
	processorid_t	ii_now_cpu;	/* new to be assigned CPU */
	uint64_t	ii_inum;	/* NOTE(review): never initialized by ib_ivec_create */
	boolean_t	ii_goal;	/* selected for the "with" goal set */
} ib_ivec_t;

/*
 * MSI group: a device name plus the list of inos belonging to it.
 */
typedef struct _ib_msi {
	list_node_t	im_next;	/* link */
	const char	*im_name;	/* device name (not owned) */
	list_t		im_ino_list;	/* list of ib_msi_ino_t */
} ib_msi_t;

typedef struct _ib_msi_ino {
	list_node_t	imi_next;	/* link */
	uint64_t	imi_ino;	/* ino in the MSI group */
	ib_ivec_t	*imi_ivec;	/* back-pointer to the vector */
} ib_msi_ino_t;

/*
 * Snapshot: per-CPU statistics captured at one point in time.
 */
typedef struct _ib_snapshot {
	list_node_t	is_next;	/* link */
	list_t		is_cpu_list;	/* ib_cpu_t entries */
	processorid_t	is_num_cpus;	/* number of entries on is_cpu_list */
} ib_snapshot_t;

/*
 * Snapshot delta structure.
 */
/*
 * Difference between two consecutive snapshots; the unit the balancer
 * actually reasons about.
 */
typedef struct _ib_delta {
	list_node_t	id_next;	/* link on the caller's delta list */
	list_t		id_cpu_list;	/* per-CPU differences (ib_cpu_t) */
	boolean_t	id_missing;	/* snapshots disagree; delta unusable */
	int		id_avgintrload;	/* interrupts / total time */
	uint64_t	id_avgintrnsec;	/* avg interrupt nanoseconds per CPU */
	int		id_goodness;	/* cached goodness of this delta */
} ib_delta_t;

static list_t ib_cpu_list;		/* List of all OS CPUs */

/* Seconds to sleep between balancing passes. */
static long ib_sleeptime = IB_NORMAL_SLEEPTIME;
/* Number of CPUs currently registered on ib_cpu_list. */
static processorid_t ib_num_cpus;

/*
 * Goodness tunables.  The 50/90/10 comparisons elsewhere suggest that
 * loads and goodness are on a 0-100 (percent) scale.
 */
static int goodness_unsafe_load = 90;
static int goodness_mindelta = 10;

/*
 * Function prototypes.
 */
static void ib_cpu_register(processorid_t);
static int ib_cpu_setup(cpu_setup_t, int, void *);
static boolean_t ib_cpu_exclude(processorid_t);
static ib_cpu_t *ib_cpu_create(void);
static ib_cpu_t *ib_cpu_find(list_t, processorid_t);
static void ib_cpu_destroy(ib_cpu_t *);

static int ib_goodness(ib_delta_t *);
static int ib_do_reconfig(ib_delta_t *);
static int ib_imbalanced(int, int);
static int ib_interrupt_do_move(ib_ivec_t *, processorid_t);
static void ib_interrupt_move_check(ib_delta_t *, processorid_t, processorid_t);

static ib_snapshot_t *ib_get_statistics(void);
static ib_delta_t *ib_delta_generate(ib_snapshot_t *, ib_snapshot_t *);

/*
 * Helper macros.
167 */ 168 #define IS_CPU(cpu_id) (cpu[cpu_id] != NULL) 169 170 #define FOREACH_CPU(icpu, icpu_list) \ 171 for (icpu = list_head(&icpu_list); icpu != NULL; \ 172 icpu = list_next(&icpu_list, icpu)) 173 174 #define FOREACH_IVEC(ivec, ivec_list) \ 175 for (ivec = list_head(&ivec_list); ivec != NULL; \ 176 ivec = list_next(&ivec_list, ivec)) 177 178 #define DTRACE_INTRD(name) \ 179 DTRACE_PROBE(__intrd_##name) 180 181 #define DEBUG 1 182 #ifdef DEBUG 183 #define IB_APIDBG(args) cmn_err args 184 #define IB_IMPLDBG(args) cmn_err args 185 #else 186 #define IB_APIDBG(args) 187 #define IB_IMPLDBG(args) 188 #endif 189 190 #define IB_LOG(args) cmn_err args 191 192 void 193 interrupt_balancer(void) 194 { 195 processorid_t cpu_id; 196 callb_cpr_t cpr; 197 user_t *u = PTOU(curproc); 198 int error; 199 200 boolean_t do_reconfig = B_FALSE; 201 int goodness; 202 int baseline_goodness = 0; 203 list_t ib_delta_list; 204 hrtime_t statslen = 60; 205 206 proc_intrd = ttoproc(curthread); 207 proc_intrd->p_cstime = proc_intrd->p_stime = 0; 208 proc_intrd->p_cutime = proc_intrd->p_utime = 0; 209 210 (void) strncpy(u->u_psargs, IB_NAME, sizeof(u->u_psargs)); 211 (void) strncpy(u->u_comm, IB_NAME, sizeof(u->u_comm)); 212 213 /* Initialize global mutex lock */ 214 mutex_init(&ib_lock, NULL, MUTEX_DEFAULT, NULL); 215 216 /* Initialize CPU list */ 217 list_create(&ib_cpu_list, sizeof (ib_cpu_t), 218 offsetof(ib_cpu_t, ic_next)); 219 220 /* Initialize delta list */ 221 list_create(&ib_delta_list, sizeof (ib_delta_t), 222 offsetof(ib_delta_t, id_next)); 223 224 /* Initialize interrupt exclude list */ 225 list_create(&ib_exclude_list, sizeof (ib_exclude_list_t), 226 offsetof(ib_exclude_list_t, ix_next)); 227 228 /* 229 * Parse list of interrupts to exclude. 230 * 231 * XXX: move interrupts to active processors. 
232 */ 233 if (ib_exclude != NULL) { 234 processorid_t rval; 235 236 IB_LOG((CE_CONT, "XXX %s XXX", ib_exclude)); 237 } 238 239 /* 240 * Build a list of all CPUs available for interrupt handling. 241 */ 242 for (cpu_id = 0; cpu_id <= max_cpu_seqid_ever; cpu_id++) { 243 if (IS_CPU(cpu_id)) 244 ib_cpu_register(cpu_id); 245 } 246 247 /* 248 * Locality group information. 249 */ 250 int i; 251 for (i = 0; i < lgrp_plat_max_lgrps(); i++) { 252 lgrp_t *lgrp; 253 254 lgrp = lgrp_table[i]; 255 if (lgrp->lgrp_cpu != NULL) { 256 cpu_t *lgrp_cpu; 257 258 for (lgrp_cpu = lgrp->lgrp_cpu; lgrp_cpu != NULL; 259 lgrp_cpu =lgrp_cpu->cpu_next_lgrp) { 260 ib_cpu_t *icpu; 261 262 icpu = ib_cpu_find(ib_cpu_list, lgrp_cpu->cpu_id); 263 264 /* 265 * Assign locality group if we found a CPU. 266 */ 267 if (icpu != NULL) 268 icpu->ic_lgrp = lgrp; 269 } 270 } 271 } 272 273 /* 274 * Register a callback if a CPU goes offline or comes online. 275 */ 276 mutex_enter(&cpu_lock); 277 register_cpu_setup_func(ib_cpu_setup, NULL); 278 mutex_exit(&cpu_lock); 279 280 CALLB_CPR_INIT(&cpr, &ib_lock, callb_generic_cpr, IB_NAME); 281 282 ib_snapshot_t *snapshot = NULL; 283 ib_snapshot_t *new_snapshot = NULL; 284 hrtime_t delta_time; 285 hrtime_t deltas_tottime = 0; 286 boolean_t below_statslen; 287 288 snapshot = ib_get_statistics(); 289 290 mutex_enter(&ib_lock); 291 for (;;) { 292 ib_delta_t *delta; 293 294 DTRACE_INTRD(get_stats); 295 new_snapshot = ib_get_statistics(); 296 297 delta = ib_delta_generate(snapshot, new_snapshot); 298 299 below_statslen = (deltas_tottime < statslen); 300 deltas_tottime += delta_time; 301 do_reconfig = (below_statslen && deltas_tottime >= statslen); 302 303 list_insert_tail(&ib_delta_list, delta); 304 305 /* 306 * Calculate the goodness of the current configuration. 307 */ 308 goodness = ib_goodness(delta); 309 310 if (ib_imbalanced(goodness, baseline_goodness)) 311 do_reconfig = B_TRUE; 312 313 /* 314 * Reconfigure interrupt distribution. 
315 */ 316 if (do_reconfig) { 317 error = ib_do_reconfig(delta); 318 319 if (error != 0) { 320 if (error == -1) 321 IB_LOG((CE_CONT, "ib_do_reconfig failed!")); 322 } else { 323 IB_LOG((CE_CONT, "setting new baseline of %d", goodness)); 324 baseline_goodness = goodness; 325 } 326 } 327 328 /* 329 * Wait for timeout or CPU reconfiguration. 330 */ 331 CALLB_CPR_SAFE_BEGIN(&cpr); 332 cv_timedwait(&ib_cv, &ib_lock, ddi_get_lbolt() + 333 SEC_TO_TICK(ib_sleeptime)); 334 CALLB_CPR_SAFE_END(&cpr, &ib_lock); 335 } 336 337 CALLB_CPR_EXIT(&cpr); 338 339 /* 340 * Unregister CPU callback. 341 */ 342 mutex_enter(&cpu_lock); 343 unregister_cpu_setup_func(ib_cpu_setup, NULL); 344 mutex_exit(&cpu_lock); 345 346 list_destroy(&ib_exclude_list); 347 list_destroy(&ib_delta_list); 348 list_destroy(&ib_cpu_list); 349 350 } 351 352 /* 353 * Register a new CPU in the global list of CPUs. 354 */ 355 static void 356 ib_cpu_register(processorid_t cpu_id) 357 { 358 cpu_t *cp = cpu[cpu_id]; 359 ib_cpu_t *new_cpu; 360 361 /* 362 * Is this CPU baned from interrupt handling? 363 */ 364 if (ib_cpu_exclude(cpu_id)) 365 return; 366 367 new_cpu = ib_cpu_create(); 368 new_cpu->ic_cpu_id = cpu_id; 369 370 /* Initialize list for interrupt vectors */ 371 list_create(&new_cpu->ic_ivec_list, sizeof (ib_ivec_t), 372 offsetof(ib_ivec_t, ii_next)); 373 374 list_link_init(&new_cpu->ic_next); 375 376 /* Check if this CPU can handle interrupts */ 377 mutex_enter(&cpu_lock); 378 if (cpu_is_nointr(cp)) 379 new_cpu->ic_offline = B_TRUE; 380 mutex_exit(&cpu_lock); 381 382 /* Add CPU to list of CPUs */ 383 list_insert_tail(&ib_cpu_list, new_cpu); 384 385 ib_num_cpus++; 386 387 IB_IMPLDBG((CE_CONT, "ib_cpu_register: cpu=0x%x", cpu_id)); 388 } 389 390 /* 391 * Unregister CPU from the global list of CPUs. 
392 */ 393 static void 394 ib_cpu_unregister(processorid_t cpu_id) 395 { 396 ib_cpu_t *icpu; 397 398 mutex_enter(&ib_lock); 399 FOREACH_CPU(icpu, ib_cpu_list) { 400 if (icpu->ic_cpu_id == cpu_id) { 401 /* Remove CPU from global list */ 402 list_remove(&ib_cpu_list, icpu); 403 404 /* Free CPU structure */ 405 ib_cpu_destroy(icpu); 406 407 /* XXX or just offline CPU; statistics? */ 408 break; 409 } 410 } 411 mutex_exit(&ib_lock); 412 413 ib_num_cpus--; 414 415 IB_IMPLDBG((CE_CONT, "ib_cpu_unregister: cpu=0x%x", 416 cpu_id)); 417 } 418 419 /* 420 * Hook for CPU changes. 421 */ 422 static int 423 ib_cpu_setup(cpu_setup_t what, int cpu_id, void *arg) 424 { 425 426 switch (what) { 427 case CPU_UNCONFIG: 428 case CPU_CPUPART_OUT: 429 case CPU_OFF: 430 ib_cpu_unregister(cpu_id); 431 cv_signal(&ib_cv); 432 break; 433 434 case CPU_INTR_ON: 435 ib_cpu_register(cpu_id); 436 cv_signal(&ib_cv); 437 break; 438 439 default: 440 break; 441 } 442 443 return (0); 444 } 445 446 static ib_cpu_t * 447 ib_cpu_create(void) 448 { 449 ib_cpu_t *new_cpu; 450 451 new_cpu = kmem_alloc(sizeof (ib_cpu_t), KM_SLEEP); 452 new_cpu->ic_offline = B_FALSE; 453 454 return (new_cpu); 455 } 456 457 static void 458 ib_cpu_destroy(ib_cpu_t *old_cpu) 459 { 460 ib_ivec_t *ivec; 461 462 FOREACH_IVEC(ivec, old_cpu->ic_ivec_list) { 463 kmem_free(ivec, sizeof (ib_ivec_t)); 464 } 465 466 kmem_free(old_cpu, sizeof (ib_cpu_t)); 467 } 468 469 /* 470 * Find a CPU in the global list of CPUs by processor id. 471 */ 472 static ib_cpu_t * 473 ib_cpu_find(list_t cpu_list, processorid_t cpu_id) 474 { 475 ib_cpu_t *icpu; 476 477 IB_APIDBG((CE_CONT, "ib_cpu_find: API cpu = %d", cpu_id)); 478 479 FOREACH_CPU(icpu, cpu_list) { 480 if (icpu->ic_cpu_id == cpu_id) 481 return (icpu); 482 } 483 484 return (NULL); 485 } 486 487 /* 488 * Find a interrupt vector for a specific CPU. 
489 */ 490 static ib_ivec_t * 491 ib_cpu_find_ivec(list_t cpu_list, processorid_t cpu_id, char *buspath, 492 uint64_t ino) 493 { 494 ib_cpu_t *icpu; 495 ib_ivec_t *ivec; 496 497 icpu = ib_cpu_find(cpu_list, cpu_id); 498 if (icpu == NULL) 499 return (NULL); 500 501 for (ivec = list_head(&icpu->ic_ivec_list); ivec != NULL; 502 ivec = list_next(&icpu->ic_ivec_list, ivec)) { 503 if (ivec->ii_ino == ino) 504 return (ivec); 505 } 506 507 return (NULL); 508 } 509 510 /* 511 * Search exclude lists. 512 */ 513 static boolean_t 514 ib_cpu_exclude(processorid_t cpu_id) 515 { 516 ib_exclude_list_t *excluded_cpu; 517 518 /* 519 * Search global list of CPUs excluded from interrupt handling. 520 */ 521 for (excluded_cpu = list_head(&ib_exclude_list); excluded_cpu != NULL; 522 excluded_cpu = list_next(&ib_exclude_list, excluded_cpu)) { 523 if (excluded_cpu->ix_cpu_id == cpu_id) 524 return (B_TRUE); 525 } 526 527 return (B_FALSE); 528 } 529 530 /* 531 * Total times spend. 532 */ 533 static void 534 ib_cpu_statistics(ib_cpu_t *icpu) 535 { 536 cpu_t *cp; 537 hrtime_t msnsecs[NCMSTATES]; 538 hrtime_t new_tot; 539 540 cp = cpu[icpu->ic_cpu_id]; 541 get_cpu_mstate(cp, msnsecs); 542 543 icpu->ic_tot = msnsecs[CMS_IDLE] + msnsecs[CMS_USER] + 544 msnsecs[CMS_SYSTEM]; 545 546 } 547 548 /* 549 * Create a new interrupt vector. 550 */ 551 static ib_ivec_t * 552 ib_ivec_create(const char *buspath, uint64_t ino) 553 { 554 ib_ivec_t *ivec; 555 556 ivec = (ib_ivec_t *)kmem_alloc(sizeof (ib_ivec_t), KM_SLEEP); 557 558 list_link_init(&ivec->ii_next); 559 560 ivec->ii_buspath = (char *)buspath; /* XXX: strdup */ 561 ivec->ii_ino = ino; 562 ivec->ii_ihs = 1; 563 564 return (ivec); 565 } 566 567 static void 568 ib_ivec_register(ib_cpu_t *icpu) 569 { 570 } 571 572 /* 573 * Find interrupt vector by ino. 
574 */ 575 static ib_ivec_t * 576 ib_ivec_find_ino(list_t ivec_list, uint64_t ino) 577 { 578 ib_ivec_t *ivec; 579 580 FOREACH_IVEC(ivec, ivec_list) { 581 if (ivec->ii_inum == ino) 582 return (ivec); 583 } 584 585 return (NULL); 586 } 587 588 /* 589 * Delete a interrupt vector from a list. 590 */ 591 static void 592 ib_ivec_delete_ino(list_t ivec_list, uint64_t ino) 593 { 594 ib_ivec_t *ivec; 595 596 FOREACH_IVEC(ivec, ivec_list) { 597 if (ivec->ii_inum == ino) { 598 /* XXX: remove from list */ 599 ; 600 } 601 } 602 } 603 604 /* 605 * Add a new interrupt vector to a list. 606 */ 607 static void 608 ib_ivec_add_ino(list_t ivec_list, ib_ivec_t *ivec) 609 { 610 list_insert_tail(&ivec_list, ivec); 611 } 612 613 static ib_msi_t * 614 ib_msi_create(const char *name) 615 { 616 ib_msi_t *msi; 617 618 msi = (ib_msi_t *)kmem_alloc(sizeof (ib_msi_t), KM_SLEEP); 619 620 msi->im_name = name; 621 622 list_link_init(&msi->im_next); 623 list_create(&msi->im_ino_list, sizeof (ib_msi_ino_t), 624 offsetof(ib_msi_ino_t, imi_next)); 625 626 return (msi); 627 } 628 629 /* 630 * Allocate and initialize a new snapshot structure. 631 */ 632 static ib_snapshot_t * 633 ib_snapshot_create(void) 634 { 635 ib_snapshot_t *snapshot; 636 637 snapshot = kmem_alloc(sizeof (ib_snapshot_t), KM_SLEEP); 638 639 /* init link */ 640 641 /* Initialize CPU list */ 642 list_create(&snapshot->is_cpu_list, sizeof (ib_cpu_t), 643 offsetof(ib_cpu_t, ic_next)); 644 645 snapshot->is_num_cpus = 0; 646 647 return (snapshot); 648 } 649 650 /* 651 * Destroy a snapshot. 
652 */ 653 static void 654 ib_snapshot_destroy(ib_snapshot_t *snapshot) 655 { 656 ib_cpu_t *icpu; 657 658 FOREACH_CPU(icpu, snapshot->is_cpu_list) { 659 ib_cpu_destroy(icpu); 660 } 661 662 kmem_free(snapshot, sizeof (ib_snapshot_t)); 663 } 664 665 static ib_ivec_t * 666 ib_irq_fill_ivec(kstat_t *ksp) 667 { 668 kstat_named_t *knp; 669 ib_ivec_t *ivec; 670 char *datap; 671 uint64_t time; 672 int i; 673 674 datap = ksp->ks_data; 675 knp = KSTAT_NAMED_PTR(ksp); 676 for (i = 0; i < ksp->ks_ndata; i++, knp++) { 677 IB_IMPLDBG((CE_CONT, "ib_irq_fill_ivec: %s", 678 knp->name)); 679 680 if (strcmp(knp->name, "time") == 0) { 681 cmn_err(CE_CONT, "XXX ib time"); 682 time = knp->value.ui64; 683 } 684 685 knp += sizeof (kstat_named_t); 686 datap += sizeof (kstat_named_t); 687 } 688 689 /* Allocate a new interrupt vector */ 690 ivec = ib_ivec_create("", 0); 691 ivec->ii_time = time; 692 693 return (ivec); 694 } 695 696 /* 697 * XXX: icpu not needed, move out of loop 698 */ 699 static void 700 ib_irq_statistics(ib_cpu_t *icpu) 701 { 702 kstat_t *ksp; 703 int instance = 1; 704 705 /* 706 * Read pci interrupts. 707 */ 708 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES); 709 while (ksp != NULL) { 710 KSTAT_ENTER(ksp); 711 712 if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) { 713 ib_cpu_t *icpu; 714 ib_ivec_t *ivec; 715 kstat_named_t *knp; 716 kstat_named_t *datap; 717 uint64_t ino; 718 char *buspath; 719 char *namep; 720 processorid_t cpu_id; 721 int i; 722 boolean_t is_enabled = B_TRUE; 723 724 (void) KSTAT_UPDATE(ksp, KSTAT_READ); 725 726 /* 727 * Find the CPU this interrupt vector is on and 728 * if the vector itself is enabled. 
729 */ 730 datap = ksp->ks_data; 731 namep = KSTAT_NAMED_PTR(ksp)->name; 732 for (i = 0; i < ksp->ks_ndata; i++) { 733 if (strcmp(namep, "cpu") == 0) { 734 cpu_id = datap->value.ui64; 735 } else if (strcmp(namep, "type") == 0) { 736 if (strcmp(datap->value.c, "disabled") == 0) { 737 is_enabled = B_FALSE; 738 break; 739 } 740 } 741 742 namep += sizeof (kstat_named_t); 743 datap += sizeof (kstat_named_t); 744 } 745 746 /* 747 * Skip this interrupt vector if its disabled. 748 */ 749 if (!is_enabled) 750 continue; 751 752 /* 753 * Check if CPU is online. 754 */ 755 icpu = ib_cpu_find(ib_cpu_list, cpu_id); 756 if (icpu == NULL || icpu->ic_offline) 757 continue; 758 759 /* 760 * Fill information. 761 */ 762 ivec = ib_irq_fill_ivec(ksp); 763 if (ivec == NULL) 764 continue; 765 766 list_insert_tail(&icpu->ic_ivec_list, ivec); 767 } 768 769 KSTAT_EXIT(ksp); 770 kstat_rele(ksp); 771 772 instance++; 773 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES); 774 } 775 } 776 777 /* 778 * Collect data from CPUs and interrupt vectors. 779 */ 780 static ib_snapshot_t * 781 ib_get_statistics(void) 782 { 783 ib_cpu_t *os_cpu; 784 ib_snapshot_t *snapshot; 785 ib_cpu_t *snapshot_cpu; 786 787 /* 788 * Nothing to balance with one CPU. XXX: right place? 789 */ 790 if (ib_num_cpus <= 1) { 791 ib_sleeptime = IB_ONECPU_SLEEPTIME; 792 return (NULL); 793 } 794 795 /* 796 * Store all CPUs and ivecs here. 
797 */ 798 snapshot = ib_snapshot_create(); 799 800 /* 801 * Loop over all active CPUs 802 */ 803 FOREACH_CPU(os_cpu, ib_cpu_list) { 804 805 snapshot->is_num_cpus++; 806 807 snapshot_cpu = ib_cpu_create(); 808 snapshot_cpu->ic_cpu_id = os_cpu->ic_cpu_id; 809 810 list_insert_tail(&snapshot->is_cpu_list, snapshot_cpu); 811 812 ib_cpu_statistics(snapshot_cpu); 813 ib_irq_statistics(os_cpu); 814 } 815 816 return (snapshot); 817 } 818 819 static ib_delta_t * 820 ib_delta_create(void) 821 { 822 ib_delta_t *delta; 823 824 delta = kmem_alloc(sizeof (ib_delta_t), KM_SLEEP); 825 delta->id_missing = B_FALSE; 826 827 list_create(&delta->id_cpu_list, sizeof (ib_cpu_t), 828 offsetof(ib_cpu_t, ic_next)); 829 830 return (delta); 831 } 832 833 /* 834 * Generate the delta of two snapshots. 835 */ 836 static ib_delta_t * 837 ib_delta_generate(ib_snapshot_t *old_snapshot, ib_snapshot_t *new_snapshot) 838 { 839 ib_cpu_t *old_cpu, *new_cpu; 840 ib_delta_t *delta; 841 int intrload = 0; 842 int intrnsec = 0; 843 processorid_t cpus = 0; 844 845 /* 846 * Allocate a new delta structure. 847 */ 848 delta = ib_delta_create(); 849 850 /* 851 * Number of CPUs must be the same. 852 */ 853 delta->id_missing = old_snapshot->is_num_cpus != 854 new_snapshot->is_num_cpus; 855 856 if (delta->id_missing != 0) { 857 IB_LOG((CE_CONT, "ib_delta_generate: number of CPUs changed")); 858 return (delta); 859 } 860 861 /* 862 * Loop over the CPUs in both snapshots. 863 */ 864 for (new_cpu = list_head(&new_snapshot->is_cpu_list), 865 old_cpu = list_head(&old_snapshot->is_cpu_list); 866 new_cpu != NULL && old_cpu != NULL; 867 new_cpu = list_next(&new_snapshot->is_cpu_list, new_cpu), 868 old_cpu = list_next(&old_snapshot->is_cpu_list, old_cpu)) { 869 ib_cpu_t *delta_cpu; 870 ib_ivec_t *new_ivec; 871 872 /* XXX: just onlined CPU? 
*/ 873 874 /* Allocate a new CPU structure */ 875 delta_cpu = ib_cpu_create(); 876 877 /* Difference of total time */ 878 delta_cpu->ic_tot = new_cpu->ic_tot - old_cpu->ic_tot; 879 if (!(delta_cpu->ic_tot >= 0)) { 880 delta->id_missing = B_TRUE; 881 kmem_free(delta_cpu, sizeof (ib_cpu_t)); 882 return (delta); 883 } 884 885 list_insert_tail(&delta->id_cpu_list, delta_cpu); 886 887 /* Avoid division by zero */ 888 if (delta_cpu->ic_tot == 0) 889 delta_cpu->ic_tot = 1; 890 891 delta_cpu->ic_intrs = 0; 892 delta_cpu->ic_big_intrs = 0; 893 894 /* 895 * Number of interrupt vectors must be the same. 896 */ 897 if (old_cpu->ic_num_ivecs != new_cpu->ic_num_ivecs) { 898 IB_LOG((CE_CONT, "ib_delta_generate: cpu %d has more " 899 "or less interrupts", old_cpu->ic_cpu_id)); 900 delta->id_missing = B_TRUE; 901 return (delta); 902 } 903 904 /* 905 * Loop over the interrupt vectors of the new CPU. 906 */ 907 for (new_ivec = list_head(&new_cpu->ic_ivec_list); 908 new_ivec != NULL; new_ivec = 909 list_next(&new_cpu->ic_ivec_list, new_ivec)) { 910 ib_ivec_t *ivec; 911 ib_ivec_t *delta_ivec; 912 hrtime_t time; 913 914 if (new_ivec->ii_num_ino == 0) 915 continue; 916 917 /* 918 * If interrupt vector does not exists or XXX crtime 919 * is different, set missing. 920 */ 921 ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list, 922 new_ivec->ii_ino); 923 if (ivec == NULL) { 924 delta->id_missing = B_TRUE; 925 return (delta); 926 } 927 928 /* Allocate a new delta interrupt vector */ 929 delta_ivec = ib_ivec_create(new_ivec->ii_buspath, 930 new_ivec->ii_ino); 931 932 /* 933 * Time used by this interrupt. 934 */ 935 time = new_ivec->ii_time - ivec->ii_time; 936 if (time < 0) { 937 delta->id_missing = B_TRUE; 938 kmem_free(delta_ivec, sizeof (ib_delta_t)); 939 return (delta); 940 } 941 942 delta_cpu->ic_intrs += time; 943 delta_ivec->ii_time = time; 944 945 if (time > delta_cpu->ic_bigintr) 946 delta_cpu->ic_bigintr = time; 947 948 /* 949 * Fill in the rest. 
950 */ 951 delta_ivec->ii_ihs = new_ivec->ii_ihs; 952 delta_ivec->ii_pil = new_ivec->ii_pil; 953 delta_ivec->ii_ino = new_ivec->ii_ino; 954 delta_ivec->ii_num_ino = new_ivec->ii_num_ino; 955 /* XXX: buspath, name */ 956 } 957 958 /* 959 * Rounding error 960 */ 961 if (delta_cpu->ic_tot < delta_cpu->ic_intrs) 962 delta_cpu->ic_tot = delta_cpu->ic_intrs; 963 964 delta_cpu->ic_intr_load = 965 delta_cpu->ic_intrs / delta_cpu->ic_tot; 966 intrload += delta_cpu->ic_intr_load; 967 intrnsec += delta_cpu->ic_intrs; 968 969 cpus++; 970 } 971 972 if (cpus > 0) { 973 delta->id_avgintrload = intrload / cpus; 974 delta->id_avgintrnsec = intrnsec / cpus; 975 } else { 976 delta->id_avgintrload = 0; 977 delta->id_avgintrnsec = 0; 978 } 979 980 return (delta); 981 } 982 983 /* 984 * Compress deltas. 985 */ 986 static ib_delta_t * 987 ib_delta_compress(list_t *deltas) 988 { 989 ib_cpu_t *icpu; 990 ib_ivec_t *ivec; 991 ib_delta_t *new_delta, *delta; 992 processorid_t cpus = 0; 993 int high_intrload = 0; 994 int intrs = 0, tot; 995 996 /* Check if empty list of deltas */ 997 if (deltas == NULL || list_is_empty(deltas) != 0) { 998 IB_LOG((CE_CONT, "ib_delta_compress: deltas are empty?")); 999 return (NULL); 1000 } 1001 1002 /* Allocate a new delta structure */ 1003 new_delta = ib_delta_create(); 1004 1005 /* 1006 * Loop over the deltas in the list. 1007 */ 1008 for (delta = list_head(deltas); delta != NULL; 1009 delta = list_next(deltas, delta)) { 1010 1011 /* Compressing bad delta? 
*/ 1012 if (delta->id_missing) { 1013 IB_LOG((CE_CONT, 1014 "ib_delta_compress: compressing bad deltas?")); 1015 return (NULL); 1016 } 1017 1018 FOREACH_CPU(icpu, delta->id_cpu_list) { 1019 ib_cpu_t *new_cpu; 1020 ib_ivec_t *new_ivec; 1021 1022 intrs += icpu->ic_intrs; 1023 tot += icpu->ic_tot; 1024 new_cpu = ib_cpu_create(); 1025 new_cpu->ic_cpu_id = icpu->ic_cpu_id; 1026 new_cpu->ic_intrs = icpu->ic_intrs; 1027 new_cpu->ic_tot = icpu->ic_tot; 1028 1029 /* XXX: exists ivecs */ 1030 FOREACH_IVEC(new_ivec, icpu->ic_ivec_list) { 1031 ib_ivec_t *new_delta_ivec; 1032 1033 new_delta_ivec = ib_ivec_create( 1034 new_ivec->ii_buspath, new_ivec->ii_ino); 1035 1036 } 1037 } 1038 } 1039 1040 FOREACH_CPU(icpu, new_delta->id_cpu_list) { 1041 int bigintr = 0; 1042 1043 cpus++; 1044 1045 FOREACH_IVEC(ivec, icpu->ic_ivec_list) { 1046 if (ivec->ii_time > bigintr) 1047 bigintr = ivec->ii_time; 1048 } 1049 1050 icpu->ic_bigintr = bigintr; 1051 icpu->ic_intr_load = icpu->ic_intrs / icpu->ic_tot; 1052 1053 if (high_intrload < icpu->ic_intr_load) 1054 high_intrload = icpu->ic_intr_load; 1055 1056 if (icpu->ic_tot <= 0) 1057 icpu->ic_tot = 100; 1058 } 1059 1060 if (cpus > 0) { 1061 new_delta->id_avgintrload = intrs / tot; 1062 new_delta->id_avgintrnsec = intrs / cpus; 1063 } else { 1064 new_delta->id_avgintrload = 0; 1065 new_delta->id_avgintrnsec = 0; 1066 } 1067 1068 /* XXX: global sleeptime */ 1069 1070 return (new_delta); 1071 } 1072 1073 /* 1074 * Decide if the load is out of balance. 1075 */ 1076 static int 1077 ib_imbalanced(int goodness, int baseline) 1078 { 1079 if (goodness > 50) 1080 return (100); 1081 1082 /* XXX: abs */ 1083 if ((goodness - baseline) > goodness_mindelta) 1084 return (100); 1085 1086 return (0); 1087 } 1088 1089 /* 1090 * Calculate goodness of a CPU. 
1091 */ 1092 static int 1093 ib_goodness_cpu(ib_cpu_t *icpu, int avg_interrupt_load) 1094 { 1095 int goodness; 1096 int load, load_no_bigintr; 1097 1098 load = icpu->ic_intrs / icpu->ic_tot; 1099 if (load < avg_interrupt_load) 1100 return (0); 1101 1102 load_no_bigintr = (icpu->ic_intrs - icpu->ic_bigintr) / icpu->ic_tot; 1103 1104 if ((load > goodness_unsafe_load) && (icpu->ic_num_ivecs > 1)) 1105 return (1); 1106 1107 goodness = load - avg_interrupt_load; 1108 if (goodness > load_no_bigintr) 1109 goodness = load_no_bigintr; 1110 1111 return (goodness); 1112 } 1113 1114 /* 1115 * Calculate goodness. 1116 */ 1117 static int 1118 ib_goodness(ib_delta_t *delta) 1119 { 1120 ib_cpu_t *icpu; 1121 int goodness, high_goodness = 0; 1122 1123 if (delta->id_missing > 0) 1124 return (1); 1125 1126 FOREACH_CPU(icpu, delta->id_cpu_list) { 1127 goodness = ib_goodness_cpu(icpu, delta->id_avgintrload); 1128 if (!(goodness >= 0 && goodness <= 100)) { 1129 IB_LOG((CE_CONT, 1130 "ib_goodness: cpu goodness out of range?")); 1131 return (100); 1132 } 1133 1134 if (goodness == 100) 1135 return (100); 1136 1137 if (goodness > high_goodness) 1138 high_goodness = goodness; 1139 } 1140 1141 return (high_goodness); 1142 } 1143 1144 static void 1145 ib_do_find_goal(list_t ivecs, list_t loads, int goal, int idx) 1146 { 1147 list_t goals_with; 1148 list_t goals_without; 1149 int with, without; 1150 int which, load; 1151 1152 1153 if (goal <= load) { 1154 with = load; 1155 } else { 1156 /* XXX: do_find_goal */ 1157 with += load; 1158 } 1159 1160 IB_LOG((CE_CONT, "XXX")); 1161 1162 if (with >= goal && without < goal) { 1163 which = 0; 1164 } else if (with < goal && without >= goal) { 1165 which = 1; 1166 } else if (with >= goal && without >= goal) { 1167 which = without < with; 1168 } else { 1169 which = without > with; 1170 } 1171 1172 if (which == 1) { 1173 IB_LOG((CE_CONT, "ib_do_find_goal: going without")); 1174 /* XXX */ 1175 } else { 1176 IB_LOG((CE_CONT, "ib_do_find_goal: going with")); 
1177 /* XXX */ 1178 } 1179 } 1180 1181 typedef struct _ib_goal { 1182 list_node_t *ig_link; 1183 int ig_value; 1184 } ib_goal_t; 1185 1186 typedef struct _ib_goal_load { 1187 list_node_t *igl_link; 1188 int igl_value; 1189 } ib_goal_load_t; 1190 1191 static void 1192 ib_find_goal(list_t ivecs, int goal) 1193 { 1194 ib_ivec_t *ivec; 1195 list_t goals; 1196 int load; 1197 1198 if (goal <= 0) { 1199 list_create(&goals, sizeof (ib_goal_t), 1200 offsetof (ib_goal_t, ig_link)); 1201 } else { 1202 list_t loads; 1203 hrtime_t tot = 0; 1204 1205 IB_LOG((CE_CONT, "ib_find_goal: finding goal from intrs XXX")); 1206 1207 FOREACH_IVEC(ivec, ivecs) { 1208 tot += ivec->ii_time; 1209 } 1210 1211 list_create(&loads, sizeof (ib_goal_load_t), 1212 offsetof (ib_goal_load_t, igl_link)); 1213 1214 FOREACH_IVEC(ivec, ivecs) { 1215 ib_goal_load_t *igl = kmem_alloc(sizeof (ib_goal_load_t), KM_SLEEP); 1216 1217 igl->igl_value = tot; 1218 list_insert_tail(&loads, igl); 1219 1220 tot -= ivec->ii_time; 1221 } 1222 } 1223 } 1224 1225 static void 1226 ib_do_reconfig_cpu2cpu(ib_delta_t *delta, processorid_t src_cpuid, 1227 processorid_t tgt_cpuid, int src_load) 1228 { 1229 ib_cpu_t *src_cpu, *tgt_cpu; 1230 ib_ivec_t *ivec; 1231 list_t ivecs; 1232 int goal, new_load; 1233 int avg_nsec; 1234 1235 if (delta == NULL) 1236 return; 1237 1238 goal = delta->id_avgintrnsec; 1239 1240 src_cpu = ib_cpu_find(delta->id_cpu_list, src_cpuid); 1241 if (src_cpu == NULL) 1242 return; 1243 1244 tgt_cpu = ib_cpu_find(delta->id_cpu_list, tgt_cpuid); 1245 if (tgt_cpu == NULL) 1246 return; 1247 1248 avg_nsec = (src_cpu->ic_intrs + tgt_cpu->ic_intrs) / 2; 1249 if (goal < avg_nsec) 1250 goal = avg_nsec; 1251 1252 1253 /* 1254 * Sort interrupt vectors by time. 
1255 */ 1256 list_create(&ivecs, sizeof (ib_ivec_t), 1257 offsetof (ib_ivec_t, ii_next)); 1258 1259 ivec = list_head(&ivecs); 1260 if (ivec->ii_orig_cpu == src_cpuid) { 1261 IB_LOG((CE_CONT, "Keeping XXX on %d", 1262 src_cpuid)); /* ivec->ii_inum, */ 1263 goal -= ivec->ii_time; 1264 /* XXX: shift */ 1265 } 1266 1267 IB_LOG((CE_CONT, "ib_reconfig_cpu2cpu: inums should total %d", goal)); 1268 1269 ib_find_goal(ivecs, goal); 1270 1271 FOREACH_IVEC(ivec, ivecs) { 1272 if (!(ivec->ii_now_cpu == src_cpuid || 1273 ivec->ii_now_cpu == tgt_cpuid)) { 1274 IB_LOG((CE_CONT, "ib_do_reconfig_cpu2cpu: ")); 1275 } 1276 1277 if (ivec->ii_goal && ivec->ii_now_cpu != src_cpuid) { 1278 ib_interrupt_do_move(ivec, src_cpuid); 1279 } else if (ivec->ii_goal == B_FALSE && 1280 ivec->ii_now_cpu != tgt_cpuid) { 1281 ib_interrupt_do_move(ivec, tgt_cpuid); 1282 } 1283 } 1284 1285 ib_interrupt_move_check(delta, src_cpuid, tgt_cpuid); 1286 1287 new_load = src_cpu->ic_intrs / src_cpu->ic_tot; 1288 1289 if (!(new_load <= src_load && new_load > delta->id_avgintrload)) { 1290 IB_LOG((CE_CONT, "ib_reconfig_cpu2cpu: %d", new_load)); 1291 } 1292 } 1293 1294 static void 1295 ib_do_reconfig_cpu(ib_delta_t *delta, list_t *cpu_sorted_list, 1296 processorid_t old_cpu_id) 1297 { 1298 ib_cpu_t *icpu; 1299 int avgintrload; 1300 1301 if (delta == NULL) 1302 return; 1303 1304 icpu = ib_cpu_find(delta->id_cpu_list, old_cpu_id); 1305 if (icpu == NULL) 1306 return; 1307 1308 avgintrload = delta->id_avgintrload; 1309 1310 } 1311 1312 /* 1313 * Reconfigure interrupt distribution among CPUs. 
1314 */ 1315 static int 1316 ib_do_reconfig(ib_delta_t *delta) 1317 { 1318 ib_cpu_t *icpu; 1319 ib_ivec_t *ivec; 1320 list_t cpu_sorted_list; 1321 int goodness, new_goodness; 1322 int warned = 0; 1323 int rval = 1, ret = 1; 1324 1325 if (delta == NULL) 1326 return (-1); 1327 1328 goodness = delta->id_goodness; 1329 if (goodness < goodness_mindelta) { 1330 IB_LOG((CE_CONT, "ib_do_reconfig: goodness is good enough")); 1331 return (0); 1332 } 1333 1334 IB_LOG((CE_CONT, "ib_do_reconfig: optimizing interrupt assignments")); 1335 1336 if (delta->id_missing != 0) { 1337 IB_LOG((CE_CONT, "ib_do_reconfig: aborted")); 1338 return (-1); 1339 } 1340 1341 FOREACH_CPU(icpu, delta->id_cpu_list) { 1342 FOREACH_IVEC(ivec, icpu->ic_ivec_list) { 1343 ivec->ii_orig_cpu = icpu->ic_cpu_id; 1344 ivec->ii_now_cpu = icpu->ic_cpu_id; 1345 /* XXX: inum */ 1346 } 1347 } 1348 1349 list_create(&cpu_sorted_list, sizeof (ib_cpu_t), 1350 offsetof(ib_cpu_t, ic_next)); 1351 1352 /* 1353 * Have we an improvement? 1354 */ 1355 new_goodness = ib_goodness(delta); 1356 if (!(new_goodness <= goodness)) { 1357 IB_LOG((CE_CONT, 1358 "ib_do_reconfig: result has worse goodness")); 1359 } 1360 1361 if ((goodness != 100 || new_goodness == 100) && 1362 goodness - new_goodness < goodness_mindelta) { 1363 IB_LOG((CE_CONT, 1364 "ib_do_reconfig: goodness already near optimum")); 1365 return (0); 1366 } 1367 1368 /* 1369 * Move interrupts. 
1370 */ 1371 FOREACH_CPU(icpu, delta->id_cpu_list) { 1372 FOREACH_IVEC(ivec, icpu->ic_ivec_list) { 1373 int error; 1374 1375 if (ivec->ii_orig_cpu == icpu->ic_cpu_id) 1376 continue; 1377 1378 error = ib_interrupt_do_move(ivec, icpu->ic_cpu_id); 1379 if (error != 0) { 1380 if (warned++ == 0) { 1381 IB_LOG((CE_CONT, "ib_do_reconfig: " 1382 "unable to move interrupt")); 1383 } 1384 1385 IB_LOG((CE_CONT, "ib_do_reconfig: " 1386 "unable to move buspath")); 1387 1388 ret = -1; 1389 } 1390 } 1391 } 1392 1393 return (rval); 1394 } 1395 1396 1397 /* 1398 * Check if the interrupt load did decrease. 1399 */ 1400 static void 1401 ib_interrupt_move_check(ib_delta_t *delta, processorid_t old_cpuid, 1402 processorid_t new_cpuid) 1403 { 1404 ib_cpu_t *old_cpu, *new_cpu; 1405 1406 /* 1407 * Check old CPU. 1408 */ 1409 old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid); 1410 if (old_cpu == NULL) 1411 return; 1412 if (!(old_cpu->ic_tot >= old_cpu->ic_intrs)) { 1413 IB_LOG((CE_CONT, 1414 "Moved interrupts left 100+%% load on source CPU")); 1415 } 1416 1417 /* 1418 * Check new CPU. 1419 */ 1420 new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid); 1421 if (new_cpu == NULL) 1422 return; 1423 if (!(new_cpu->ic_tot >= new_cpu->ic_intrs)) { 1424 IB_LOG((CE_CONT, 1425 "Moved interrupts left 100+%% load on target CPU")); 1426 } 1427 } 1428 1429 /* 1430 * Actually move the interrupt. 1431 */ 1432 static int 1433 ib_interrupt_do_move(ib_ivec_t *ivec, processorid_t cpu_id) 1434 { 1435 int ret, result; 1436 1437 struct psm_ops *pops; 1438 1439 //pops = mach_set[0]; 1440 1441 // ret = (*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_SET_CPU, 1442 // &result); 1443 1444 return (-1); 1445 } 1446 1447 /* 1448 * Move an interrupt to a different CPU. 
1449 */ 1450 static int 1451 ib_interrupt_move(ib_delta_t *delta, uint64_t inum, processorid_t old_cpuid, 1452 processorid_t new_cpuid) 1453 { 1454 ib_cpu_t *old_cpu, *new_cpu; 1455 ib_ivec_t *ivec; 1456 1457 if (delta == NULL) 1458 return (-1); 1459 1460 /* 1461 * Remove interrupt vector from old CPU. 1462 */ 1463 old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid); 1464 if (old_cpu == NULL) 1465 return (-1); 1466 1467 ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list, inum); 1468 1469 old_cpu->ic_intrs -= ivec->ii_time; 1470 old_cpu->ic_intr_load = old_cpu->ic_intrs / old_cpu->ic_tot; 1471 ib_ivec_delete_ino(old_cpu->ic_ivec_list, inum); 1472 1473 /* 1474 * Verify interrupts. 1475 */ 1476 if (!(old_cpu->ic_intrs >= 0)) { 1477 IB_LOG((CE_CONT, 1478 "ib_interrupt_move: interrupt time > total time?")); 1479 } 1480 1481 if (!(ivec->ii_time <= old_cpu->ic_bigintr)) { 1482 IB_LOG((CE_CONT, 1483 "ib_interrupt_move: interrupt time > big interrupt?")); 1484 } 1485 1486 if (ivec->ii_time >= old_cpu->ic_bigintr) { 1487 ib_ivec_t *time_ivec; 1488 uint64_t bigtime = 0; 1489 1490 FOREACH_IVEC(time_ivec, old_cpu->ic_ivec_list) { 1491 if (time_ivec->ii_time > bigtime) 1492 bigtime = time_ivec->ii_time; 1493 } 1494 } 1495 1496 /* 1497 * Insert interrupt vector into new CPU. 1498 */ 1499 new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid); 1500 if (new_cpu == NULL) 1501 return (-1); 1502 1503 ivec->ii_now_cpu = new_cpuid; 1504 new_cpu->ic_intrs += ivec->ii_time; 1505 new_cpu->ic_intr_load = new_cpu->ic_intrs / new_cpu->ic_tot; 1506 ib_ivec_add_ino(new_cpu->ic_ivec_list, ivec); 1507 1508 if (ivec->ii_time > new_cpu->ic_bigintr) 1509 new_cpu->ic_bigintr = ivec->ii_time; 1510 1511 return (0); 1512 }