/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2013 David Hoeppner. All rights reserved.
 */

/*
 * Interrupt Load Balancer.
 *
 * In-kernel port of the intrd(8) balancing logic: periodically snapshot
 * per-CPU interrupt statistics, compute deltas between snapshots, rate
 * the "goodness" of the current interrupt distribution and reassign
 * interrupt vectors when the load is out of balance.
 *
 * NOTE: the balance function currently views all CPUs as equal; lgroup
 * locality is not yet taken into account.
 */

/* XXX
 *
 * Debugging via mdb:
 *	ib_cpu_list::walk list |::print ib_cpu_t
 *
 */
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/proc.h>
#include <sys/processor.h>
#include <sys/sdt.h>
#include <sys/sysmacros.h>
#include <sys/time.h>
#include <sys/cmn_err.h>
#include <sys/zone.h>
#include <sys/lgrp.h>

#include <sys/pci_tools.h>

extern proc_t *proc_intrd;

#define	IB_NAME			"intrd"

/* True when the sequential processor id refers to an existing CPU. */
#define	IS_CPU(cpu_id)		(cpu[cpu_id] != NULL)

/* Sleep times (seconds) between balancing passes. */
#define	IB_NORMAL_SLEEPTIME	10
#define	IB_IDLE_SLEEPTIME	45
#define	IB_ONECPU_SLEEPTIME	(60 * 15)

#define	IB_NUM_SAMPLES		6


static kmutex_t ib_lock;	/* protects the balancer state */
static kcondvar_t ib_cv;	/* signalled on CPU online/offline events */

/*
 * Interrupt CPU instance.
 *
 * One entry per CPU eligible for interrupt handling; also reused as the
 * per-CPU element of snapshots and deltas.
 */
typedef struct _ib_cpu {
	list_node_t	ic_next;	/* link on ib_cpu_list */
	boolean_t	ic_offline;	/* CPU excluded from interrupts */

	hrtime_t	ic_tot;		/* total time (idle+user+system) */
	list_t		ic_ivec_list;	/* ivecs bound to this CPU */
	uint32_t	ic_num_ivecs;	/* length of ic_ivec_list */
	processorid_t	ic_cpu_id;	/* XXX duplicate */
	int64_t		ic_intrs;	/* nsec spent in interrupts */
	int64_t		ic_big_intrs;
	int64_t		ic_bigintr;	/* largest single ivec time XXX bitintrs */

	int		ic_intr_load;	/* intrs / tot */
} ib_cpu_t;

/*
 * Interrupt vector instance.
 */
typedef struct _ib_ivec {
	list_node_t	ii_next;	/* link */

	uint64_t	ii_ihs;		/* number of interrupt handlers */
	uint64_t	ii_ino;		/* interrupt number */
	uint64_t	ii_num_ino;
	uint64_t	ii_pil;		/* priority level */
	uint64_t	ii_time;	/* nsec consumed by this vector */
	char		*ii_buspath;
	char		*ii_name;

	processorid_t	ii_orig_cpu;	/* CPU before reconfiguration */
	processorid_t	ii_now_cpu;	/* CPU after (planned) move */
	uint64_t	ii_inum;
} ib_ivec_t;

/*
 * MSI
 */
typedef struct _ib_msi {
	list_node_t	im_next;	/* link */
	const char	*im_name;
	list_t		im_ino_list;	/* ino's sharing this MSI group */
} ib_msi_t;

typedef struct _ib_msi_ino {
	list_node_t	imi_next;	/* link */
	uint64_t	imi_ino;
	ib_ivec_t	*imi_ivec;
} ib_msi_ino_t;

/*
 * Snapshot of all CPUs and their interrupt vectors at one point in time.
 */
typedef struct _ib_snapshot {
	list_node_t	is_next;	/* link */
	list_t		is_cpu_list;	/* list of ib_cpu_t */
	processorid_t	is_num_cpus;
} ib_snapshot_t;

/*
 * Snapshot delta structure: the difference between two snapshots.
 */
typedef struct _ib_delta {
	list_node_t	id_next;	/* link */
	list_t		id_cpu_list;	/* per-CPU differences (ib_cpu_t) */
	boolean_t	id_missing;	/* delta unusable (config changed) */
	int		id_avgintrload;	/* interrupts / total time */
	uint64_t	id_avgintrnsec;	/* avg interrupt nsec per CPU */
	int		id_goodness;
} ib_delta_t;


static list_t ib_cpu_list;		/* List of all CPU's */

static uint8_t ib_cs = 0;		/* Index of current sample */
static long ib_sleeptime = IB_NORMAL_SLEEPTIME;	/* seconds between passes */
static processorid_t ib_num_cpus;	/* length of ib_cpu_list */

/* Tunables: load considered unsafe, minimum goodness change worth acting on. */
static int goodness_unsafe_load = 90;
static int goodness_mindelta = 10;

/*
 * Function prototypes.
 */
static void ib_cpu_register(processorid_t);
static int ib_cpu_setup(cpu_setup_t, int, void *);
static int ib_goodness(ib_delta_t *);
static int ib_do_reconfig(ib_delta_t *);
static int ib_imbalanced(int, int);
static int ib_interrupt_do_move(ib_ivec_t *, processorid_t);

static ib_snapshot_t *ib_get_statistics(void);
static ib_delta_t *ib_delta_generate(ib_snapshot_t *, ib_snapshot_t *);

/*
 * Helper macros.
159 */ 160 #define FOREACH_CPU(icpu, icpu_list) \ 161 for (icpu = list_head(&icpu_list); icpu != NULL; \ 162 icpu = list_next(&icpu_list, icpu)) 163 164 #define FOREACH_IVEC(ivec, ivec_list) \ 165 for (ivec = list_head(&ivec_list); ivec != NULL; \ 166 ivec = list_next(&ivec_list, ivec)) 167 168 #define DTRACE_INTRD(name) \ 169 DTRACE_PROBE(__intrd_##name) 170 171 #define DEBUG 1 172 #ifdef DEBUG 173 #define IB_APIDBG(args) cmn_err args 174 #define IB_IMPLDBG(args) cmn_err args 175 #else 176 #define IB_APIDBG(args) 177 #define IB_IMPLDBG(args) 178 #endif 179 180 #define IB_LOG(args) cmn_err args 181 182 void 183 interrupt_balancer(void) 184 { 185 processorid_t cpu_id; 186 callb_cpr_t cpr; 187 user_t *u = PTOU(curproc); 188 int error; 189 190 boolean_t do_reconfig = B_FALSE; 191 int goodness; 192 int baseline_goodness = 0; 193 list_t ib_delta_list; 194 hrtime_t statslen = 60; 195 196 proc_intrd = ttoproc(curthread); 197 proc_intrd->p_cstime = proc_intrd->p_stime = 0; 198 proc_intrd->p_cutime = proc_intrd->p_utime = 0; 199 200 (void) strncpy(u->u_psargs, IB_NAME, sizeof(u->u_psargs)); 201 (void) strncpy(u->u_comm, IB_NAME, sizeof(u->u_comm)); 202 203 /* Initialize global mutex lock */ 204 mutex_init(&ib_lock, NULL, MUTEX_DEFAULT, NULL); 205 206 /* Initialize CPU list */ 207 list_create(&ib_cpu_list, sizeof (ib_cpu_t), 208 offsetof(ib_cpu_t, ic_next)); 209 210 /* Initialize delta list */ 211 list_create(&ib_delta_list, sizeof (ib_delta_t), 212 offsetof(ib_delta_t, id_next)); 213 214 /* 215 * Build a list of all CPUs available for interrupt handling. 216 */ 217 for (cpu_id = 0; cpu_id <= max_cpu_seqid_ever; cpu_id++) { 218 if (IS_CPU(cpu_id)) 219 ib_cpu_register(cpu_id); 220 } 221 222 /* 223 * Locality group information. 224 */ 225 int i; 226 for (i = 0; i < lgrp_plat_max_lgrps(); i++) { 227 lgrp_t *lgrp; 228 229 lgrp = lgrp_table[i]; 230 } 231 232 /* 233 * Register a callback if a CPU goes offline or comes online. 
234 */ 235 mutex_enter(&cpu_lock); 236 register_cpu_setup_func(ib_cpu_setup, NULL); 237 mutex_exit(&cpu_lock); 238 239 CALLB_CPR_INIT(&cpr, &ib_lock, callb_generic_cpr, IB_NAME); 240 241 ib_snapshot_t *snapshot = NULL; 242 ib_snapshot_t *new_snapshot = NULL; 243 hrtime_t delta_time; 244 hrtime_t deltas_tottime = 0; 245 boolean_t below_statslen; 246 247 snapshot = ib_get_statistics(); 248 249 mutex_enter(&ib_lock); 250 for (;;) { 251 ib_delta_t *delta; 252 253 DTRACE_INTRD(get_stats); 254 new_snapshot = ib_get_statistics(); 255 256 delta = ib_delta_generate(snapshot, new_snapshot); 257 258 below_statslen = (deltas_tottime < statslen); 259 deltas_tottime += delta_time; 260 do_reconfig = (below_statslen && deltas_tottime >= statslen); 261 262 list_insert_tail(&ib_delta_list, delta); 263 264 /* 265 * Calculate the goodness of the current configuration. 266 */ 267 goodness = ib_goodness(delta); 268 269 if (ib_imbalanced(goodness, baseline_goodness)) 270 do_reconfig = B_TRUE; 271 272 /* 273 * Reconfigure interrupt distribution. 274 */ 275 if (do_reconfig) { 276 error = ib_do_reconfig(delta); 277 278 if (error != 0) { 279 if (error == -1) 280 IB_LOG((CE_CONT, "ib_do_reconfig failed!")); 281 } else { 282 IB_LOG((CE_CONT, "setting new baseline of %d", goodness)); 283 baseline_goodness = goodness; 284 } 285 } 286 287 /* 288 * Wait for timeout or CPU reconfiguration. 289 */ 290 CALLB_CPR_SAFE_BEGIN(&cpr); 291 cv_timedwait(&ib_cv, &ib_lock, ddi_get_lbolt() + 292 SEC_TO_TICK(ib_sleeptime)); 293 CALLB_CPR_SAFE_END(&cpr, &ib_lock); 294 } 295 296 CALLB_CPR_EXIT(&cpr); 297 298 /* 299 * Unregister CPU callback. 300 */ 301 mutex_enter(&cpu_lock); 302 unregister_cpu_setup_func(ib_cpu_setup, NULL); 303 mutex_exit(&cpu_lock); 304 } 305 306 /* 307 * Register a new CPU in the global list of CPUs. 
308 */ 309 static void 310 ib_cpu_register(processorid_t cpu_id) 311 { 312 cpu_t *cp = cpu[cpu_id]; 313 ib_cpu_t *new_cpu; 314 315 new_cpu = kmem_alloc(sizeof (ib_cpu_t), KM_SLEEP); 316 new_cpu->ic_cpu_id = cpu_id; 317 318 /* Initialize list for interrupt vectors */ 319 list_create(&new_cpu->ic_ivec_list, sizeof (ib_ivec_t), 320 offsetof(ib_ivec_t, ii_next)); 321 322 list_link_init(&new_cpu->ic_next); 323 324 /* Check if this CPU can handle interrupts */ 325 mutex_enter(&cpu_lock); 326 if (cpu_is_nointr(cp)) 327 new_cpu->ic_offline = B_TRUE; 328 else 329 new_cpu->ic_offline = B_FALSE; 330 mutex_exit(&cpu_lock); 331 332 /* Add CPU to list of CPUs */ 333 list_insert_tail(&ib_cpu_list, new_cpu); 334 335 ib_num_cpus++; 336 337 IB_IMPLDBG((CE_CONT, "ib_cpu_register: cpu=0x%x", cpu_id)); 338 } 339 340 /* 341 * Unregister CPU from the global list of CPUs. 342 */ 343 static void 344 ib_cpu_unregister(processorid_t cpu_id) 345 { 346 ib_cpu_t *icpu; 347 348 mutex_enter(&ib_lock); 349 FOREACH_CPU(icpu, ib_cpu_list) { 350 if (icpu->ic_cpu_id == cpu_id) { 351 list_remove(&ib_cpu_list, icpu); 352 /* XXX or just offline CPU; statistics? */ 353 break; 354 } 355 } 356 mutex_exit(&ib_lock); 357 358 ib_num_cpus--; 359 360 IB_IMPLDBG((CE_CONT, "ib_cpu_unregister: cpu=0x%x", 361 cpu_id)); 362 } 363 364 /* 365 * Hook for CPU changes. 366 */ 367 static int 368 ib_cpu_setup(cpu_setup_t what, int cpu_id, void *arg) 369 { 370 371 switch (what) { 372 /* XXX */ 373 case CPU_OFF: 374 ib_cpu_unregister(cpu_id); 375 cv_signal(&ib_cv); 376 break; 377 378 case CPU_INTR_ON: 379 ib_cpu_register(cpu_id); 380 cv_signal(&ib_cv); 381 break; 382 383 default: 384 break; 385 } 386 387 return (0); 388 } 389 390 static ib_cpu_t * 391 ib_cpu_create(void) 392 { 393 ib_cpu_t *icpu; 394 395 icpu = kmem_alloc(sizeof (ib_cpu_t), KM_SLEEP); 396 397 return (icpu); 398 } 399 400 /* 401 * Find a CPU in the global list of CPUs by processor id. 
402 */ 403 static ib_cpu_t * 404 ib_cpu_find(list_t cpu_list, processorid_t cpu_id) 405 { 406 ib_cpu_t *icpu; 407 408 IB_APIDBG((CE_CONT, "ib_cpu_find: API cpu = %d", cpu_id)); 409 410 FOREACH_CPU(icpu, cpu_list) { 411 if (icpu->ic_cpu_id == cpu_id) 412 return (icpu); 413 } 414 415 return (NULL); 416 } 417 418 /* 419 * Find a interrupt vector for a specific CPU. 420 */ 421 static ib_ivec_t * 422 ib_cpu_find_ivec(list_t cpu_list, processorid_t cpu_id, char *buspath, uint64_t ino) 423 { 424 ib_cpu_t *icpu; 425 ib_ivec_t *ivec; 426 427 icpu = ib_cpu_find(cpu_list, cpu_id); 428 if (icpu == NULL) 429 return (NULL); 430 431 for (ivec = list_head(&icpu->ic_ivec_list); ivec != NULL; 432 ivec = list_next(&icpu->ic_ivec_list, ivec)) { 433 if (ivec->ii_ino == ino) 434 return (ivec); 435 } 436 437 return (NULL); 438 } 439 440 /* 441 * Total times spend. 442 */ 443 static void 444 ib_cpu_statistics(ib_cpu_t *icpu) 445 { 446 cpu_t *cp; 447 hrtime_t msnsecs[NCMSTATES]; 448 hrtime_t new_tot; 449 450 cp = cpu[icpu->ic_cpu_id]; 451 get_cpu_mstate(cp, msnsecs); 452 453 icpu->ic_tot = msnsecs[CMS_IDLE] + msnsecs[CMS_USER] + 454 msnsecs[CMS_SYSTEM]; 455 456 } 457 458 /* 459 * Create a new interrupt vector. 460 */ 461 static ib_ivec_t * 462 ib_ivec_create(const char *buspath, uint64_t ino) 463 { 464 ib_ivec_t *ivec; 465 466 ivec = (ib_ivec_t *)kmem_alloc(sizeof (ib_ivec_t), KM_SLEEP); 467 468 list_link_init(&ivec->ii_next); 469 470 ivec->ii_buspath = (char *)buspath; /* XXX: strdup */ 471 ivec->ii_ino = ino; 472 ivec->ii_ihs = 1; 473 474 return (ivec); 475 } 476 477 static void 478 intrd_ivec_register(ib_cpu_t *icpu) 479 { 480 } 481 482 /* 483 * Find interrupt vector by ino. 484 */ 485 static ib_ivec_t * 486 ib_ivec_find_ino(list_t ivec_list, uint64_t ino) 487 { 488 ib_ivec_t *ivec; 489 490 FOREACH_IVEC(ivec, ivec_list) { 491 if (ivec->ii_inum == ino) 492 return (ivec); 493 } 494 495 return (NULL); 496 } 497 498 /* 499 * Delete a interrupt vector from a list. 
500 */ 501 static void 502 ib_ivec_delete_ino(list_t ivec_list, uint64_t ino) 503 { 504 ib_ivec_t *ivec; 505 506 FOREACH_IVEC(ivec, ivec_list) { 507 if (ivec->ii_inum == ino) { 508 /* XXX: remove from list */ 509 ; 510 } 511 } 512 } 513 514 /* 515 * Add a new interrupt vector to a list. 516 */ 517 static void 518 ib_ivec_add_ino(list_t ivec_list, ib_ivec_t *ivec) 519 { 520 list_insert_tail(&ivec_list, ivec); 521 } 522 523 static ib_msi_t * 524 ib_msi_create(const char *name) 525 { 526 ib_msi_t *msi; 527 528 msi = (ib_msi_t *)kmem_alloc(sizeof (ib_msi_t), KM_SLEEP); 529 530 msi->im_name = name; 531 532 list_link_init(&msi->im_next); 533 list_create(&msi->im_ino_list, sizeof (ib_msi_ino_t), 534 offsetof(ib_msi_ino_t, imi_next)); 535 536 return (msi); 537 } 538 539 /* 540 * Allocate and initialize a new snapshot structure. 541 */ 542 static ib_snapshot_t * 543 ib_snapshot_create(void) 544 { 545 ib_snapshot_t *snapshot; 546 547 snapshot = kmem_alloc(sizeof (ib_snapshot_t), KM_SLEEP); 548 549 /* init link */ 550 551 /* Initialize CPU list */ 552 list_create(&snapshot->is_cpu_list, sizeof (ib_cpu_t), 553 offsetof(ib_cpu_t, ic_next)); 554 555 snapshot->is_num_cpus = 0; 556 557 return (snapshot); 558 } 559 560 static ib_ivec_t * 561 ib_irq_fill_ivec(kstat_t *ksp) 562 { 563 kstat_named_t *knp; 564 ib_ivec_t *ivec; 565 char *datap; 566 uint64_t time; 567 int i; 568 569 datap = ksp->ks_data; 570 knp = KSTAT_NAMED_PTR(ksp); 571 for (i = 0; i < ksp->ks_ndata; i++, knp++) { 572 IB_IMPLDBG((CE_CONT, "ib_irq_fill_ivec: %s", 573 knp->name)); 574 575 if (strcmp(knp->name, "time") == 0) { 576 cmn_err(CE_CONT, "XXX ib time"); 577 time = knp->value.ui64; 578 } 579 580 knp += sizeof (kstat_named_t); 581 datap += sizeof (kstat_named_t); 582 } 583 584 /* Allocate a new interrupt vector */ 585 ivec = ib_ivec_create("", 0); 586 ivec->ii_time = time; 587 588 return (ivec); 589 } 590 591 /* 592 * XXX: icpu not needed, move out of loop 593 */ 594 static void 595 ib_irq_statistics(ib_cpu_t 
*icpu) 596 { 597 kstat_t *ksp; 598 int instance = 1; 599 600 /* 601 * Read pci interrupts. 602 */ 603 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES); 604 while (ksp != NULL) { 605 KSTAT_ENTER(ksp); 606 607 if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) { 608 ib_cpu_t *icpu; 609 ib_ivec_t *ivec; 610 kstat_named_t *knp; 611 kstat_named_t *datap; 612 uint64_t ino; 613 char *buspath; 614 char *namep; 615 processorid_t cpu_id; 616 int i; 617 boolean_t is_enabled = B_TRUE; 618 619 (void) KSTAT_UPDATE(ksp, KSTAT_READ); 620 621 /* 622 * Find the CPU this interrupt vector is on and 623 * if the vector itself is enabled. 624 */ 625 datap = ksp->ks_data; 626 namep = KSTAT_NAMED_PTR(ksp)->name; 627 for (i = 0; i < ksp->ks_ndata; i++) { 628 if (strcmp(namep, "cpu") == 0) { 629 cpu_id = datap->value.ui64; 630 } else if (strcmp(namep, "type") == 0) { 631 if (strcmp(datap->value.c, "disabled") == 0) { 632 is_enabled = B_FALSE; 633 break; 634 } 635 } 636 637 namep += sizeof (kstat_named_t); 638 datap += sizeof (kstat_named_t); 639 } 640 641 /* 642 * Skip this interrupt vector if its disabled. 643 */ 644 if (!is_enabled) 645 continue; 646 647 /* 648 * Check if CPU is online. 649 */ 650 icpu = ib_cpu_find(ib_cpu_list, cpu_id); 651 if (icpu == NULL || icpu->ic_offline) 652 continue; 653 654 /* 655 * Fill information. 656 */ 657 ivec = ib_irq_fill_ivec(ksp); 658 if (ivec == NULL) 659 continue; 660 661 list_insert_tail(&icpu->ic_ivec_list, ivec); 662 } 663 664 KSTAT_EXIT(ksp); 665 kstat_rele(ksp); 666 667 instance++; 668 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES); 669 } 670 } 671 672 /* 673 * Collect data from CPUs and interrupt vectors. 674 */ 675 static ib_snapshot_t * 676 ib_get_statistics(void) 677 { 678 ib_cpu_t *os_cpu; 679 ib_snapshot_t *snapshot; 680 ib_cpu_t *snapshot_cpu; 681 682 /* 683 * Nothing to balance with one CPU. XXX: right place? 
684 */ 685 if (ib_num_cpus <= 1) { 686 ib_sleeptime = IB_ONECPU_SLEEPTIME; 687 return (NULL); 688 } 689 690 /* 691 * Store all CPUs and ivecs here. 692 */ 693 snapshot = ib_snapshot_create(); 694 695 /* 696 * Loop over all active CPUs 697 */ 698 FOREACH_CPU(os_cpu, ib_cpu_list) { 699 700 snapshot->is_num_cpus++; 701 702 snapshot_cpu = ib_cpu_create(); 703 snapshot_cpu->ic_cpu_id = os_cpu->ic_cpu_id; 704 705 list_insert_tail(&snapshot->is_cpu_list, snapshot_cpu); 706 707 ib_cpu_statistics(snapshot_cpu); 708 ib_irq_statistics(os_cpu); 709 } 710 711 return (snapshot); 712 } 713 714 static ib_delta_t * 715 ib_delta_create(void) 716 { 717 ib_delta_t *delta; 718 719 delta = kmem_alloc(sizeof (ib_delta_t), KM_SLEEP); 720 delta->id_missing = B_FALSE; 721 722 list_create(&delta->id_cpu_list, sizeof (ib_cpu_t), 723 offsetof(ib_cpu_t, ic_next)); 724 725 return (delta); 726 } 727 728 /* 729 * Generate the delta of two snapshots. 730 */ 731 static ib_delta_t * 732 ib_delta_generate(ib_snapshot_t *old_snapshot, ib_snapshot_t *new_snapshot) 733 { 734 ib_cpu_t *old_cpu, *new_cpu; 735 ib_delta_t *delta; 736 int intrload = 0; 737 int intrnsec = 0; 738 processorid_t cpus = 0; 739 740 /* 741 * Allocate a new delta structure. 742 */ 743 delta = ib_delta_create(); 744 745 /* 746 * Number of CPUs must be the same. 747 */ 748 delta->id_missing = old_snapshot->is_num_cpus != 749 new_snapshot->is_num_cpus; 750 751 if (delta->id_missing != 0) { 752 IB_LOG((CE_CONT, "ib_delta_generate: number of CPUs changed")); 753 return (delta); 754 } 755 756 /* 757 * Loop over the CPUs in both snapshots. 758 */ 759 for (new_cpu = list_head(&new_snapshot->is_cpu_list), 760 old_cpu = list_head(&old_snapshot->is_cpu_list); 761 new_cpu != NULL && old_cpu != NULL; 762 new_cpu = list_next(&new_snapshot->is_cpu_list, new_cpu), 763 old_cpu = list_next(&old_snapshot->is_cpu_list, old_cpu)) { 764 ib_cpu_t *delta_cpu; 765 ib_ivec_t *new_ivec; 766 767 /* XXX: just onlined CPU? 
*/ 768 769 /* Allocate a new CPU structure */ 770 delta_cpu = ib_cpu_create(); 771 772 /* Difference of total time */ 773 delta_cpu->ic_tot = new_cpu->ic_tot - old_cpu->ic_tot; 774 if (!(delta_cpu->ic_tot >= 0)) { 775 delta->id_missing = B_TRUE; 776 kmem_free(delta_cpu, sizeof (ib_cpu_t)); 777 return (delta); 778 } 779 780 list_insert_tail(&delta->id_cpu_list, delta_cpu); 781 782 /* Avoid division by zero */ 783 if (delta_cpu->ic_tot == 0) 784 delta_cpu->ic_tot = 1; 785 786 delta_cpu->ic_intrs = 0; 787 delta_cpu->ic_big_intrs = 0; 788 789 /* 790 * Number of interrupt vectors must be the same. 791 */ 792 if (old_cpu->ic_num_ivecs != new_cpu->ic_num_ivecs) { 793 IB_LOG((CE_CONT, "ib_delta_generate: cpu %d has more " 794 "or less interrupts", old_cpu->ic_cpu_id)); 795 delta->id_missing = B_TRUE; 796 return (delta); 797 } 798 799 /* 800 * Loop over the interrupt vectors of the new CPU. 801 */ 802 for (new_ivec = list_head(&new_cpu->ic_ivec_list); 803 new_ivec != NULL; new_ivec = 804 list_next(&new_cpu->ic_ivec_list, new_ivec)) { 805 ib_ivec_t *ivec; 806 ib_ivec_t *delta_ivec; 807 hrtime_t time; 808 809 if (new_ivec->ii_num_ino == 0) 810 continue; 811 812 /* 813 * If interrupt vector does not exists or XXX crtime 814 * is different, set missing. 815 */ 816 ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list, 817 new_ivec->ii_ino); 818 if (ivec == NULL) { 819 delta->id_missing = B_TRUE; 820 return (delta); 821 } 822 823 /* Allocate a new delta interrupt vector */ 824 delta_ivec = ib_ivec_create(new_ivec->ii_buspath, 825 new_ivec->ii_ino); 826 827 /* 828 * Time used by this interrupt. 829 */ 830 time = new_ivec->ii_time - ivec->ii_time; 831 if (time < 0) { 832 delta->id_missing = B_TRUE; 833 kmem_free(delta_ivec, sizeof (ib_delta_t)); 834 return (delta); 835 } 836 837 delta_cpu->ic_intrs += time; 838 delta_ivec->ii_time = time; 839 840 if (time > delta_cpu->ic_bigintr) 841 delta_cpu->ic_bigintr = time; 842 843 /* 844 * Fill in the rest. 
845 */ 846 delta_ivec->ii_ihs = new_ivec->ii_ihs; 847 delta_ivec->ii_pil = new_ivec->ii_pil; 848 delta_ivec->ii_ino = new_ivec->ii_ino; 849 delta_ivec->ii_num_ino = new_ivec->ii_num_ino; 850 /* XXX: buspath, name */ 851 } 852 853 /* 854 * Rounding error 855 */ 856 if (delta_cpu->ic_tot < delta_cpu->ic_intrs) 857 delta_cpu->ic_tot = delta_cpu->ic_intrs; 858 859 delta_cpu->ic_intr_load = 860 delta_cpu->ic_intrs / delta_cpu->ic_tot; 861 intrload += delta_cpu->ic_intr_load; 862 intrnsec += delta_cpu->ic_intrs; 863 864 cpus++; 865 } 866 867 if (cpus > 0) { 868 delta->id_avgintrload = intrload / cpus; 869 delta->id_avgintrnsec = intrnsec / cpus; 870 } else { 871 delta->id_avgintrload = 0; 872 delta->id_avgintrnsec = 0; 873 } 874 875 return (delta); 876 } 877 878 /* 879 * Compress deltas. 880 */ 881 static ib_delta_t * 882 ib_delta_compress(list_t *deltas) 883 { 884 ib_cpu_t *icpu; 885 ib_ivec_t *ivec; 886 ib_delta_t *new_delta, *delta; 887 processorid_t cpus = 0; 888 int high_intrload = 0; 889 int intrs = 0, tot; 890 891 /* Check if empty list of deltas */ 892 if (deltas == NULL || list_is_empty(deltas) != 0) { 893 IB_LOG((CE_CONT, "ib_delta_compress: deltas are empty?")); 894 return (NULL); 895 } 896 897 /* Allocate a new delta structure */ 898 new_delta = ib_delta_create(); 899 900 /* 901 * Loop over the deltas in the list. 902 */ 903 for (delta = list_head(deltas); delta != NULL; 904 delta = list_next(deltas, delta)) { 905 906 /* Compressing bad delta? 
*/ 907 if (delta->id_missing) { 908 IB_LOG((CE_CONT, 909 "ib_delta_compress: compressing bad deltas?")); 910 return (NULL); 911 } 912 913 FOREACH_CPU(icpu, delta->id_cpu_list) { 914 ib_cpu_t *new_cpu; 915 ib_ivec_t *new_ivec; 916 917 intrs += icpu->ic_intrs; 918 tot += icpu->ic_tot; 919 new_cpu = ib_cpu_create(); 920 new_cpu->ic_cpu_id = icpu->ic_cpu_id; 921 new_cpu->ic_intrs = icpu->ic_intrs; 922 new_cpu->ic_tot = icpu->ic_tot; 923 924 /* XXX: exists ivecs */ 925 FOREACH_IVEC(new_ivec, icpu->ic_ivec_list) { 926 ib_ivec_t *new_delta_ivec; 927 928 new_delta_ivec = ib_ivec_create( 929 new_ivec->ii_buspath, new_ivec->ii_ino); 930 931 } 932 } 933 } 934 935 FOREACH_CPU(icpu, new_delta->id_cpu_list) { 936 int bigintr = 0; 937 938 cpus++; 939 940 FOREACH_IVEC(ivec, icpu->ic_ivec_list) { 941 if (ivec->ii_time > bigintr) 942 bigintr = ivec->ii_time; 943 } 944 945 icpu->ic_bigintr = bigintr; 946 icpu->ic_intr_load = icpu->ic_intrs / icpu->ic_tot; 947 948 if (high_intrload < icpu->ic_intr_load) 949 high_intrload = icpu->ic_intr_load; 950 951 if (icpu->ic_tot <= 0) 952 icpu->ic_tot = 100; 953 } 954 955 if (cpus > 0) { 956 new_delta->id_avgintrload = intrs / tot; 957 new_delta->id_avgintrnsec = intrs / cpus; 958 } else { 959 new_delta->id_avgintrload = 0; 960 new_delta->id_avgintrnsec = 0; 961 } 962 963 /* XXX: global sleeptime */ 964 965 return (new_delta); 966 } 967 968 /* 969 * Decide if the load is out of balance. 970 */ 971 static int 972 ib_imbalanced(int goodness, int baseline) 973 { 974 if (goodness > 50) 975 return (100); 976 977 /* XXX: abs */ 978 if ((goodness - baseline) > goodness_mindelta) 979 return (100); 980 981 return (0); 982 } 983 984 /* 985 * Calculate goodness of a CPU. 
986 */ 987 static int 988 ib_goodness_cpu(ib_cpu_t *icpu, int avg_interrupt_load) 989 { 990 int goodness; 991 int load, load_no_bigintr; 992 993 load = icpu->ic_intrs / icpu->ic_tot; 994 if (load < avg_interrupt_load) 995 return (0); 996 997 load_no_bigintr = (icpu->ic_intrs - icpu->ic_bigintr) / icpu->ic_tot; 998 999 if ((load > goodness_unsafe_load) && (icpu->ic_num_ivecs > 1)) 1000 return (1); 1001 1002 goodness = load - avg_interrupt_load; 1003 if (goodness > load_no_bigintr) 1004 goodness = load_no_bigintr; 1005 1006 return (goodness); 1007 } 1008 1009 /* 1010 * Calculate goodness. 1011 */ 1012 static int 1013 ib_goodness(ib_delta_t *delta) 1014 { 1015 ib_cpu_t *icpu; 1016 int goodness, high_goodness = 0; 1017 1018 if (delta->id_missing > 0) 1019 return (1); 1020 1021 FOREACH_CPU(icpu, delta->id_cpu_list) { 1022 goodness = ib_goodness_cpu(icpu, delta->id_avgintrload); 1023 if (!(goodness >= 0 && goodness <= 100)) { 1024 IB_LOG((CE_CONT, 1025 "ib_goodness: cpu goodness out of range?")); 1026 return (100); 1027 } 1028 1029 if (goodness == 100) 1030 return (100); 1031 1032 if (goodness > high_goodness) 1033 high_goodness = goodness; 1034 } 1035 1036 return (high_goodness); 1037 } 1038 1039 static void 1040 ib_do_find_goal(list_t ivecs, list_t loads, int goal, int idx) 1041 { 1042 list_t goals_with; 1043 list_t goals_without; 1044 int with, without; 1045 int which, load; 1046 1047 1048 if (goal <= load) { 1049 with = load; 1050 } else { 1051 /* XXX: do_find_goal */ 1052 with += load; 1053 } 1054 1055 IB_LOG((CE_CONT, "XXX")); 1056 1057 if (with >= goal && without < goal) { 1058 which = 0; 1059 } else if (with < goal && without >= goal) { 1060 which = 1; 1061 } else if (with >= goal && without >= goal) { 1062 which = without < with; 1063 } else { 1064 which = without > with; 1065 } 1066 1067 if (which == 1) { 1068 IB_LOG((CE_CONT, "ib_do_find_goal: going without")); 1069 /* XXX */ 1070 } else { 1071 IB_LOG((CE_CONT, "ib_do_find_goal: going with")); 1072 /* XXX */ 
1073 } 1074 } 1075 1076 typedef struct _ib_goal { 1077 list_node_t *ig_link; 1078 int ig_value; 1079 } ib_goal_t; 1080 1081 typedef struct _ib_goal_load { 1082 list_node_t *igl_link; 1083 int igl_value; 1084 } ib_goal_load_t; 1085 1086 static void 1087 ib_find_goal(list_t ivecs, int goal) 1088 { 1089 ib_ivec_t *ivec; 1090 list_t goals; 1091 int load; 1092 1093 if (goal <= 0) { 1094 list_create(&goals, sizeof (ib_goal_t), 1095 offsetof (ib_goal_t, ig_link)); 1096 } else { 1097 list_t loads; 1098 hrtime_t tot = 0; 1099 1100 IB_LOG((CE_CONT, "ib_find_goal: finding goal from intrs XXX")); 1101 1102 FOREACH_IVEC(ivec, ivecs) { 1103 tot += ivec->ii_time; 1104 } 1105 1106 list_create(&loads, sizeof (ib_goal_load_t), 1107 offsetof (ib_goal_load_t, igl_link)); 1108 1109 FOREACH_IVEC(ivec, ivecs) { 1110 ib_goal_load_t *igl = kmem_alloc(sizeof (ib_goal_load_t), KM_SLEEP); 1111 1112 igl->igl_value = tot; 1113 list_insert_tail(&loads, igl); 1114 1115 tot -= ivec->ii_time; 1116 } 1117 } 1118 } 1119 1120 static void 1121 ib_do_reconfig_cpu2cpu(ib_delta_t *delta, processorid_t src_cpuid, 1122 processorid_t tgt_cpuid, int src_load) 1123 { 1124 ib_cpu_t *src_cpu, *tgt_cpu; 1125 ib_ivec_t *ivec; 1126 list_t ivecs; 1127 int goal; 1128 int avg_nsec; 1129 1130 if (delta == NULL) 1131 return; 1132 1133 goal = delta->id_avgintrnsec; 1134 1135 src_cpu = ib_cpu_find(delta->id_cpu_list, src_cpuid); 1136 if (src_cpu == NULL) 1137 return; 1138 1139 tgt_cpu = ib_cpu_find(delta->id_cpu_list, tgt_cpuid); 1140 if (tgt_cpu == NULL) 1141 return; 1142 1143 avg_nsec = (src_cpu->ic_intrs + tgt_cpu->ic_intrs) / 2; 1144 if (goal < avg_nsec) 1145 goal = avg_nsec; 1146 1147 1148 /* 1149 * Sort interrupt vectors by time. 
1150 */ 1151 list_create(&ivecs, sizeof (ib_ivec_t), 1152 offsetof (ib_ivec_t, ii_next)); 1153 1154 ivec = list_head(&ivecs); 1155 if (ivec->ii_orig_cpu == src_cpuid) { 1156 IB_LOG((CE_CONT, "Keeping XXX on %d", 1157 src_cpuid)); /* ivec->ii_inum, */ 1158 goal -= ivec->ii_time; 1159 /* XXX: shift */ 1160 } 1161 1162 IB_LOG((CE_CONT, "ib_reconfig_cpu2cpu: inums should total %d", goal)); 1163 1164 ib_find_goal(ivecs, goal); 1165 } 1166 1167 static void 1168 ib_do_reconfig_cpu(ib_delta_t *delta, list_t *cpu_sorted_list, 1169 processorid_t old_cpu_id) 1170 { 1171 ib_cpu_t *icpu; 1172 int avgintrload; 1173 1174 if (delta == NULL) 1175 return; 1176 1177 icpu = ib_cpu_find(delta->id_cpu_list, old_cpu_id); 1178 if (icpu == NULL) 1179 return; 1180 1181 avgintrload = delta->id_avgintrload; 1182 1183 } 1184 1185 /* 1186 * Reconfigure interrupt distribution among CPUs. 1187 */ 1188 static int 1189 ib_do_reconfig(ib_delta_t *delta) 1190 { 1191 ib_cpu_t *icpu; 1192 ib_ivec_t *ivec; 1193 list_t cpu_sorted_list; 1194 int goodness, new_goodness; 1195 int warned = 0; 1196 int rval = 1, ret = 1; 1197 1198 if (delta == NULL) 1199 return (-1); 1200 1201 goodness = delta->id_goodness; 1202 if (goodness < goodness_mindelta) { 1203 IB_LOG((CE_CONT, "ib_do_reconfig: goodness is good enough")); 1204 return (0); 1205 } 1206 1207 IB_LOG((CE_CONT, "ib_do_reconfig: optimizing interrupt assignments")); 1208 1209 if (delta->id_missing != 0) { 1210 IB_LOG((CE_CONT, "ib_do_reconfig: aborted")); 1211 return (-1); 1212 } 1213 1214 FOREACH_CPU(icpu, delta->id_cpu_list) { 1215 FOREACH_IVEC(ivec, icpu->ic_ivec_list) { 1216 ivec->ii_orig_cpu = icpu->ic_cpu_id; 1217 ivec->ii_now_cpu = icpu->ic_cpu_id; 1218 /* XXX: inum */ 1219 } 1220 } 1221 1222 list_create(&cpu_sorted_list, sizeof (ib_cpu_t), 1223 offsetof(ib_cpu_t, ic_next)); 1224 1225 /* 1226 * Have we an improvement? 
1227 */ 1228 new_goodness = ib_goodness(delta); 1229 if (!(new_goodness <= goodness)) { 1230 IB_LOG((CE_CONT, 1231 "ib_do_reconfig: result has worse goodness")); 1232 } 1233 1234 if ((goodness != 100 || new_goodness == 100) && 1235 goodness - new_goodness < goodness_mindelta) { 1236 IB_LOG((CE_CONT, 1237 "ib_do_reconfig: goodness already near optimum")); 1238 return (0); 1239 } 1240 1241 /* 1242 * Move interrupts. 1243 */ 1244 1245 FOREACH_CPU(icpu, delta->id_cpu_list) { 1246 FOREACH_IVEC(ivec, icpu->ic_ivec_list) { 1247 int error; 1248 1249 if (ivec->ii_orig_cpu == icpu->ic_cpu_id) 1250 continue; 1251 1252 error = ib_interrupt_do_move(ivec, icpu->ic_cpu_id); 1253 if (error != 0) { 1254 if (warned++ == 0) { 1255 IB_LOG((CE_CONT, "ib_do_reconfig: " 1256 "unable to move interrupt")); 1257 } 1258 1259 IB_LOG((CE_CONT, "ib_do_reconfig: " 1260 "unable to move buspath")); 1261 1262 ret = -1; 1263 } 1264 } 1265 } 1266 1267 return (rval); 1268 } 1269 1270 1271 /* 1272 * Check if the interrupt load did decrease. 1273 */ 1274 static void 1275 ib_interrupt_move_check(ib_delta_t *delta, processorid_t old_cpuid, 1276 processorid_t new_cpuid) 1277 { 1278 ib_cpu_t *old_cpu, *new_cpu; 1279 1280 /* 1281 * Check old CPU. 1282 */ 1283 old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid); 1284 if (old_cpu == NULL) 1285 return; 1286 if (!(old_cpu->ic_tot >= old_cpu->ic_intrs)) { 1287 IB_LOG((CE_CONT, 1288 "Moved interrupts left 100+%% load on source CPU")); 1289 } 1290 1291 /* 1292 * Check new CPU. 1293 */ 1294 new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid); 1295 if (new_cpu == NULL) 1296 return; 1297 if (!(new_cpu->ic_tot >= new_cpu->ic_intrs)) { 1298 IB_LOG((CE_CONT, 1299 "Moved interrupts left 100+%% load on target CPU")); 1300 } 1301 } 1302 1303 /* 1304 * Actually moving the interrupt. 
1305 */ 1306 static int 1307 ib_interrupt_do_move(ib_ivec_t *ivec, processorid_t cpu_id) 1308 { 1309 int ret, result; 1310 1311 struct psm_ops *pops; 1312 1313 //pops = mach_set[0]; 1314 1315 // ret = (*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_SET_CPU, 1316 // &result); 1317 1318 return (-1); 1319 } 1320 1321 /* 1322 * Move an interrupt to a different CPU. 1323 */ 1324 static int 1325 ib_interrupt_move(ib_delta_t *delta, uint64_t inum, processorid_t old_cpuid, 1326 processorid_t new_cpuid) 1327 { 1328 ib_cpu_t *old_cpu, *new_cpu; 1329 ib_ivec_t *ivec; 1330 1331 if (delta == NULL) 1332 return (-1); 1333 1334 /* 1335 * Remove interrupt vector from old CPU. 1336 */ 1337 old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid); 1338 if (old_cpu == NULL) 1339 return (-1); 1340 1341 ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list, inum); 1342 1343 old_cpu->ic_intrs -= ivec->ii_time; 1344 old_cpu->ic_intr_load = old_cpu->ic_intrs / old_cpu->ic_tot; 1345 ib_ivec_delete_ino(old_cpu->ic_ivec_list, inum); 1346 1347 /* 1348 * Verify interrupts. 1349 */ 1350 if (!(old_cpu->ic_intrs >= 0)) { 1351 IB_LOG((CE_CONT, 1352 "ib_interrupt_move: interrupt time > total time?")); 1353 } 1354 1355 if (!(ivec->ii_time <= old_cpu->ic_bigintr)) { 1356 IB_LOG((CE_CONT, 1357 "ib_interrupt_move: interrupt time > big interrupt?")); 1358 } 1359 1360 if (ivec->ii_time >= old_cpu->ic_bigintr) { 1361 ib_ivec_t *time_ivec; 1362 uint64_t bigtime = 0; 1363 1364 FOREACH_IVEC(time_ivec, old_cpu->ic_ivec_list) { 1365 if (time_ivec->ii_time > bigtime) 1366 bigtime = time_ivec->ii_time; 1367 } 1368 } 1369 1370 1371 /* 1372 * Insert interrupt vector into new CPU. 
1373 */ 1374 new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid); 1375 if (new_cpu == NULL) 1376 return (-1); 1377 1378 ivec->ii_now_cpu = new_cpuid; 1379 new_cpu->ic_intrs += ivec->ii_time; 1380 new_cpu->ic_intr_load = new_cpu->ic_intrs / new_cpu->ic_tot; 1381 ib_ivec_add_ino(new_cpu->ic_ivec_list, ivec); 1382 1383 if (ivec->ii_time > new_cpu->ic_bigintr) 1384 new_cpu->ic_bigintr = ivec->ii_time; 1385 1386 return (0); 1387 }