1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2013 David Hoeppner.  All rights reserved.
  14  */
  15 
  16 /*
  17  * Interrupt Load Balancer.
  18  *
 * The original balance function treats all CPUs as equal.
  20  */
  21 
  22 /* XXX
  23  *
  24  * ib_cpu_list::walk list |::print ib_cpu_t
  25  *
  26  *
  27  *
  28  */
  29 #include <sys/param.h>
  30 #include <sys/types.h>
  31 #include <sys/systm.h>
  32 #include <sys/callb.h>
  33 #include <sys/cpuvar.h>
  34 #include <sys/proc.h>
  35 #include <sys/processor.h>
  36 #include <sys/sdt.h>
  37 #include <sys/sysmacros.h>
  38 #include <sys/time.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/zone.h>
  41 #include <sys/lgrp.h>
  42 
  43 #include <sys/pci_tools.h>
  44 
  45 extern  proc_t  *proc_intrd;
  46 
  47 #define IB_NAME                 "intrd"
  48 
  49 #define IS_CPU(cpu_id)          (cpu[cpu_id] != NULL)
  50 
  51 #define IB_NORMAL_SLEEPTIME     10
  52 #define IB_IDLE_SLEEPTIME       45
  53 #define IB_ONECPU_SLEEPTIME     (60 * 15)
  54 
  55 #define IB_NUM_SAMPLES          6
  56 
  57 
  58 static kmutex_t         ib_lock;
  59 static kcondvar_t       ib_cv;
  60 
  61 /*
  62  * Interrupt CPU instance.
  63  */
  64 typedef struct _ib_cpu {
  65         list_node_t     ic_next;
  66         boolean_t       ic_offline;
  67 
  68         hrtime_t        ic_tot;
  69         list_t          ic_ivec_list;
  70         uint32_t        ic_num_ivecs;
  71         processorid_t   ic_cpu_id;      /* XXX duplicate */
  72         int64_t         ic_intrs;
  73         int64_t         ic_big_intrs;
  74         int64_t         ic_bigintr;     /* XXX bitintrs */
  75 
  76         int             ic_intr_load;   /* intrs / tot */
  77 } ib_cpu_t;
  78 
  79 /*
  80  * Interrupt vector instance.
  81  */
  82 typedef struct _ib_ivec {
  83         list_node_t     ii_next;        /* link */
  84 
  85         uint64_t        ii_ihs;
  86         uint64_t        ii_ino;
  87         uint64_t        ii_num_ino;
  88         uint64_t        ii_pil;
  89         uint64_t        ii_time;
  90         char            *ii_buspath;
  91         char            *ii_name;
  92 
  93         processorid_t   ii_orig_cpu;
  94         processorid_t   ii_now_cpu;
  95         uint64_t        ii_inum;
  96 } ib_ivec_t;
  97 
  98 /*
  99  * MSI
 100  */
 101 typedef struct _ib_msi {
 102         list_node_t     im_next;        /* link */
 103         const char      *im_name;
 104         list_t          im_ino_list;
 105 } ib_msi_t;
 106 
 107 typedef struct _ib_msi_ino {
 108         list_node_t     imi_next;       /* link */
 109         uint64_t        imi_ino;
 110         ib_ivec_t       *imi_ivec;
 111 } ib_msi_ino_t;
 112 
 113 /*
 114  * Snapshot
 115  */
 116 typedef struct _ib_snapshot {
 117         list_node_t     is_next;        /* link */
 118         list_t          is_cpu_list;
 119         processorid_t   is_num_cpus;
 120 } ib_snapshot_t;
 121 
 122 /*
 123  * Snapshot delta structure.
 124  */
 125 typedef struct _ib_delta {
 126         list_node_t     id_next;        /* link */
 127         list_t          id_cpu_list;
 128         boolean_t       id_missing;
 129         int             id_avgintrload; /* interrupts / total time */
 130         uint64_t        id_avgintrnsec;
 131         int             id_goodness;
 132 } ib_delta_t;
 133 
 134 
 135 static list_t   ib_cpu_list;            /* List of all CPU's */
 136 
 137 static uint8_t  ib_cs = 0;              /* Index of current sample  */
 138 static long     ib_sleeptime = IB_NORMAL_SLEEPTIME;
 139 static processorid_t    ib_num_cpus;
 140 
 141 static int      goodness_unsafe_load = 90;
 142 static int      goodness_mindelta = 10;
 143 
 144 /*
 145  * Function prototypes.
 146  */
 147 static  void    ib_cpu_register(processorid_t);
 148 static  int     ib_cpu_setup(cpu_setup_t, int, void *);
 149 static  int     ib_goodness(ib_delta_t *);
 150 static  int     ib_do_reconfig(ib_delta_t *);
 151 static  int     ib_imbalanced(int, int);
 152 static  int     ib_interrupt_do_move(ib_ivec_t *, processorid_t);
 153 
 154 static  ib_snapshot_t   *ib_get_statistics(void);
 155 static  ib_delta_t      *ib_delta_generate(ib_snapshot_t *, ib_snapshot_t *);
 156 
 157 /*
 158  * Helper macros.
 159  */
 160 #define FOREACH_CPU(icpu, icpu_list)                            \
 161         for (icpu = list_head(&icpu_list); icpu != NULL;    \
 162             icpu = list_next(&icpu_list, icpu))
 163 
 164 #define FOREACH_IVEC(ivec, ivec_list)                           \
 165         for (ivec = list_head(&ivec_list); ivec != NULL;    \
 166             ivec = list_next(&ivec_list, ivec))
 167 
 168 #define DTRACE_INTRD(name)      \
 169         DTRACE_PROBE(__intrd_##name)
 170 
 171 #define DEBUG   1
 172 #ifdef  DEBUG
 173 #define IB_APIDBG(args) cmn_err args
 174 #define IB_IMPLDBG(args)        cmn_err args
 175 #else
 176 #define IB_APIDBG(args)
 177 #define IB_IMPLDBG(args)
 178 #endif
 179 
 180 #define IB_LOG(args)    cmn_err args
 181 
 182 void
 183 interrupt_balancer(void)
 184 {
 185         processorid_t   cpu_id;
 186         callb_cpr_t     cpr;
 187         user_t          *u = PTOU(curproc);
 188         int             error;
 189 
 190         boolean_t       do_reconfig = B_FALSE;
 191         int             goodness;
 192         int             baseline_goodness = 0;
 193         list_t          ib_delta_list;
 194         hrtime_t        statslen = 60;
 195 
 196         proc_intrd = ttoproc(curthread);
 197         proc_intrd->p_cstime = proc_intrd->p_stime = 0;
 198         proc_intrd->p_cutime = proc_intrd->p_utime = 0;
 199 
 200         (void) strncpy(u->u_psargs, IB_NAME, sizeof(u->u_psargs));
 201         (void) strncpy(u->u_comm, IB_NAME, sizeof(u->u_comm));
 202 
 203         /* Initialize global mutex lock */
 204         mutex_init(&ib_lock, NULL, MUTEX_DEFAULT, NULL);
 205 
 206         /* Initialize CPU list */
 207         list_create(&ib_cpu_list, sizeof (ib_cpu_t),
 208             offsetof(ib_cpu_t, ic_next));
 209 
 210         /* Initialize delta list */
 211         list_create(&ib_delta_list, sizeof (ib_delta_t),
 212             offsetof(ib_delta_t, id_next));
 213 
 214         /*
 215          * Build a list of all CPUs available for interrupt handling.
 216          */
 217         for (cpu_id = 0; cpu_id <= max_cpu_seqid_ever; cpu_id++) {
 218                 if (IS_CPU(cpu_id))
 219                         ib_cpu_register(cpu_id);
 220         }
 221 
 222         /*
 223          * Locality group information.
 224          */
 225         int     i;
 226         for (i = 0; i < lgrp_plat_max_lgrps(); i++) {
 227                 lgrp_t  *lgrp;
 228 
 229                 lgrp = lgrp_table[i];
 230         }
 231 
 232         /*
 233          * Register a callback if a CPU goes offline or comes online.
 234          */
 235         mutex_enter(&cpu_lock);
 236         register_cpu_setup_func(ib_cpu_setup, NULL);
 237         mutex_exit(&cpu_lock);
 238 
 239         CALLB_CPR_INIT(&cpr, &ib_lock, callb_generic_cpr, IB_NAME);
 240 
 241         ib_snapshot_t   *snapshot = NULL;
 242         ib_snapshot_t   *new_snapshot = NULL;
 243         hrtime_t        delta_time;
 244         hrtime_t        deltas_tottime = 0;
 245         boolean_t       below_statslen;
 246 
 247         snapshot = ib_get_statistics();
 248 
 249         mutex_enter(&ib_lock);
 250         for (;;) {
 251                 ib_delta_t      *delta;
 252 
 253                 DTRACE_INTRD(get_stats);
 254                 new_snapshot = ib_get_statistics();
 255 
 256                 delta = ib_delta_generate(snapshot, new_snapshot);
 257 
 258                 below_statslen = (deltas_tottime < statslen);
 259                 deltas_tottime += delta_time;
 260                 do_reconfig = (below_statslen && deltas_tottime >= statslen);
 261 
 262                 list_insert_tail(&ib_delta_list, delta);
 263 
 264                 /*
 265                  * Calculate the goodness of the current configuration.
 266                  */
 267                 goodness = ib_goodness(delta);
 268 
 269                 if (ib_imbalanced(goodness, baseline_goodness))
 270                         do_reconfig = B_TRUE;
 271 
 272                 /*
 273                  * Reconfigure interrupt distribution.
 274                  */
 275                 if (do_reconfig) {
 276                         error = ib_do_reconfig(delta);
 277 
 278                         if (error != 0) {
 279                                 if (error == -1)
 280                                         IB_LOG((CE_CONT, "ib_do_reconfig failed!"));
 281                         } else {
 282                                 IB_LOG((CE_CONT, "setting new baseline of %d", goodness));
 283                                 baseline_goodness = goodness;
 284                         }
 285                 }
 286 
 287                 /*
 288                  * Wait for timeout or CPU reconfiguration.
 289                  */
 290                 CALLB_CPR_SAFE_BEGIN(&cpr);
 291                 cv_timedwait(&ib_cv, &ib_lock, ddi_get_lbolt() +
 292                     SEC_TO_TICK(ib_sleeptime));
 293                 CALLB_CPR_SAFE_END(&cpr, &ib_lock);
 294         }
 295 
 296         CALLB_CPR_EXIT(&cpr);
 297 
 298         /*
 299          * Unregister CPU callback.
 300          */
 301         mutex_enter(&cpu_lock);
 302         unregister_cpu_setup_func(ib_cpu_setup, NULL);
 303         mutex_exit(&cpu_lock);
 304 }
 305 
 306 /*
 307  * Register a new CPU in the global list of CPUs.
 308  */
 309 static void
 310 ib_cpu_register(processorid_t cpu_id)
 311 {
 312         cpu_t           *cp = cpu[cpu_id];
 313         ib_cpu_t        *new_cpu;
 314 
 315         new_cpu = kmem_alloc(sizeof (ib_cpu_t), KM_SLEEP);
 316         new_cpu->ic_cpu_id = cpu_id;
 317 
 318         /* Initialize list for interrupt vectors */
 319         list_create(&new_cpu->ic_ivec_list, sizeof (ib_ivec_t),
 320             offsetof(ib_ivec_t, ii_next));
 321 
 322         list_link_init(&new_cpu->ic_next);
 323 
 324         /* Check if this CPU can handle interrupts */
 325         mutex_enter(&cpu_lock);
 326         if (cpu_is_nointr(cp))
 327                 new_cpu->ic_offline = B_TRUE;
 328         else
 329                 new_cpu->ic_offline = B_FALSE;
 330         mutex_exit(&cpu_lock);
 331 
 332         /* Add CPU to list of CPUs */
 333         list_insert_tail(&ib_cpu_list, new_cpu);
 334 
 335         ib_num_cpus++;
 336 
 337         IB_IMPLDBG((CE_CONT, "ib_cpu_register: cpu=0x%x", cpu_id));
 338 }
 339 
 340 /*
 341  * Unregister CPU from the global list of CPUs.
 342  */
 343 static void
 344 ib_cpu_unregister(processorid_t cpu_id)
 345 {
 346         ib_cpu_t        *icpu;
 347 
 348         mutex_enter(&ib_lock);
 349         FOREACH_CPU(icpu, ib_cpu_list) {
 350                 if (icpu->ic_cpu_id == cpu_id) {
 351                         list_remove(&ib_cpu_list, icpu);
 352                         /* XXX or just offline CPU; statistics? */
 353                         break;
 354                 }
 355         }
 356         mutex_exit(&ib_lock);
 357 
 358         ib_num_cpus--;
 359 
 360         IB_IMPLDBG((CE_CONT, "ib_cpu_unregister: cpu=0x%x",
 361             cpu_id));
 362 }
 363 
 364 /*
 365  * Hook for CPU changes.
 366  */
 367 static int
 368 ib_cpu_setup(cpu_setup_t what, int cpu_id, void *arg)
 369 {
 370 
 371         switch (what) {
 372         /* XXX */
 373         case CPU_OFF:
 374                 ib_cpu_unregister(cpu_id);
 375                 cv_signal(&ib_cv);
 376                 break;
 377 
 378         case CPU_INTR_ON:
 379                 ib_cpu_register(cpu_id);
 380                 cv_signal(&ib_cv);
 381                 break;
 382 
 383         default:
 384                 break;
 385         }
 386 
 387         return (0);
 388 }
 389 
 390 static ib_cpu_t *
 391 ib_cpu_create(void)
 392 {
 393         ib_cpu_t        *icpu;
 394 
 395         icpu = kmem_alloc(sizeof (ib_cpu_t), KM_SLEEP);
 396 
 397         return (icpu);
 398 }
 399 
 400 /*
 401  * Find a CPU in the global list of CPUs by processor id.
 402  */
 403 static ib_cpu_t *
 404 ib_cpu_find(list_t cpu_list, processorid_t cpu_id)
 405 {
 406         ib_cpu_t        *icpu;
 407 
 408         IB_APIDBG((CE_CONT, "ib_cpu_find: API cpu = %d", cpu_id));
 409 
 410         FOREACH_CPU(icpu, cpu_list) {
 411                 if (icpu->ic_cpu_id == cpu_id)
 412                         return (icpu);
 413         }
 414 
 415         return (NULL);
 416 }
 417 
 418 /*
 419  * Find a interrupt vector for a specific CPU.
 420  */
 421 static ib_ivec_t *
 422 ib_cpu_find_ivec(list_t cpu_list, processorid_t cpu_id, char *buspath, uint64_t ino)
 423 {
 424         ib_cpu_t        *icpu;
 425         ib_ivec_t       *ivec;
 426 
 427         icpu = ib_cpu_find(cpu_list, cpu_id);
 428         if (icpu == NULL)
 429                 return (NULL);
 430 
 431         for (ivec = list_head(&icpu->ic_ivec_list); ivec != NULL;
 432             ivec = list_next(&icpu->ic_ivec_list, ivec)) {
 433                 if (ivec->ii_ino == ino)
 434                         return (ivec);
 435         }
 436 
 437         return (NULL);
 438 }
 439 
 440 /*
 441  * Total times spend.
 442  */
 443 static void
 444 ib_cpu_statistics(ib_cpu_t *icpu)
 445 {
 446         cpu_t           *cp;
 447         hrtime_t        msnsecs[NCMSTATES];
 448         hrtime_t        new_tot;
 449 
 450         cp = cpu[icpu->ic_cpu_id];
 451         get_cpu_mstate(cp, msnsecs);
 452 
 453         icpu->ic_tot = msnsecs[CMS_IDLE] + msnsecs[CMS_USER] +
 454             msnsecs[CMS_SYSTEM];
 455 
 456 }
 457 
 458 /*
 459  * Create a new interrupt vector.
 460  */
 461 static ib_ivec_t *
 462 ib_ivec_create(const char *buspath, uint64_t ino)
 463 {
 464         ib_ivec_t       *ivec;
 465 
 466         ivec = (ib_ivec_t *)kmem_alloc(sizeof (ib_ivec_t), KM_SLEEP);
 467         
 468         list_link_init(&ivec->ii_next);
 469 
 470         ivec->ii_buspath = (char *)buspath;  /* XXX: strdup */
 471         ivec->ii_ino = ino;
 472         ivec->ii_ihs = 1;
 473 
 474         return (ivec);
 475 }
 476 
 477 static void
 478 intrd_ivec_register(ib_cpu_t *icpu)
 479 {
 480 }
 481 
 482 /*
 483  * Find interrupt vector by ino.
 484  */
 485 static ib_ivec_t *
 486 ib_ivec_find_ino(list_t ivec_list, uint64_t ino)
 487 {
 488         ib_ivec_t       *ivec;
 489 
 490         FOREACH_IVEC(ivec, ivec_list) {
 491                 if (ivec->ii_inum == ino)
 492                         return (ivec);
 493         }
 494 
 495         return (NULL);
 496 }
 497 
 498 /*
 499  * Delete a interrupt vector from a list.
 500  */
 501 static void
 502 ib_ivec_delete_ino(list_t ivec_list, uint64_t ino)
 503 {
 504         ib_ivec_t       *ivec;
 505 
 506         FOREACH_IVEC(ivec, ivec_list) {
 507                 if (ivec->ii_inum == ino) {
 508                         /* XXX: remove from list */
 509                         ;
 510                 }
 511         }
 512 }
 513 
 514 /*
 515  * Add a new interrupt vector to a list.
 516  */
 517 static void
 518 ib_ivec_add_ino(list_t ivec_list, ib_ivec_t *ivec)
 519 {
 520         list_insert_tail(&ivec_list, ivec);
 521 }
 522 
 523 static ib_msi_t *
 524 ib_msi_create(const char *name)
 525 {
 526         ib_msi_t        *msi;
 527 
 528         msi = (ib_msi_t *)kmem_alloc(sizeof (ib_msi_t), KM_SLEEP);
 529 
 530         msi->im_name = name;
 531 
 532         list_link_init(&msi->im_next);
 533         list_create(&msi->im_ino_list, sizeof (ib_msi_ino_t),
 534             offsetof(ib_msi_ino_t, imi_next));
 535 
 536         return (msi);
 537 }
 538 
 539 /*
 540  * Allocate and initialize a new snapshot structure.
 541  */
 542 static ib_snapshot_t *
 543 ib_snapshot_create(void)
 544 {
 545         ib_snapshot_t   *snapshot;
 546 
 547         snapshot = kmem_alloc(sizeof (ib_snapshot_t), KM_SLEEP);
 548 
 549         /* init link */
 550 
 551         /* Initialize CPU list */
 552         list_create(&snapshot->is_cpu_list, sizeof (ib_cpu_t),
 553             offsetof(ib_cpu_t, ic_next));
 554 
 555         snapshot->is_num_cpus = 0;
 556 
 557         return (snapshot);
 558 }
 559 
 560 static ib_ivec_t *
 561 ib_irq_fill_ivec(kstat_t *ksp)
 562 {
 563         kstat_named_t   *knp;
 564         ib_ivec_t       *ivec;
 565         char            *datap;
 566         uint64_t        time;
 567         int             i;
 568 
 569         datap = ksp->ks_data;
 570         knp = KSTAT_NAMED_PTR(ksp);
 571         for (i = 0; i < ksp->ks_ndata; i++, knp++) {
 572                 IB_IMPLDBG((CE_CONT, "ib_irq_fill_ivec: %s",
 573                     knp->name));
 574 
 575                 if (strcmp(knp->name, "time") == 0) {
 576                         cmn_err(CE_CONT, "XXX ib time");
 577                         time = knp->value.ui64;
 578                 }
 579 
 580                 knp += sizeof (kstat_named_t);
 581                 datap += sizeof (kstat_named_t);
 582         }
 583 
 584         /* Allocate a new interrupt vector */
 585         ivec = ib_ivec_create("", 0);
 586         ivec->ii_time = time;
 587 
 588         return (ivec);
 589 }
 590 
 591 /*
 592  * XXX: icpu not needed, move out of loop
 593  */
 594 static void
 595 ib_irq_statistics(ib_cpu_t *icpu)
 596 {
 597         kstat_t         *ksp;
 598         int             instance = 1;
 599 
 600         /*
 601          * Read pci interrupts.
 602          */
 603         ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES);
 604         while (ksp != NULL) {
 605                 KSTAT_ENTER(ksp);
 606 
 607                 if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
 608                         ib_cpu_t        *icpu;
 609                         ib_ivec_t       *ivec;
 610                         kstat_named_t   *knp;
 611                         kstat_named_t   *datap;
 612                         uint64_t        ino;
 613                         char            *buspath;
 614                         char            *namep;
 615                         processorid_t   cpu_id;
 616                         int             i;
 617                         boolean_t       is_enabled = B_TRUE;
 618 
 619                         (void) KSTAT_UPDATE(ksp, KSTAT_READ);
 620 
 621                         /*
 622                          * Find the CPU this interrupt vector is on and
 623                          * if the vector itself is enabled.
 624                          */
 625                         datap = ksp->ks_data;
 626                         namep = KSTAT_NAMED_PTR(ksp)->name;
 627                         for (i = 0; i < ksp->ks_ndata; i++) {
 628                                 if (strcmp(namep, "cpu") == 0) {
 629                                         cpu_id = datap->value.ui64;
 630                                 } else if (strcmp(namep, "type") == 0) {
 631                                         if (strcmp(datap->value.c, "disabled") == 0) {
 632                                                 is_enabled = B_FALSE;
 633                                                 break;
 634                                         }
 635                                 }
 636 
 637                                 namep += sizeof (kstat_named_t);
 638                                 datap += sizeof (kstat_named_t);
 639                         }
 640 
 641                         /*
 642                          * Skip this interrupt vector if its disabled.
 643                          */
 644                         if (!is_enabled)
 645                                 continue;
 646 
 647                         /*
 648                          * Check if CPU is online.
 649                          */
 650                         icpu = ib_cpu_find(ib_cpu_list, cpu_id);
 651                         if (icpu == NULL || icpu->ic_offline)
 652                                 continue;
 653 
 654                         /*
 655                          * Fill information.
 656                          */
 657                         ivec = ib_irq_fill_ivec(ksp);
 658                         if (ivec == NULL)
 659                                 continue;
 660 
 661                         list_insert_tail(&icpu->ic_ivec_list, ivec);
 662                 }
 663 
 664                 KSTAT_EXIT(ksp);
 665                 kstat_rele(ksp);
 666 
 667                 instance++;
 668                 ksp = kstat_hold_byname("pci_intrs", instance, "pci", ALL_ZONES);
 669         }
 670 }
 671 
 672 /*
 673  * Collect data from CPUs and interrupt vectors.
 674  */
 675 static ib_snapshot_t *
 676 ib_get_statistics(void)
 677 {
 678         ib_cpu_t        *os_cpu;
 679         ib_snapshot_t   *snapshot;
 680         ib_cpu_t        *snapshot_cpu;
 681 
 682         /*
 683          * Nothing to balance with one CPU. XXX: right place?
 684          */
 685         if (ib_num_cpus <= 1) {
 686                 ib_sleeptime = IB_ONECPU_SLEEPTIME;
 687                 return (NULL);
 688         }
 689 
 690         /*
 691          * Store all CPUs and ivecs here.
 692          */
 693         snapshot = ib_snapshot_create();
 694 
 695         /*
 696          * Loop over all active CPUs
 697          */
 698         FOREACH_CPU(os_cpu, ib_cpu_list) {
 699 
 700                 snapshot->is_num_cpus++;
 701 
 702                 snapshot_cpu = ib_cpu_create();
 703                 snapshot_cpu->ic_cpu_id = os_cpu->ic_cpu_id;
 704 
 705                 list_insert_tail(&snapshot->is_cpu_list, snapshot_cpu);
 706 
 707                 ib_cpu_statistics(snapshot_cpu);
 708                 ib_irq_statistics(os_cpu);
 709         }
 710 
 711         return (snapshot);
 712 }
 713 
 714 static ib_delta_t *
 715 ib_delta_create(void)
 716 {
 717         ib_delta_t      *delta;
 718 
 719         delta = kmem_alloc(sizeof (ib_delta_t), KM_SLEEP);
 720         delta->id_missing = B_FALSE;
 721 
 722         list_create(&delta->id_cpu_list, sizeof (ib_cpu_t),
 723             offsetof(ib_cpu_t, ic_next));
 724 
 725         return (delta);
 726 }
 727 
 728 /*
 729  * Generate the delta of two snapshots.
 730  */
 731 static ib_delta_t *
 732 ib_delta_generate(ib_snapshot_t *old_snapshot, ib_snapshot_t *new_snapshot)
 733 {
 734         ib_cpu_t        *old_cpu, *new_cpu;
 735         ib_delta_t      *delta;
 736         int             intrload = 0;
 737         int             intrnsec = 0;
 738         processorid_t   cpus = 0;
 739 
 740         /*
 741          * Allocate a new delta structure.
 742          */
 743         delta = ib_delta_create();
 744 
 745         /*
 746          * Number of CPUs must be the same.
 747          */
 748         delta->id_missing = old_snapshot->is_num_cpus !=
 749             new_snapshot->is_num_cpus;
 750 
 751         if (delta->id_missing != 0) {
 752                 IB_LOG((CE_CONT, "ib_delta_generate: number of CPUs changed"));
 753                 return (delta);
 754         }
 755 
 756         /*
 757          * Loop over the CPUs in both snapshots.
 758          */
 759         for (new_cpu = list_head(&new_snapshot->is_cpu_list),
 760             old_cpu = list_head(&old_snapshot->is_cpu_list);
 761             new_cpu != NULL && old_cpu != NULL;
 762             new_cpu = list_next(&new_snapshot->is_cpu_list, new_cpu),
 763             old_cpu = list_next(&old_snapshot->is_cpu_list, old_cpu)) {
 764                 ib_cpu_t        *delta_cpu;
 765                 ib_ivec_t       *new_ivec;
 766 
 767                 /* XXX: just onlined CPU? */
 768 
 769                 /* Allocate a new CPU structure */
 770                 delta_cpu = ib_cpu_create();
 771 
 772                 /* Difference of total time */
 773                 delta_cpu->ic_tot = new_cpu->ic_tot - old_cpu->ic_tot;
 774                 if (!(delta_cpu->ic_tot >= 0)) {
 775                         delta->id_missing = B_TRUE;
 776                         kmem_free(delta_cpu, sizeof (ib_cpu_t));
 777                         return (delta);
 778                 }
 779 
 780                 list_insert_tail(&delta->id_cpu_list, delta_cpu);
 781 
 782                 /* Avoid division by zero */
 783                 if (delta_cpu->ic_tot == 0)
 784                         delta_cpu->ic_tot = 1;
 785 
 786                 delta_cpu->ic_intrs = 0;
 787                 delta_cpu->ic_big_intrs = 0;
 788 
 789                 /*
 790                  * Number of interrupt vectors must be the same.
 791                  */
 792                 if (old_cpu->ic_num_ivecs != new_cpu->ic_num_ivecs) {
 793                         IB_LOG((CE_CONT, "ib_delta_generate: cpu %d has more "
 794                             "or less interrupts", old_cpu->ic_cpu_id));
 795                         delta->id_missing = B_TRUE;
 796                         return (delta);
 797                 }
 798 
 799                 /*
 800                  * Loop over the interrupt vectors of the new CPU.
 801                  */
 802                 for (new_ivec = list_head(&new_cpu->ic_ivec_list);
 803                     new_ivec != NULL; new_ivec =
 804                     list_next(&new_cpu->ic_ivec_list, new_ivec)) {
 805                         ib_ivec_t       *ivec;
 806                         ib_ivec_t       *delta_ivec;
 807                         hrtime_t        time;
 808 
 809                         if (new_ivec->ii_num_ino == 0)
 810                                 continue;
 811 
 812                         /*
 813                          * If interrupt vector does not exists or XXX crtime
 814                          * is different, set missing.
 815                          */
 816                         ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list,
 817                             new_ivec->ii_ino);
 818                         if (ivec == NULL) {
 819                                 delta->id_missing = B_TRUE;
 820                                 return (delta);
 821                         }
 822 
 823                         /* Allocate a new delta interrupt vector */
 824                         delta_ivec = ib_ivec_create(new_ivec->ii_buspath,
 825                             new_ivec->ii_ino);
 826 
 827                         /*
 828                          * Time used by this interrupt.
 829                          */
 830                         time = new_ivec->ii_time - ivec->ii_time;
 831                         if (time < 0) {
 832                                 delta->id_missing = B_TRUE;
 833                                 kmem_free(delta_ivec, sizeof (ib_delta_t));
 834                                 return (delta);
 835                         }
 836 
 837                         delta_cpu->ic_intrs += time;
 838                         delta_ivec->ii_time = time;
 839 
 840                         if (time > delta_cpu->ic_bigintr)
 841                                 delta_cpu->ic_bigintr = time;
 842 
 843                         /*
 844                          * Fill in the rest.
 845                          */
 846                         delta_ivec->ii_ihs = new_ivec->ii_ihs;
 847                         delta_ivec->ii_pil = new_ivec->ii_pil;
 848                         delta_ivec->ii_ino = new_ivec->ii_ino;
 849                         delta_ivec->ii_num_ino = new_ivec->ii_num_ino;
 850                         /* XXX: buspath, name */
 851                 }
 852 
 853                 /*
 854                  * Rounding error
 855                  */
 856                 if (delta_cpu->ic_tot < delta_cpu->ic_intrs)
 857                         delta_cpu->ic_tot = delta_cpu->ic_intrs;
 858 
 859                 delta_cpu->ic_intr_load =
 860                     delta_cpu->ic_intrs / delta_cpu->ic_tot;
 861                 intrload += delta_cpu->ic_intr_load;
 862                 intrnsec += delta_cpu->ic_intrs;
 863 
 864                 cpus++;
 865         }
 866 
 867         if (cpus > 0) {
 868                 delta->id_avgintrload = intrload / cpus;
 869                 delta->id_avgintrnsec = intrnsec / cpus;
 870         } else {
 871                 delta->id_avgintrload = 0;
 872                 delta->id_avgintrnsec = 0;
 873         }
 874 
 875         return (delta);
 876 }
 877 
 878 /*
 879  * Compress deltas.
 880  */
 881 static ib_delta_t *
 882 ib_delta_compress(list_t *deltas)
 883 {
 884         ib_cpu_t        *icpu;
 885         ib_ivec_t       *ivec;
 886         ib_delta_t      *new_delta, *delta;
 887         processorid_t   cpus = 0;
 888         int             high_intrload = 0;
 889         int             intrs = 0, tot;
 890 
 891         /* Check if empty list of deltas */
 892         if (deltas == NULL || list_is_empty(deltas) != 0) {
 893                 IB_LOG((CE_CONT, "ib_delta_compress: deltas are empty?"));
 894                 return (NULL);
 895         }
 896 
 897         /* Allocate a new delta structure */
 898         new_delta = ib_delta_create();
 899 
 900         /*
 901          * Loop over the deltas in the list.
 902          */
 903         for (delta = list_head(deltas); delta != NULL;
 904             delta = list_next(deltas, delta)) {
 905 
 906                 /* Compressing bad delta? */
 907                 if (delta->id_missing) {
 908                         IB_LOG((CE_CONT,
 909                             "ib_delta_compress: compressing bad deltas?"));
 910                         return (NULL);
 911                 }
 912 
 913                 FOREACH_CPU(icpu, delta->id_cpu_list) {
 914                         ib_cpu_t        *new_cpu;
 915                         ib_ivec_t       *new_ivec;
 916 
 917                         intrs += icpu->ic_intrs;
 918                         tot += icpu->ic_tot;
 919                         new_cpu = ib_cpu_create();
 920                         new_cpu->ic_cpu_id = icpu->ic_cpu_id;
 921                         new_cpu->ic_intrs = icpu->ic_intrs;
 922                         new_cpu->ic_tot = icpu->ic_tot;
 923 
 924                         /* XXX: exists ivecs */
 925                         FOREACH_IVEC(new_ivec, icpu->ic_ivec_list) {
 926                                 ib_ivec_t       *new_delta_ivec;
 927 
 928                                 new_delta_ivec = ib_ivec_create(
 929                                     new_ivec->ii_buspath, new_ivec->ii_ino);
 930 
 931                         }
 932                 }
 933         }
 934 
 935         FOREACH_CPU(icpu, new_delta->id_cpu_list) {
 936                 int     bigintr = 0;
 937 
 938                 cpus++;
 939 
 940                 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
 941                         if (ivec->ii_time > bigintr)
 942                                 bigintr = ivec->ii_time;
 943                 }
 944 
 945                 icpu->ic_bigintr = bigintr;
 946                 icpu->ic_intr_load = icpu->ic_intrs / icpu->ic_tot;
 947 
 948                 if (high_intrload < icpu->ic_intr_load)
 949                         high_intrload = icpu->ic_intr_load;
 950 
 951                 if (icpu->ic_tot <= 0)
 952                         icpu->ic_tot = 100;
 953         }
 954 
 955         if (cpus > 0) {
 956                 new_delta->id_avgintrload = intrs / tot;
 957                 new_delta->id_avgintrnsec = intrs / cpus;
 958         } else {
 959                 new_delta->id_avgintrload = 0;
 960                 new_delta->id_avgintrnsec = 0;
 961         }
 962 
 963         /* XXX: global sleeptime */
 964 
 965         return (new_delta);
 966 }
 967 
 968 /*
 969  * Decide if the load is out of balance.
 970  */
 971 static int
 972 ib_imbalanced(int goodness, int baseline)
 973 {
 974         if (goodness > 50)
 975                 return (100);
 976 
 977         /* XXX: abs */
 978         if ((goodness - baseline) > goodness_mindelta)
 979                 return (100);
 980 
 981         return (0);
 982 }
 983 
 984 /*
 985  * Calculate goodness of a CPU.
 986  */
 987 static int
 988 ib_goodness_cpu(ib_cpu_t *icpu, int avg_interrupt_load)
 989 {
 990         int     goodness;
 991         int     load, load_no_bigintr;
 992 
 993         load = icpu->ic_intrs / icpu->ic_tot;
 994         if (load < avg_interrupt_load)
 995                 return (0);
 996 
 997         load_no_bigintr = (icpu->ic_intrs - icpu->ic_bigintr) / icpu->ic_tot;
 998 
 999         if ((load > goodness_unsafe_load) && (icpu->ic_num_ivecs > 1))
1000                 return (1);
1001 
1002         goodness = load - avg_interrupt_load;
1003         if (goodness > load_no_bigintr)
1004                 goodness = load_no_bigintr;
1005 
1006         return (goodness);
1007 }
1008 
1009 /*
1010  * Calculate goodness.
1011  */
1012 static int
1013 ib_goodness(ib_delta_t *delta)
1014 {
1015         ib_cpu_t        *icpu;
1016         int             goodness, high_goodness = 0;
1017 
1018         if (delta->id_missing > 0)
1019                 return (1);
1020 
1021         FOREACH_CPU(icpu, delta->id_cpu_list) {
1022                 goodness = ib_goodness_cpu(icpu, delta->id_avgintrload);
1023                 if (!(goodness >= 0 && goodness <= 100)) {
1024                         IB_LOG((CE_CONT,
1025                             "ib_goodness: cpu goodness out of range?"));
1026                         return (100);
1027                 }
1028 
1029                 if (goodness == 100)
1030                         return (100);
1031 
1032                 if (goodness > high_goodness)
1033                         high_goodness = goodness;
1034         }
1035 
1036         return (high_goodness);
1037 }
1038 
1039 static void
1040 ib_do_find_goal(list_t ivecs, list_t loads, int goal, int idx)
1041 {
1042         list_t  goals_with;
1043         list_t  goals_without;
1044         int     with, without;
1045         int     which, load;
1046 
1047 
1048         if (goal <= load) {
1049                 with = load;
1050         } else {
1051                 /* XXX: do_find_goal */
1052                 with += load;
1053         }
1054 
1055         IB_LOG((CE_CONT, "XXX"));
1056 
1057         if (with >= goal && without < goal) {
1058                 which = 0;
1059         } else if (with < goal && without >= goal) {
1060                 which = 1;
1061         } else if (with >= goal && without >= goal) {
1062                 which = without < with;
1063         } else {
1064                 which = without > with;
1065         }
1066 
1067         if (which == 1) {
1068                 IB_LOG((CE_CONT, "ib_do_find_goal: going without"));
1069                 /* XXX */
1070         } else {
1071                 IB_LOG((CE_CONT, "ib_do_find_goal: going with"));
1072                 /* XXX */
1073         }
1074 }
1075 
1076 typedef struct _ib_goal {
1077         list_node_t     *ig_link;
1078         int             ig_value;
1079 } ib_goal_t;
1080 
1081 typedef struct _ib_goal_load {
1082         list_node_t     *igl_link;
1083         int             igl_value;
1084 } ib_goal_load_t;
1085 
1086 static void
1087 ib_find_goal(list_t ivecs, int goal)
1088 {
1089         ib_ivec_t       *ivec;
1090         list_t          goals;
1091         int             load;
1092 
1093         if (goal <= 0) {
1094                 list_create(&goals, sizeof (ib_goal_t),
1095                    offsetof (ib_goal_t, ig_link));
1096         } else {
1097                 list_t          loads;
1098                 hrtime_t        tot = 0;
1099 
1100                 IB_LOG((CE_CONT, "ib_find_goal: finding goal from intrs XXX"));
1101 
1102                 FOREACH_IVEC(ivec, ivecs) {
1103                         tot += ivec->ii_time;
1104                 }
1105 
1106                 list_create(&loads, sizeof (ib_goal_load_t),
1107                     offsetof (ib_goal_load_t, igl_link));
1108 
1109                 FOREACH_IVEC(ivec, ivecs) {
1110                         ib_goal_load_t  *igl = kmem_alloc(sizeof (ib_goal_load_t), KM_SLEEP);
1111 
1112                         igl->igl_value = tot;
1113                         list_insert_tail(&loads, igl);
1114 
1115                         tot -= ivec->ii_time;
1116                 }
1117         }
1118 }
1119 
1120 static void
1121 ib_do_reconfig_cpu2cpu(ib_delta_t *delta, processorid_t src_cpuid,
1122     processorid_t tgt_cpuid, int src_load)
1123 {
1124         ib_cpu_t        *src_cpu, *tgt_cpu;
1125         ib_ivec_t       *ivec;
1126         list_t          ivecs;
1127         int             goal;
1128         int             avg_nsec;
1129 
1130         if (delta == NULL)
1131                 return;
1132 
1133         goal = delta->id_avgintrnsec;
1134 
1135         src_cpu = ib_cpu_find(delta->id_cpu_list, src_cpuid);
1136         if (src_cpu == NULL)
1137                 return;
1138 
1139         tgt_cpu = ib_cpu_find(delta->id_cpu_list, tgt_cpuid);
1140         if (tgt_cpu == NULL)
1141                 return;
1142 
1143         avg_nsec = (src_cpu->ic_intrs + tgt_cpu->ic_intrs) / 2;
1144         if (goal < avg_nsec)
1145                 goal = avg_nsec;
1146 
1147 
1148         /*
1149          * Sort interrupt vectors by time.
1150          */
1151         list_create(&ivecs, sizeof (ib_ivec_t),
1152             offsetof (ib_ivec_t, ii_next));
1153 
1154         ivec = list_head(&ivecs);
1155         if (ivec->ii_orig_cpu == src_cpuid) {
1156                 IB_LOG((CE_CONT, "Keeping XXX on %d",
1157                     src_cpuid)); /* ivec->ii_inum, */
1158                 goal -= ivec->ii_time;
1159                 /* XXX: shift */
1160         }
1161 
1162         IB_LOG((CE_CONT, "ib_reconfig_cpu2cpu: inums should total %d", goal));
1163 
1164         ib_find_goal(ivecs, goal);
1165 }
1166 
1167 static void
1168 ib_do_reconfig_cpu(ib_delta_t *delta, list_t *cpu_sorted_list,
1169     processorid_t old_cpu_id)
1170 {
1171         ib_cpu_t        *icpu;
1172         int             avgintrload;
1173 
1174         if (delta == NULL)
1175                 return;
1176 
1177         icpu = ib_cpu_find(delta->id_cpu_list, old_cpu_id);
1178         if (icpu == NULL)
1179                 return;
1180 
1181         avgintrload = delta->id_avgintrload;
1182 
1183 }
1184 
1185 /*
1186  * Reconfigure interrupt distribution among CPUs.
1187  */
1188 static int
1189 ib_do_reconfig(ib_delta_t *delta)
1190 {
1191         ib_cpu_t        *icpu;
1192         ib_ivec_t       *ivec;
1193         list_t          cpu_sorted_list;
1194         int             goodness, new_goodness;
1195         int             warned = 0;
1196         int             rval = 1, ret = 1;
1197 
1198         if (delta == NULL)
1199                 return (-1);
1200 
1201         goodness = delta->id_goodness;
1202         if (goodness < goodness_mindelta) {
1203                 IB_LOG((CE_CONT, "ib_do_reconfig: goodness is good enough"));
1204                 return (0);
1205         }
1206 
1207         IB_LOG((CE_CONT, "ib_do_reconfig: optimizing interrupt assignments"));
1208 
1209         if (delta->id_missing != 0) {
1210                 IB_LOG((CE_CONT, "ib_do_reconfig: aborted"));
1211                 return (-1);
1212         }
1213 
1214         FOREACH_CPU(icpu, delta->id_cpu_list) {
1215                 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
1216                         ivec->ii_orig_cpu = icpu->ic_cpu_id;
1217                         ivec->ii_now_cpu = icpu->ic_cpu_id;
1218                         /* XXX: inum */
1219                 }
1220         }
1221 
1222         list_create(&cpu_sorted_list, sizeof (ib_cpu_t),
1223             offsetof(ib_cpu_t, ic_next));
1224 
1225         /*
1226          * Have we an improvement?
1227          */
1228         new_goodness = ib_goodness(delta);
1229         if (!(new_goodness <= goodness)) {
1230                 IB_LOG((CE_CONT,
1231                     "ib_do_reconfig: result has worse goodness"));
1232         }
1233 
1234         if ((goodness != 100 || new_goodness == 100) &&
1235             goodness - new_goodness < goodness_mindelta) {
1236                 IB_LOG((CE_CONT,
1237                     "ib_do_reconfig: goodness already near optimum"));
1238                 return (0);
1239         }
1240 
1241         /*
1242          * Move interrupts.
1243          */
1244 
1245         FOREACH_CPU(icpu, delta->id_cpu_list) {
1246                 FOREACH_IVEC(ivec, icpu->ic_ivec_list) {
1247                         int     error;
1248 
1249                         if (ivec->ii_orig_cpu == icpu->ic_cpu_id)
1250                                 continue;
1251 
1252                         error = ib_interrupt_do_move(ivec, icpu->ic_cpu_id);
1253                         if (error != 0) {
1254                                 if (warned++ == 0) {
1255                                         IB_LOG((CE_CONT, "ib_do_reconfig: "
1256                                             "unable to move interrupt"));
1257                                 }
1258 
1259                                 IB_LOG((CE_CONT, "ib_do_reconfig: "
1260                                     "unable to move buspath"));
1261 
1262                                 ret = -1;
1263                         }
1264                 }
1265         }
1266 
1267         return (rval);
1268 }
1269 
1270 
1271 /*
1272  * Check if the interrupt load did decrease.
1273  */
1274 static void
1275 ib_interrupt_move_check(ib_delta_t *delta, processorid_t old_cpuid,
1276     processorid_t new_cpuid)
1277 {
1278         ib_cpu_t        *old_cpu, *new_cpu;
1279 
1280         /*
1281          * Check old CPU.
1282          */
1283         old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid);
1284         if (old_cpu == NULL)
1285                 return;
1286         if (!(old_cpu->ic_tot >= old_cpu->ic_intrs)) {
1287                 IB_LOG((CE_CONT,
1288                     "Moved interrupts left 100+%% load on source CPU"));
1289         }
1290 
1291         /*
1292          * Check new CPU.
1293          */
1294         new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid);
1295         if (new_cpu == NULL)
1296                 return;
1297         if (!(new_cpu->ic_tot >= new_cpu->ic_intrs)) {
1298                 IB_LOG((CE_CONT,
1299                     "Moved interrupts left 100+%% load on target CPU"));
1300         }
1301 }
1302 
1303 /*
1304  * Actually moving the interrupt.
1305  */
1306 static int
1307 ib_interrupt_do_move(ib_ivec_t *ivec, processorid_t cpu_id)
1308 {
1309         int     ret, result;
1310 
1311         struct psm_ops  *pops;
1312 
1313         //pops = mach_set[0];
1314 
1315         //      ret = (*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_SET_CPU,
1316         //          &result);
1317 
1318         return (-1);
1319 }
1320 
1321 /*
1322  * Move an interrupt to a different CPU.
1323  */
1324 static int
1325 ib_interrupt_move(ib_delta_t *delta, uint64_t inum, processorid_t old_cpuid,
1326     processorid_t new_cpuid)
1327 {
1328         ib_cpu_t        *old_cpu, *new_cpu;
1329         ib_ivec_t       *ivec;
1330 
1331         if (delta == NULL)
1332                 return (-1);
1333 
1334         /*
1335          * Remove interrupt vector from old CPU.
1336          */
1337         old_cpu = ib_cpu_find(delta->id_cpu_list, old_cpuid);
1338         if (old_cpu == NULL)
1339                 return (-1);
1340 
1341         ivec = ib_ivec_find_ino(old_cpu->ic_ivec_list, inum);
1342 
1343         old_cpu->ic_intrs -= ivec->ii_time;
1344         old_cpu->ic_intr_load = old_cpu->ic_intrs / old_cpu->ic_tot;
1345         ib_ivec_delete_ino(old_cpu->ic_ivec_list, inum);
1346 
1347         /*
1348          * Verify interrupts.
1349          */
1350         if (!(old_cpu->ic_intrs >= 0)) {
1351                 IB_LOG((CE_CONT,
1352                     "ib_interrupt_move: interrupt time > total time?"));
1353         }
1354 
1355         if (!(ivec->ii_time <= old_cpu->ic_bigintr)) {
1356                 IB_LOG((CE_CONT,
1357                     "ib_interrupt_move: interrupt time > big interrupt?"));
1358         }
1359 
1360         if (ivec->ii_time >= old_cpu->ic_bigintr) {
1361                 ib_ivec_t       *time_ivec;
1362                 uint64_t        bigtime = 0;
1363 
1364                 FOREACH_IVEC(time_ivec, old_cpu->ic_ivec_list) {
1365                         if (time_ivec->ii_time > bigtime)
1366                                 bigtime = time_ivec->ii_time;
1367                 }
1368         }
1369 
1370 
1371         /*
1372          * Insert interrupt vector into new CPU.
1373          */
1374         new_cpu = ib_cpu_find(delta->id_cpu_list, new_cpuid);
1375         if (new_cpu == NULL)
1376                 return (-1);
1377 
1378         ivec->ii_now_cpu = new_cpuid;
1379         new_cpu->ic_intrs += ivec->ii_time;
1380         new_cpu->ic_intr_load = new_cpu->ic_intrs / new_cpu->ic_tot;
1381         ib_ivec_add_ino(new_cpu->ic_ivec_list, ivec);
1382 
1383         if (ivec->ii_time > new_cpu->ic_bigintr)
1384                 new_cpu->ic_bigintr = ivec->ii_time;
1385 
1386         return (0);
1387 }