/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/sysmacros.h>
#include <sys/stack.h>
#include <sys/cpuvar.h>
#include <sys/ivintr.h>
#include <sys/intreg.h>
#include <sys/membar.h>
#include <sys/kmem.h>
#include <sys/intr.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/cmn_err.h>
#include <sys/privregs.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <vm/seg_kp.h>
#include <sys/debug.h>
#include <sys/cyclic.h>
#include <sys/kdi_impl.h>
#include <sys/ddi_timer.h>

#include <sys/cpu_sgnblk_defs.h>

/* Global locks which protect the interrupt distribution lists */
static kmutex_t intr_dist_lock;
static kmutex_t intr_dist_cpu_lock;

/* Head of the interrupt distribution lists */
static struct intr_dist *intr_dist_head = NULL;
static struct intr_dist *intr_dist_whead = NULL;

static uint64_t siron_inum[DDI_IPL_10];	/* software interrupt numbers */
uint64_t *siron_cpu_inum = NULL;
uint64_t siron_poke_cpu_inum;
static int siron_cpu_setup(cpu_setup_t, int, void *);
extern uint_t softlevel1();

static uint64_t siron1_inum;	/* backward compatibility */
uint64_t poke_cpu_inum;
uint_t poke_cpu_intr(caddr_t arg1, caddr_t arg2);
uint_t siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2);

/*
 * Variable to enable/disable printing a message when an invalid vecintr
 * is received.
 */
uint_t ignore_invalid_vecintr = 0;

/*
 * Note:
 * siron_pending was originally created to prevent a resource over-consumption
 * bug in setsoftint (exhaustion of the interrupt pool free list).
 * Its original intention is obsolete with the use of iv_pending in
 * setsoftint.  However, siron_pending stayed around, acting as a second
 * gatekeeper preventing soft interrupts from being queued.  In this capacity,
 * it can lead to hangs on MP systems, where due to global visibility issues
 * it can end up set while iv_pending is reset, preventing soft interrupts from
 * ever being processed.  In addition to its gatekeeper role, intr_init also
 * uses it to flag the situation where siron() was called before siron_inum has
 * been defined.
 *
 * siron() does not need an extra gatekeeper; any cpu that wishes should be
 * allowed to queue a soft interrupt.  It is softint()'s job to ensure
 * correct handling of the queues.  Therefore, siron_pending has been
 * stripped of its gatekeeper task, retaining only its intr_init job, where
 * it indicates that there is a pending need to call siron().
 */
static int siron_pending[DDI_IPL_10];	/* software interrupt pending flags */
static int siron1_pending;	/* backward compatibility */

int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
int intr_dist_debug = 0;
int32_t intr_dist_weight_max = 1;
int32_t intr_dist_weight_maxmax = 1000;
int intr_dist_weight_maxfactor = 2;
#define	INTR_DEBUG(args) if (intr_dist_debug) cmn_err args

/*
 * intr_init() - Interrupt initialization
 *	Initialize the system's interrupt vector table.
 */
void
intr_init(cpu_t *cp)
{
	int i;
	extern uint_t softlevel1();

	init_ivintr();
	REGISTER_BBUS_INTR();

	/*
	 * Register these software interrupts for ddi timer.
	 * Software interrupts up to level 10 are supported.
	 */
	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
		siron_inum[i-1] = add_softintr(i, (softintrfunc)timer_softintr,
		    (caddr_t)(uintptr_t)(i), SOFTINT_ST);
	}

	siron1_inum = add_softintr(PIL_1, softlevel1, 0, SOFTINT_ST);
	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0, SOFTINT_MT);
	siron_poke_cpu_inum = add_softintr(PIL_13,
	    siron_poke_cpu_intr, 0, SOFTINT_MT);
	cp->cpu_m.poke_cpu_outstanding = B_FALSE;

	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * A soft interrupt may have been requested prior to the initialization
	 * of soft interrupts.  Soft interrupts can't be dispatched until after
	 * intr_init(), so we have to wait until now before we can dispatch the
	 * pending soft interrupts (if any).
	 */
	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
		if (siron_pending[i-1]) {
			siron_pending[i-1] = 0;
			sir_on(i);
		}
	}
	if (siron1_pending) {
		siron1_pending = 0;
		siron();
	}
}

/*
 * poke_cpu_intr - fall through when poke_cpu calls
 */
/* ARGSUSED */
uint_t
poke_cpu_intr(caddr_t arg1, caddr_t arg2)
{
	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
	membar_stld_stst();
	return (1);
}

/*
 * Trigger software interrupts dedicated to ddi timer.
 */
void
sir_on(int level)
{
	ASSERT(level >= DDI_IPL_1 && level <= DDI_IPL_10);
	if (siron_inum[level-1])
		setsoftint(siron_inum[level-1]);
	else
		siron_pending[level-1] = 1;
}

/*
 * kmdb uses siron (and thus setsoftint) while the world is stopped in order to
 * inform its driver component that there's work to be done.  We need to keep
 * DTrace from instrumenting kmdb's siron and setsoftint.  We duplicate siron,
 * giving kmdb's version a kdi_ prefix to keep DTrace at bay.  The
 * implementation of setsoftint is complicated enough that we don't want to
 * duplicate it, but at the same time we don't want to preclude tracing either.
 * The meat of setsoftint() therefore goes into kdi_setsoftint, with
 * setsoftint() implemented as a wrapper.  This allows tracing, while still
 * providing a way for kmdb to sneak in unmolested.
 */
void
kdi_siron(void)
{
	if (siron1_inum != 0)
		kdi_setsoftint(siron1_inum);
	else
		siron1_pending = 1;
}

void
setsoftint(uint64_t inum)
{
	kdi_setsoftint(inum);
}

/*
 * Generates softlevel1 interrupt on current CPU if it
 * is not pending already.
 */
void
siron(void)
{
	uint64_t inum;

	if (siron1_inum != 0) {
		/*
		 * Once siron_cpu_inum has been allocated, we can
		 * use per-CPU siron inum.
		 */
		if (siron_cpu_inum && siron_cpu_inum[CPU->cpu_id] != 0)
			inum = siron_cpu_inum[CPU->cpu_id];
		else
			inum = siron1_inum;

		setsoftint(inum);
	} else
		siron1_pending = 1;
}

static void
siron_init(void)
{
	/*
	 * We just allocate memory for per-cpu siron right now.  Rest of
	 * the work is done when CPU is configured.
	 */
	siron_cpu_inum = kmem_zalloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
}

/*
 * This routine creates per-CPU siron inum for CPUs which are
 * configured during boot.
 */
void
siron_mp_init()
{
	cpu_t *c;

	/*
	 * Get the memory for per-CPU siron inums
	 */
	siron_init();

	mutex_enter(&cpu_lock);
	c = cpu_list;
	do {
		(void) siron_cpu_setup(CPU_CONFIG, c->cpu_id, NULL);
	} while ((c = c->cpu_next) != cpu_list);

	register_cpu_setup_func(siron_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}

/*
 * siron_poke_cpu_intr - cross-call handler.
 */
/* ARGSUSED */
uint_t
siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2)
{
	/* generate level1 softint */
	siron();
	return (1);
}

/*
 * This routine generates a cross-call on target CPU(s).
 */
void
siron_poke_cpu(cpuset_t poke)
{
	int cpuid = CPU->cpu_id;

	if (CPU_IN_SET(poke, cpuid)) {
		siron();
		CPUSET_DEL(poke, cpuid);
		if (CPUSET_ISNULL(poke))
			return;
	}

	xt_some(poke, setsoftint_tl1, siron_poke_cpu_inum, 0);
}

/*
 * This callback function allows us to create per-CPU siron inum.
 */
/* ARGSUSED */
static int
siron_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu[id];

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(cp != NULL);

	switch (what) {
	case CPU_CONFIG:
		siron_cpu_inum[cp->cpu_id] = add_softintr(PIL_1,
		    (softintrfunc)softlevel1, 0, SOFTINT_ST);
		break;
	case CPU_UNCONFIG:
		(void) rem_softintr(siron_cpu_inum[cp->cpu_id]);
		siron_cpu_inum[cp->cpu_id] = 0;
		break;
	default:
		break;
	}

	return (0);
}

/*
 * no_ivintr()
 *	called by setvecint_tl1() through sys_trap()
 *	vector interrupt received but not valid or not
 *	registered in intr_vec_table
 *	considered as a spurious mondo interrupt
 */
/* ARGSUSED */
void
no_ivintr(struct regs *rp, int inum, int pil)
{
	if (!ignore_invalid_vecintr)
		cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
		    inum, pil);

#ifdef DEBUG_VEC_INTR
	prom_enter_mon();
#endif /* DEBUG_VEC_INTR */
}

void
intr_dequeue_req(uint_t pil, uint64_t inum)
{
	intr_vec_t *iv, *next, *prev;
	struct machcpu *mcpu;
	uint32_t clr;
	processorid_t cpu_id;
	extern uint_t getpstate(void);

	ASSERT((getpstate() & PSTATE_IE) == 0);

	mcpu = &CPU->cpu_m;
	cpu_id = CPU->cpu_id;

	iv = (intr_vec_t *)inum;
	prev = NULL;
	next = mcpu->intr_head[pil];

	/* Find a matching entry in the list */
	while (next != NULL) {
		if (next == iv)
			break;
		prev = next;
		next = IV_GET_PIL_NEXT(next, cpu_id);
	}

	if (next != NULL) {
		intr_vec_t *next_iv = IV_GET_PIL_NEXT(next, cpu_id);

		/* Remove entry from list */
		if (prev != NULL)
			IV_SET_PIL_NEXT(prev, cpu_id, next_iv); /* non-head */
		else
			mcpu->intr_head[pil] = next_iv; /* head */

		if (next_iv == NULL)
			mcpu->intr_tail[pil] = prev; /* tail */
	}

	/* Clear pending interrupts at this level if the list is empty */
	if (mcpu->intr_head[pil] == NULL) {
		clr = 1 << pil;
		if (pil == PIL_14)
			clr |= (TICK_INT_MASK | STICK_INT_MASK);
		wr_clr_softint(clr);
	}
}

/*
 * Send a directed interrupt of specified interrupt number id to a cpu.
 */
void
send_dirint(
	int cpuix,		/* cpu to be interrupted */
	int intr_id)		/* interrupt number id */
{
	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
}

/*
 * Take the specified CPU out of participation in interrupts.
 *	Called by p_online(2) when a processor is being taken off-line.
 *	This allows interrupt threads being handled on the processor to
 *	complete before the processor is idled.
 */
int
cpu_disable_intr(struct cpu *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Turn off the CPU_ENABLE flag before calling the redistribution
	 * function, since it checks for this in the cpu flags.
	 */
	cp->cpu_flags &= ~CPU_ENABLE;

	intr_redist_all_cpus();

	return (0);
}

/*
 * Allow the specified CPU to participate in interrupts.
 *	Called by p_online(2) if a processor could not be taken off-line
 *	because of bound threads, in order to resume processing interrupts.
 *	Also called after starting a processor.
 */
void
cpu_enable_intr(struct cpu *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp->cpu_flags |= CPU_ENABLE;

	intr_redist_all_cpus();
}

/*
 * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
 * one for weighted callbacks and one for normal callbacks.  Weighted callbacks
 * are issued to redirect interrupts of a specified weight, from heavy to
 * light.  This allows all the interrupts of a given weight to be redistributed
 * for all weighted nexus drivers prior to those of less weight.
 */
static void
intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
{
	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
	struct intr_dist *iptr;
	struct intr_dist **pptr;

	ASSERT(func);
	new->func = func;
	new->arg = arg;
	new->next = NULL;

	/* Add to tail so that redistribution occurs in original order. */
	mutex_enter(&intr_dist_lock);
	for (iptr = *phead, pptr = phead; iptr != NULL;
	    pptr = &iptr->next, iptr = iptr->next) {
		/* check for problems as we locate the tail */
		if ((iptr->func == func) && (iptr->arg == arg)) {
			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
			/*NOTREACHED*/
		}
	}
	*pptr = new;

	mutex_exit(&intr_dist_lock);
}

void
intr_dist_add(void (*func)(void *), void *arg)
{
	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
}

void
intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
{
	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
}

/*
 * Search for the interrupt distribution structure with the specified
 * mondo vec reg in the interrupt distribution list.  If a match is found,
 * then delete the entry from the list.  The caller is responsible for
 * modifying the mondo vector registers.
 */
static void
intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
{
	struct intr_dist *iptr;
	struct intr_dist **vect;

	mutex_enter(&intr_dist_lock);
	for (iptr = *headp, vect = headp;
	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
		if ((iptr->func == func) && (iptr->arg == arg)) {
			*vect = iptr->next;
			kmem_free(iptr, sizeof (struct intr_dist));
			mutex_exit(&intr_dist_lock);
			return;
		}
	}

	if (!panicstr)
		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
	mutex_exit(&intr_dist_lock);
}

void
intr_dist_rem(void (*func)(void *), void *arg)
{
	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
}

void
intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
{
	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
}

/*
 * Initiate interrupt redistribution.  Redistribution improves the isolation
 * associated with interrupt weights by ordering operations from heavy weight
 * to light weight.  When a CPU's orientation changes relative to interrupts,
 * there is *always* a redistribution to accommodate this change (call to
 * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
 * that a redistribution could improve the quality of an initialization.  For
 * example, if you are not using a NIC it may not be attached with s10 (devfs).
 * If you then configure the NIC (ifconfig), this may cause the NIC to attach
 * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
 * occurring late, so optimal "isolation" relative to weight is not occurring.
 * The same applies to detach, although in this case doing the redistribution
 * might improve "spread" for medium weight devices since the "isolation" of
 * a higher weight device may no longer be present.
 *
 * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
 *
 * NB: There is risk associated with automatically triggering execution of the
 * redistribution code at arbitrary times.  The risk comes from the fact that
 * there is a lot of low-level hardware interaction associated with a
 * redistribution.  At some point we may want this code to perform automatic
 * redistribution (redistribution thread; trigger timeout when add/remove
 * weight delta is large enough, and call cv_signal from timeout - causing
 * thread to call i_ddi_intr_redist_all_cpus()) but this is considered too
 * risky at this time.
 */
void
i_ddi_intr_redist_all_cpus()
{
	mutex_enter(&cpu_lock);
	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
	intr_redist_all_cpus();
	mutex_exit(&cpu_lock);
}

/*
 * Redistribute all interrupts
 *
 * This function redistributes all interrupting devices, running the
 * parent callback functions for each node.
 */
void
intr_redist_all_cpus(void)
{
	struct cpu *cp;
	struct intr_dist *iptr;
	int32_t weight, max_weight;

	ASSERT(MUTEX_HELD(&cpu_lock));
	mutex_enter(&intr_dist_lock);

	/*
	 * zero cpu_intr_weight on all cpus - it is safe to traverse
	 * cpu_list since we hold cpu_lock.
	 */
	cp = cpu_list;
	do {
		cp->cpu_intr_weight = 0;
	} while ((cp = cp->cpu_next) != cpu_list);

	/*
	 * Assume that this redistribution may encounter a device weight
	 * via driver.conf tuning of "ddi-intr-weight" that is at most
	 * intr_dist_weight_maxfactor times larger.
	 */
	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
	if (max_weight > intr_dist_weight_maxmax)
		max_weight = intr_dist_weight_maxmax;
	intr_dist_weight_max = 1;

	INTR_DEBUG((CE_CONT, "intr_dist: "
	    "intr_redist_all_cpus: %d-0\n", max_weight));

	/*
	 * Redistribute weighted, from heavy to light.  The callback that
	 * specifies a weight equal to weight_max should redirect all
	 * interrupts of weight weight_max or greater [weight_max, inf.).
	 * Interrupts of lesser weight should be processed on the call with
	 * the matching weight.  This allows all the heavier weight interrupts
	 * on all weighted busses (multiple pci busses) to be redirected prior
	 * to any lesser weight interrupts.
	 */
	for (weight = max_weight; weight >= 0; weight--)
		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
			((void (*)(void *, int32_t, int32_t))iptr->func)
			    (iptr->arg, max_weight, weight);

	/* redistribute normal (non-weighted) interrupts */
	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
		((void (*)(void *))iptr->func)(iptr->arg);
	mutex_exit(&intr_dist_lock);
}

/*
 * Retarget all interrupts at the current CPU by switching the distribution
 * policy to INTR_CURRENT_CPU and redistributing.
 */
void
intr_redist_all_cpus_shutdown(void)
{
	intr_policy = INTR_CURRENT_CPU;
	intr_redist_all_cpus();
}

/*
 * Determine what CPU to target, based on interrupt policy.
 *
 * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
 *	advance through interrupt enabled cpus (round-robin).
 *
 * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
 *	cpu_intr_weight, round robin when all equal.
 *
 * Weighted interrupt distribution provides two things: "spread" of weight
 * (associated with the algorithm itself) and "isolation" (associated with a
 * particular device weight).  A redistribution is what provides optimal
 * "isolation" of heavy weight interrupts; optimal "spread" of weight
 * (relative to what came before) is always occurring.
 *
 * An interrupt weight is a subjective number that represents the
 * percentage of a CPU required to service a device's interrupts: the
 * default weight is 0% (however the algorithm still maintains
 * round-robin), while a network interface controller (NIC) may have a
 * large weight (35%).  Interrupt weight only has meaning relative to the
 * interrupt weight of other devices: a CPU can be weighted more than
 * 100%, and a single device might consume more than 100% of a CPU.
 *
 * A coarse interrupt weight can be defined by the parent nexus driver
 * based on bus specific information, like pci class codes.  A nexus
 * driver that supports device interrupt weighting for its children
 * should call intr_dist_cpuid_add/rem_device_weight(), which adds
 * and removes the weight of a device from the CPU that an interrupt
 * is directed at.  The quality of initialization improves when the
 * device interrupt weights more accurately reflect actual run-time weights,
 * and as the assignments are ordered from heavy to light.
 *
 * The implementation also supports interrupt weight being specified in
 * driver.conf files via the property "ddi-intr-weight", which takes
 * precedence over the nexus supplied weight.  This support is added to
 * permit possible tweaking in the product in response to customer
 * problems.  This is not a formal or committed interface.
 *
 * While a weighted approach chooses the CPU providing the best spread
 * given past weights, less than optimal isolation can result in cases
 * where heavy weight devices show up last.  The nexus driver's interrupt
 * redistribution logic should use intr_dist_add/rem_weighted so that
 * interrupts can be redistributed heavy first for optimal isolation.
 */
uint32_t
intr_dist_cpuid(void)
{
	static struct cpu	*curr_cpu;
	struct cpu		*start_cpu;
	struct cpu		*new_cpu;
	struct cpu		*cp;
	int			cpuid = -1;

	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
	mutex_enter(&intr_dist_cpu_lock);

	switch (intr_policy) {
	case INTR_CURRENT_CPU:
		cpuid = CPU->cpu_id;
		break;

	case INTR_BOOT_CPU:
		panic("INTR_BOOT_CPU no longer supported.");
		/*NOTREACHED*/

	case INTR_FLAT_DIST:
	case INTR_WEIGHTED_DIST:
	default:
		/*
		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
		 * the cpu has been deleted (cpu structs are never freed).
		 */
		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
			curr_cpu = CPU;

		/*
		 * Advance to the online CPU after curr_cpu (round-robin).
		 * For INTR_WEIGHTED_DIST we choose the cpu with the lightest
		 * weight.  For a nexus that does not support weight, the
		 * default weight of zero is used; we degrade to round-robin
		 * behavior among equal weights.
		 *
		 * Disable preemption while traversing cpu_next_onln to
		 * ensure the list does not change.  This works because
		 * modifiers of this list and other lists in a struct cpu
		 * call pause_cpus() before making changes.
		 */
		kpreempt_disable();
		cp = start_cpu = curr_cpu->cpu_next_onln;
		new_cpu = NULL;
		do {
			/* Skip CPUs with interrupts disabled */
			if ((cp->cpu_flags & CPU_ENABLE) == 0)
				continue;

			if (intr_policy == INTR_FLAT_DIST) {
				/* select CPU */
				new_cpu = cp;
				break;
			} else if ((new_cpu == NULL) ||
			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
				/* Choose if lighter weight */
				new_cpu = cp;
			}
		} while ((cp = cp->cpu_next_onln) != start_cpu);
		ASSERT(new_cpu);
		cpuid = new_cpu->cpu_id;

		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));

		/* update static pointer for next round-robin */
		curr_cpu = new_cpu;
		kpreempt_enable();
		break;
	}
	mutex_exit(&intr_dist_cpu_lock);
	return (cpuid);
}

/*
 * Add or remove the weight of a device from a CPU's interrupt weight.
 *
 * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
 * their children to improve the overall quality of interrupt initialization.
 *
 * If a nexus shares the CPU returned by a single intr_dist_cpuid() call
 * among multiple devices (sharing ino) then the nexus should call
 * intr_dist_cpuid_add/rem_device_weight for each device separately.  Devices
 * that share must specify the same cpuid.
 *
 * If a nexus driver is unable to determine the cpu at remove_intr time
 * for some of its interrupts, then it should not call add_device_weight -
 * intr_dist_cpuid will still provide round-robin.
 *
 * An established device weight (from dev_info node) takes precedence over
 * the weight passed in.  If a device weight is not already established
 * then the passed in nexus weight is established.
 */
void
intr_dist_cpuid_add_device_weight(uint32_t cpuid,
    dev_info_t *dip, int32_t nweight)
{
	int32_t		eweight;

	/*
	 * For the non-weighted policy everything has a weight of zero (and we
	 * get round-robin distribution from intr_dist_cpuid).
	 * NB: intr_policy is limited to this file.  A weighted nexus driver
	 * calls this routine even if intr_policy has been patched to
	 * INTR_FLAT_DIST.
	 */
	ASSERT(dip);
	if (intr_policy != INTR_WEIGHTED_DIST)
		return;

	eweight = i_ddi_get_intr_weight(dip);
	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
	    ddi_get_instance(ddi_get_parent(dip)),
	    ddi_driver_name(dip), ddi_get_instance(dip)));

	/* if no established weight, establish nexus weight */
	if (eweight < 0) {
		if (nweight > 0)
			(void) i_ddi_set_intr_weight(dip, nweight);
		else
			nweight = 0;
	} else
		nweight = eweight;	/* use established weight */

	/* Establish exclusion for cpu_intr_weight manipulation */
	mutex_enter(&intr_dist_cpu_lock);
	cpu[cpuid]->cpu_intr_weight += nweight;

	/* update intr_dist_weight_max */
	if (nweight > intr_dist_weight_max)
		intr_dist_weight_max = nweight;
	mutex_exit(&intr_dist_cpu_lock);
}

void
intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
{
	struct cpu	*cp;
	int32_t		weight;

	ASSERT(dip);
	if (intr_policy != INTR_WEIGHTED_DIST)
		return;

	/* remove weight of device from cpu */
	weight = i_ddi_get_intr_weight(dip);
	if (weight < 0)
		weight = 0;
	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d for "
	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
	    ddi_driver_name(ddi_get_parent(dip)),
	    ddi_get_instance(ddi_get_parent(dip)),
	    ddi_driver_name(dip), ddi_get_instance(dip)));

	/* Establish exclusion for cpu_intr_weight manipulation */
	mutex_enter(&intr_dist_cpu_lock);
	cp = cpu[cpuid];
	cp->cpu_intr_weight -= weight;
	if (cp->cpu_intr_weight < 0)
		cp->cpu_intr_weight = 0;	/* sanity */
	mutex_exit(&intr_dist_cpu_lock);
}

/*
 * Register a multi-target software interrupt handler at the given pil and
 * return an opaque handle for it.
 */
ulong_t
create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1)
{
	uint64_t inum;

	inum = add_softintr(pil, func, arg1, SOFTINT_MT);
	return ((ulong_t)inum);
}

/*
 * Post the software interrupt identified by hdl on the specified CPU.
 */
void
invoke_softint(processorid_t cpuid, ulong_t hdl)
{
	uint64_t inum = hdl;

	if (cpuid == CPU->cpu_id)
		setsoftint(inum);
	else
		xt_one(cpuid, setsoftint_tl1, inum, 0);
}

/*
 * Remove a software interrupt previously created with create_softint().
 */
void
remove_softint(ulong_t hdl)
{
	uint64_t inum = hdl;

	(void) rem_softintr(inum);
}

/*
 * Wait for pending cross-calls to the CPUs in set to complete.
 */
void
sync_softint(cpuset_t set)
{
	xt_sync(set);
}