illumos New usr/src/uts/sun4/os/intr.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
  27  */
  28 
  29 #include <sys/sysmacros.h>
  30 #include <sys/stack.h>
  31 #include <sys/cpuvar.h>
  32 #include <sys/ivintr.h>
  33 #include <sys/intreg.h>
  34 #include <sys/membar.h>
  35 #include <sys/kmem.h>
  36 #include <sys/intr.h>
  37 #include <sys/sunddi.h>
  38 #include <sys/sunndi.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/privregs.h>
  41 #include <sys/systm.h>
  42 #include <sys/archsystm.h>
  43 #include <sys/machsystm.h>
  44 #include <sys/x_call.h>
  45 #include <vm/seg_kp.h>
  46 #include <sys/debug.h>
  47 #include <sys/cyclic.h>
  48 #include <sys/kdi_impl.h>
  49 #include <sys/ddi_periodic.h>
  50 
  51 #include <sys/cpu_sgnblk_defs.h>
  52 
/*
 * Global locks for interrupt distribution:
 *   intr_dist_lock     - held while the callback lists below are walked or
 *                        modified (intr_dist_add_list, intr_dist_rem_list,
 *                        intr_redist_all_cpus).
 *   intr_dist_cpu_lock - held by intr_dist_cpuid() while it picks a target
 *                        CPU, and by intr_dist_cpuid_add/rem_device_weight()
 *                        while they adjust a CPU's cpu_intr_weight.
 */
static kmutex_t intr_dist_lock;
static kmutex_t intr_dist_cpu_lock;

/*
 * Heads of the interrupt distribution callback lists: intr_dist_head holds
 * the normal callbacks (intr_dist_add), intr_dist_whead the weighted ones
 * (intr_dist_add_weighted).
 */
static struct intr_dist *intr_dist_head = NULL;
static struct intr_dist *intr_dist_whead = NULL;

/*
 * siron_inum[level - 1] is the soft interrupt number registered by
 * intr_init() for DDI level `level'; a zero entry means "not registered yet".
 */
static uint64_t siron_inum[DDI_IPL_10]; /* software interrupt numbers */
/* per-CPU siron inums, indexed by cpu_id; allocated by siron_init() */
uint64_t *siron_cpu_inum = NULL;
/* inum of siron_poke_cpu_intr(); posted to other CPUs by siron_poke_cpu() */
uint64_t siron_poke_cpu_inum;
static int siron_cpu_setup(cpu_setup_t, int, void *);
extern uint_t softlevel1();

/* inum of softlevel1 at PIL_1, registered by intr_init(); 0 until then */
static uint64_t siron1_inum; /* backward compatibility */
/* inum of poke_cpu_intr(), registered by intr_init() */
uint64_t poke_cpu_inum;
uint_t poke_cpu_intr(caddr_t arg1, caddr_t arg2);
uint_t siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2);

/*
 * Variable to enable/disable printing a message when an invalid vecintr
 * is received.  When zero (the default) no_ivintr() issues a CE_WARN
 * message; any non-zero value suppresses it.
 */
uint_t ignore_invalid_vecintr = 0;
  77 
  78 /*
  79  * Note:-
  80  * siron_pending was originally created to prevent a resource over consumption
  81  * bug in setsoftint(exhaustion of interrupt pool free list).
  82  * Its original intention is obsolete with the use of iv_pending in
  83  * setsoftint. However, siron_pending stayed around, acting as a second
  84  * gatekeeper preventing soft interrupts from being queued. In this capacity,
  85  * it can lead to hangs on MP systems, where due to global visibility issues
  86  * it can end up set while iv_pending is reset, preventing soft interrupts from
  87  * ever being processed. In addition to its gatekeeper role, intr_init() also
  88  * uses it to flag the situation where siron() was called before siron_inum has
  89  * been defined.
  90  *
  91  * siron() does not need an extra gatekeeper; any cpu that wishes should be
  92  * allowed to queue a soft interrupt. It is softint()'s job to ensure
  93  * correct handling of the queues. Therefore, siron_pending has been
  94  * stripped of its gatekeeper task, retaining only its intr_init job, where
  95  * it indicates that there is a pending need to call siron().
  96  */
/*
 * Set by sir_on()/siron()/kdi_siron() when the corresponding inum has not
 * been registered yet; consumed and cleared by intr_init().
 */
static int siron_pending[DDI_IPL_10]; /* software interrupt pending flags */
static int siron1_pending; /* backward compatibility */

int intr_policy = INTR_WEIGHTED_DIST;   /* interrupt distribution policy */
/* non-zero enables the INTR_DEBUG() messages in this file */
int intr_dist_debug = 0;
/*
 * Largest device weight seen by intr_dist_cpuid_add_device_weight() since
 * the last redistribution; intr_redist_all_cpus() resets it to 1.
 */
int32_t intr_dist_weight_max = 1;
/* upper bound on the starting weight used by intr_redist_all_cpus() */
int32_t intr_dist_weight_maxmax = 1000;
/*
 * intr_redist_all_cpus() starts at intr_dist_weight_max multiplied by this
 * factor (capped at intr_dist_weight_maxmax).
 */
int intr_dist_weight_maxfactor = 2;
 105 #define INTR_DEBUG(args) if (intr_dist_debug) cmn_err args
 106 
 107 /*
 108  * intr_init() - Interrupt initialization
 109  *      Initialize the system's interrupt vector table.
 110  */
 111 void
 112 intr_init(cpu_t *cp)
 113 {
 114         int i;
 115         extern uint_t softlevel1();
 116 
 117         init_ivintr();
 118         REGISTER_BBUS_INTR();
 119 
 120         /*
 121          * Register these software interrupts for ddi timer.
 122          * Software interrupts up to the level 10 are supported.
 123          */
 124         for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
 125                 siron_inum[i - 1] = add_softintr(i,
 126                     (softintrfunc)ddi_periodic_softintr,
 127                     (caddr_t)(uintptr_t)(i), SOFTINT_ST);
 128         }
 129 
 130         siron1_inum = add_softintr(PIL_1, softlevel1, 0, SOFTINT_ST);
 131         poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0, SOFTINT_MT);
 132         siron_poke_cpu_inum = add_softintr(PIL_13,
 133             siron_poke_cpu_intr, 0, SOFTINT_MT);
 134         cp->cpu_m.poke_cpu_outstanding = B_FALSE;
 135 
 136         mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
 137         mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
 138 
 139         /*
 140          * A soft interrupt may have been requested prior to the initialization
 141          * of soft interrupts.  Soft interrupts can't be dispatched until after
 142          * init_intr(), so we have to wait until now before we can dispatch the
 143          * pending soft interrupt (if any).
 144          */
 145         for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
 146                 if (siron_pending[i-1]) {
 147                         siron_pending[i-1] = 0;
 148                         sir_on(i);
 149                 }
 150         }
 151         if (siron1_pending) {
 152                 siron1_pending = 0;
 153                 siron();
 154         }
 155 }
 156 
 157 /*
 158  * poke_cpu_intr - fall through when poke_cpu calls
 159  */
 160 /* ARGSUSED */
 161 uint_t
 162 poke_cpu_intr(caddr_t arg1, caddr_t arg2)
 163 {
 164         CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
 165         membar_stld_stst();
 166         return (1);
 167 }
 168 
 169 /*
 170  * Trigger software interrupts dedicated to ddi timer.
 171  */
 172 void
 173 sir_on(int level)
 174 {
 175         ASSERT(level >= DDI_IPL_1 && level <= DDI_IPL_10);
 176         if (siron_inum[level-1])
 177                 setsoftint(siron_inum[level-1]);
 178         else
 179                 siron_pending[level-1] = 1;
 180 }
 181 
 182 /*
 183  * kmdb uses siron (and thus setsoftint) while the world is stopped in order to
 184  * inform its driver component that there's work to be done.  We need to keep
 185  * DTrace from instrumenting kmdb's siron and setsoftint.  We duplicate siron,
 186  * giving kmdb's version a kdi_ prefix to keep DTrace at bay.  The
 187  * implementation of setsoftint is complicated enough that we don't want to
 188  * duplicate it, but at the same time we don't want to preclude tracing either.
 189  * The meat of setsoftint() therefore goes into kdi_setsoftint, with
 190  * setsoftint() implemented as a wrapper.  This allows tracing, while still
 191  * providing a way for kmdb to sneak in unmolested.
 192  */
 193 void
 194 kdi_siron(void)
 195 {
 196         if (siron1_inum != 0)
 197                 kdi_setsoftint(siron1_inum);
 198         else
 199                 siron1_pending = 1;
 200 }
 201 
 202 void
 203 setsoftint(uint64_t inum)
 204 {
 205         kdi_setsoftint(inum);
 206 }
 207 
 208 /*
 209  * Generates softlevel1 interrupt on current CPU if it
 210  * is not pending already.
 211  */
 212 void
 213 siron(void)
 214 {
 215         uint64_t inum;
 216 
 217         if (siron1_inum != 0) {
 218                 /*
 219                  * Once siron_cpu_inum has been allocated, we can
 220                  * use per-CPU siron inum.
 221                  */
 222                 if (siron_cpu_inum && siron_cpu_inum[CPU->cpu_id] != 0)
 223                         inum = siron_cpu_inum[CPU->cpu_id];
 224                 else
 225                         inum = siron1_inum;
 226 
 227                 setsoftint(inum);
 228         } else
 229                 siron1_pending = 1;
 230 }
 231 
 232 
 233 static void
 234 siron_init(void)
 235 {
 236         /*
 237          * We just allocate memory for per-cpu siron right now. Rest of
 238          * the work is done when CPU is configured.
 239          */
 240         siron_cpu_inum = kmem_zalloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
 241 }
 242 
 243 /*
 244  * This routine creates per-CPU siron inum for CPUs which are
 245  * configured during boot.
 246  */
 247 void
 248 siron_mp_init()
 249 {
 250         cpu_t *c;
 251 
 252         /*
 253          * Get the memory for per-CPU siron inums
 254          */
 255         siron_init();
 256 
 257         mutex_enter(&cpu_lock);
 258         c = cpu_list;
 259         do {
 260                 (void) siron_cpu_setup(CPU_CONFIG, c->cpu_id, NULL);
 261         } while ((c = c->cpu_next) != cpu_list);
 262 
 263         register_cpu_setup_func(siron_cpu_setup, NULL);
 264         mutex_exit(&cpu_lock);
 265 }
 266 
 267 /*
 268  * siron_poke_cpu_intr - cross-call handler.
 269  */
 270 /* ARGSUSED */
 271 uint_t
 272 siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2)
 273 {
 274         /* generate level1 softint */
 275         siron();
 276         return (1);
 277 }
 278 
 279 /*
 280  * This routine generates a cross-call on target CPU(s).
 281  */
 282 void
 283 siron_poke_cpu(cpuset_t poke)
 284 {
 285         int cpuid = CPU->cpu_id;
 286 
 287         if (CPU_IN_SET(poke, cpuid)) {
 288                 siron();
 289                 CPUSET_DEL(poke, cpuid);
 290                 if (CPUSET_ISNULL(poke))
 291                         return;
 292         }
 293 
 294         xt_some(poke, setsoftint_tl1, siron_poke_cpu_inum, 0);
 295 }
 296 
 297 /*
 298  * This callback function allows us to create per-CPU siron inum.
 299  */
 300 /* ARGSUSED */
 301 static int
 302 siron_cpu_setup(cpu_setup_t what, int id, void *arg)
 303 {
 304         cpu_t *cp = cpu[id];
 305 
 306         ASSERT(MUTEX_HELD(&cpu_lock));
 307         ASSERT(cp != NULL);
 308 
 309         switch (what) {
 310         case CPU_CONFIG:
 311                 siron_cpu_inum[cp->cpu_id] = add_softintr(PIL_1,
 312                     (softintrfunc)softlevel1, 0, SOFTINT_ST);
 313                 break;
 314         case CPU_UNCONFIG:
 315                 (void) rem_softintr(siron_cpu_inum[cp->cpu_id]);
 316                 siron_cpu_inum[cp->cpu_id] = 0;
 317                 break;
 318         default:
 319                 break;
 320         }
 321 
 322         return (0);
 323 }
 324 
 325 /*
 326  * no_ivintr()
 327  *      called by setvecint_tl1() through sys_trap()
 328  *      vector interrupt received but not valid or not
 329  *      registered in intr_vec_table
 330  *      considered as a spurious mondo interrupt
 331  */
 332 /* ARGSUSED */
 333 void
 334 no_ivintr(struct regs *rp, int inum, int pil)
 335 {
 336         if (!ignore_invalid_vecintr)
 337                 cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
 338                     inum, pil);
 339 
 340 #ifdef DEBUG_VEC_INTR
 341         prom_enter_mon();
 342 #endif /* DEBUG_VEC_INTR */
 343 }
 344 
 345 void
 346 intr_dequeue_req(uint_t pil, uint64_t inum)
 347 {
 348         intr_vec_t      *iv, *next, *prev;
 349         struct machcpu  *mcpu;
 350         uint32_t        clr;
 351         processorid_t   cpu_id;
 352         extern uint_t   getpstate(void);
 353 
 354         ASSERT((getpstate() & PSTATE_IE) == 0);
 355 
 356         mcpu = &CPU->cpu_m;
 357         cpu_id = CPU->cpu_id;
 358 
 359         iv = (intr_vec_t *)inum;
 360         prev = NULL;
 361         next = mcpu->intr_head[pil];
 362 
 363         /* Find a matching entry in the list */
 364         while (next != NULL) {
 365                 if (next == iv)
 366                         break;
 367                 prev = next;
 368                 next = IV_GET_PIL_NEXT(next, cpu_id);
 369         }
 370 
 371         if (next != NULL) {
 372                 intr_vec_t      *next_iv = IV_GET_PIL_NEXT(next, cpu_id);
 373 
 374                 /* Remove entry from list */
 375                 if (prev != NULL)
 376                         IV_SET_PIL_NEXT(prev, cpu_id, next_iv); /* non-head */
 377                 else
 378                         mcpu->intr_head[pil] = next_iv; /* head */
 379 
 380                 if (next_iv == NULL)
 381                         mcpu->intr_tail[pil] = prev; /* tail */
 382         }
 383 
 384         /* Clear pending interrupts at this level if the list is empty */
 385         if (mcpu->intr_head[pil] == NULL) {
 386                 clr = 1 << pil;
 387                 if (pil == PIL_14)
 388                         clr |= (TICK_INT_MASK | STICK_INT_MASK);
 389                 wr_clr_softint(clr);
 390         }
 391 }
 392 
 393 
 394 /*
 395  * Send a directed interrupt of specified interrupt number id to a cpu.
 396  */
 397 void
 398 send_dirint(
 399         int cpuix,              /* cpu to be interrupted */
 400         int intr_id)            /* interrupt number id */
 401 {
 402         xt_one(cpuix, setsoftint_tl1, intr_id, 0);
 403 }
 404 
 405 /*
 406  * Take the specified CPU out of participation in interrupts.
 407  *      Called by p_online(2) when a processor is being taken off-line.
 408  *      This allows interrupt threads being handled on the processor to
 409  *      complete before the processor is idled.
 410  */
 411 int
 412 cpu_disable_intr(struct cpu *cp)
 413 {
 414         ASSERT(MUTEX_HELD(&cpu_lock));
 415 
 416         /*
 417          * Turn off the CPU_ENABLE flag before calling the redistribution
 418          * function, since it checks for this in the cpu flags.
 419          */
 420         cp->cpu_flags &= ~CPU_ENABLE;
 421 
 422         intr_redist_all_cpus();
 423 
 424         return (0);
 425 }
 426 
 427 /*
 428  * Allow the specified CPU to participate in interrupts.
 429  *      Called by p_online(2) if a processor could not be taken off-line
 430  *      because of bound threads, in order to resume processing interrupts.
 431  *      Also called after starting a processor.
 432  */
 433 void
 434 cpu_enable_intr(struct cpu *cp)
 435 {
 436         ASSERT(MUTEX_HELD(&cpu_lock));
 437 
 438         cp->cpu_flags |= CPU_ENABLE;
 439 
 440         intr_redist_all_cpus();
 441 }
 442 
 443 /*
 444  * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
 445  * one for weighted callbacks and one for normal callbacks. Weighted callbacks
 446  * are issued to redirect interrupts of a specified weight, from heavy to
 447  * light.  This allows all the interrupts of a given weight to be redistributed
 448  * for all weighted nexus drivers prior to those of less weight.
 449  */
 450 static void
 451 intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
 452 {
 453         struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
 454         struct intr_dist *iptr;
 455         struct intr_dist **pptr;
 456 
 457         ASSERT(func);
 458         new->func = func;
 459         new->arg = arg;
 460         new->next = NULL;
 461 
 462         /* Add to tail so that redistribution occurs in original order. */
 463         mutex_enter(&intr_dist_lock);
 464         for (iptr = *phead, pptr = phead; iptr != NULL;
 465             pptr = &iptr->next, iptr = iptr->next) {
 466                 /* check for problems as we locate the tail */
 467                 if ((iptr->func == func) && (iptr->arg == arg)) {
 468                         cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
 469                         /*NOTREACHED*/
 470                 }
 471         }
 472         *pptr = new;
 473 
 474         mutex_exit(&intr_dist_lock);
 475 }
 476 
 477 void
 478 intr_dist_add(void (*func)(void *), void *arg)
 479 {
 480         intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
 481 }
 482 
 483 void
 484 intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
 485 {
 486         intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
 487 }
 488 
 489 /*
 490  * Search for the interrupt distribution structure with the specified
 491  * mondo vec reg in the interrupt distribution list. If a match is found,
 492  * then delete the entry from the list. The caller is responsible for
 493  * modifying the mondo vector registers.
 494  */
 495 static void
 496 intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
 497 {
 498         struct intr_dist *iptr;
 499         struct intr_dist **vect;
 500 
 501         mutex_enter(&intr_dist_lock);
 502         for (iptr = *headp, vect = headp;
 503             iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
 504                 if ((iptr->func == func) && (iptr->arg == arg)) {
 505                         *vect = iptr->next;
 506                         kmem_free(iptr, sizeof (struct intr_dist));
 507                         mutex_exit(&intr_dist_lock);
 508                         return;
 509                 }
 510         }
 511 
 512         if (!panicstr)
 513                 cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
 514         mutex_exit(&intr_dist_lock);
 515 }
 516 
 517 void
 518 intr_dist_rem(void (*func)(void *), void *arg)
 519 {
 520         intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
 521 }
 522 
 523 void
 524 intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
 525 {
 526         intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
 527 }
 528 
 529 /*
 530  * Initiate interrupt redistribution.  Redistribution improves the isolation
 531  * associated with interrupt weights by ordering operations from heavy weight
 532  * to light weight.  When a CPU's orientation changes relative to interrupts,
 533  * there is *always* a redistribution to accommodate this change (call to
 534  * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
 535  * that a redistribution could improve the quality of an initialization. For
 536  * example, if you are not using a NIC it may not be attached with s10 (devfs).
 537  * If you then configure the NIC (ifconfig), this may cause the NIC to attach
 538  * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
 539  * occurring late, so optimal "isolation" relative to weight is not occurring.
 540  * The same applies to detach, although in this case doing the redistribution
 541  * might improve "spread" for medium weight devices since the "isolation" of
 542  * a higher weight device may no longer be present.
 543  *
 544  * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
 545  *
 546  * NB: There is risk associated with automatically triggering execution of the
 547  * redistribution code at arbitrary times. The risk comes from the fact that
 548  * there is a lot of low-level hardware interaction associated with a
 549  * redistribution.  At some point we may want this code to perform automatic
 550  * redistribution (redistribution thread; trigger timeout when add/remove
 551  * weight delta is large enough, and call cv_signal from timeout - causing
 552  * thread to call i_ddi_intr_redist_all_cpus()) but this is considered too
 553  * risky at this time.
 554  */
 555 void
 556 i_ddi_intr_redist_all_cpus()
 557 {
 558         mutex_enter(&cpu_lock);
 559         INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
 560         intr_redist_all_cpus();
 561         mutex_exit(&cpu_lock);
 562 }
 563 
 564 /*
 565  * Redistribute all interrupts
 566  *
 567  * This function redistributes all interrupting devices, running the
 568  * parent callback functions for each node.
 569  */
 570 void
 571 intr_redist_all_cpus(void)
 572 {
 573         struct cpu *cp;
 574         struct intr_dist *iptr;
 575         int32_t weight, max_weight;
 576 
 577         ASSERT(MUTEX_HELD(&cpu_lock));
 578         mutex_enter(&intr_dist_lock);
 579 
 580         /*
 581          * zero cpu_intr_weight on all cpus - it is safe to traverse
 582          * cpu_list since we hold cpu_lock.
 583          */
 584         cp = cpu_list;
 585         do {
 586                 cp->cpu_intr_weight = 0;
 587         } while ((cp = cp->cpu_next) != cpu_list);
 588 
 589         /*
 590          * Assume that this redistribution may encounter a device weight
 591          * via driver.conf tuning of "ddi-intr-weight" that is at most
 592          * intr_dist_weight_maxfactor times larger.
 593          */
 594         max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
 595         if (max_weight > intr_dist_weight_maxmax)
 596                 max_weight = intr_dist_weight_maxmax;
 597         intr_dist_weight_max = 1;
 598 
 599         INTR_DEBUG((CE_CONT, "intr_dist: "
 600             "intr_redist_all_cpus: %d-0\n", max_weight));
 601 
 602         /*
 603          * Redistribute weighted, from heavy to light.  The callback that
 604          * specifies a weight equal to weight_max should redirect all
 605          * interrupts of weight weight_max or greater [weight_max, inf.).
 606          * Interrupts of lesser weight should be processed on the call with
 607          * the matching weight. This allows all the heaver weight interrupts
 608          * on all weighted busses (multiple pci busses) to be redirected prior
 609          * to any lesser weight interrupts.
 610          */
 611         for (weight = max_weight; weight >= 0; weight--)
 612                 for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
 613                         ((void (*)(void *, int32_t, int32_t))iptr->func)
 614                             (iptr->arg, max_weight, weight);
 615 
 616         /* redistribute normal (non-weighted) interrupts */
 617         for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
 618                 ((void (*)(void *))iptr->func)(iptr->arg);
 619         mutex_exit(&intr_dist_lock);
 620 }
 621 
 622 void
 623 intr_redist_all_cpus_shutdown(void)
 624 {
 625         intr_policy = INTR_CURRENT_CPU;
 626         intr_redist_all_cpus();
 627 }
 628 
 629 /*
 630  * Determine what CPU to target, based on interrupt policy.
 631  *
 632  * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
 633  *      advance through interrupt enabled cpus (round-robin).
 634  *
 635  * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
 636  *      cpu_intr_weight, round robin when all equal.
 637  *
 638  *      Weighted interrupt distribution provides two things: "spread" of weight
 639  *      (associated with algorithm itself) and "isolation" (associated with a
 640  *      particular device weight). A redistribution is what provides optimal
 641  *      "isolation" of heavy weight interrupts, optimal "spread" of weight
 642  *      (relative to what came before) is always occurring.
 643  *
 644  *      An interrupt weight is a subjective number that represents the
 645  *      percentage of a CPU required to service a device's interrupts: the
 646  *      default weight is 0% (however the algorithm still maintains
 647  *      round-robin), a network interface controller (NIC) may have a large
 648  *      weight (35%). Interrupt weight only has meaning relative to the
 649  *      interrupt weight of other devices: a CPU can be weighted more than
 650  *      100%, and a single device might consume more than 100% of a CPU.
 651  *
 652  *      A coarse interrupt weight can be defined by the parent nexus driver
 653  *      based on bus specific information, like pci class codes. A nexus
 654  *      driver that supports device interrupt weighting for its children
 655  *      should call intr_dist_cpuid_add/rem_device_weight(), which adds
 656  *      and removes the weight of a device from the CPU that an interrupt
 657  *      is directed at.  The quality of initialization improves when the
 658  *      device interrupt weights more accurately reflect actual run-time
 659  *      weights, and as the assignments are ordered from heavy to light.
 660  *
 661  *      The implementation also supports interrupt weight being specified in
 662  *      driver.conf files via the property "ddi-intr-weight", which takes
 663  *      precedence over the nexus supplied weight.  This support is added to
 664  *      permit possible tweaking in the product in response to customer
 665  *      problems. This is not a formal or committed interface.
 666  *
 667  *      While a weighted approach chooses the CPU providing the best spread
 668  *      given past weights, less than optimal isolation can result in cases
 669  *      where heavy weight devices show up last. The nexus driver's interrupt
 670  *      redistribution logic should use intr_dist_add/rem_weighted so that
 671  *      interrupts can be redistributed heavy first for optimal isolation.
 672  */
 673 uint32_t
 674 intr_dist_cpuid(void)
 675 {
 676         static struct cpu       *curr_cpu;
 677         struct cpu              *start_cpu;
 678         struct cpu              *new_cpu;
 679         struct cpu              *cp;
 680         int                     cpuid = -1;
 681 
 682         /* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
 683         mutex_enter(&intr_dist_cpu_lock);
 684 
 685         switch (intr_policy) {
 686         case INTR_CURRENT_CPU:
 687                 cpuid = CPU->cpu_id;
 688                 break;
 689 
 690         case INTR_BOOT_CPU:
 691                 panic("INTR_BOOT_CPU no longer supported.");
 692                 /*NOTREACHED*/
 693 
 694         case INTR_FLAT_DIST:
 695         case INTR_WEIGHTED_DIST:
 696         default:
 697                 /*
 698                  * Ensure that curr_cpu is valid - cpu_next will be NULL if
 699                  * the cpu has been deleted (cpu structs are never freed).
 700                  */
 701                 if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
 702                         curr_cpu = CPU;
 703 
 704                 /*
 705                  * Advance to online CPU after curr_cpu (round-robin). For
 706                  * INTR_WEIGHTED_DIST we choose the cpu with the lightest
 707                  * weight.  For a nexus that does not support weight the
 708                  * default weight of zero is used. We degrade to round-robin
 709                  * behavior among equal weightes.  The default weight is zero
 710                  * and round-robin behavior continues.
 711                  *
 712                  * Disable preemption while traversing cpu_next_onln to
 713                  * ensure the list does not change.  This works because
 714                  * modifiers of this list and other lists in a struct cpu
 715                  * call pause_cpus() before making changes.
 716                  */
 717                 kpreempt_disable();
 718                 cp = start_cpu = curr_cpu->cpu_next_onln;
 719                 new_cpu = NULL;
 720                 do {
 721                         /* Skip CPUs with interrupts disabled */
 722                         if ((cp->cpu_flags & CPU_ENABLE) == 0)
 723                                 continue;
 724 
 725                         if (intr_policy == INTR_FLAT_DIST) {
 726                                 /* select CPU */
 727                                 new_cpu = cp;
 728                                 break;
 729                         } else if ((new_cpu == NULL) ||
 730                             (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
 731                                 /* Choose if lighter weight */
 732                                 new_cpu = cp;
 733                         }
 734                 } while ((cp = cp->cpu_next_onln) != start_cpu);
 735                 ASSERT(new_cpu);
 736                 cpuid = new_cpu->cpu_id;
 737 
 738                 INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
 739                     "targeted\n", cpuid, new_cpu->cpu_intr_weight));
 740 
 741                 /* update static pointer for next round-robin */
 742                 curr_cpu = new_cpu;
 743                 kpreempt_enable();
 744                 break;
 745         }
 746         mutex_exit(&intr_dist_cpu_lock);
 747         return (cpuid);
 748 }
 749 
 750 /*
 751  * Add or remove the weight of a device from a CPU's interrupt weight.
 752  *
 753  * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
 754  * their children to improve the overall quality of interrupt initialization.
 755  *
 756  * If a nexus shares the CPU returned by a single intr_dist_cpuid() call
 757  * among multiple devices (sharing ino) then the nexus should call
 758  * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
 759  * that share must specify the same cpuid.
 760  *
 761  * If a nexus driver is unable to determine the cpu at remove_intr time
 762  * for some of its interrupts, then it should not call add_device_weight -
 763  * intr_dist_cpuid will still provide round-robin.
 764  *
 765  * An established device weight (from dev_info node) takes precedence over
 766  * the weight passed in.  If a device weight is not already established
 767  * then the passed in nexus weight is established.
 768  */
 769 void
 770 intr_dist_cpuid_add_device_weight(uint32_t cpuid,
 771     dev_info_t *dip, int32_t nweight)
 772 {
 773         int32_t         eweight;
 774 
 775         /*
 776          * For non-weighted policy everything has weight of zero (and we get
 777          * round-robin distribution from intr_dist_cpuid).
 778          * NB: intr_policy is limited to this file. A weighted nexus driver is
 779          * calls this rouitne even if intr_policy has been patched to
 780          * INTR_FLAG_DIST.
 781          */
 782         ASSERT(dip);
 783         if (intr_policy != INTR_WEIGHTED_DIST)
 784                 return;
 785 
 786         eweight = i_ddi_get_intr_weight(dip);
 787         INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
 788             "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
 789             nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
 790             ddi_get_instance(ddi_get_parent(dip)),
 791             ddi_driver_name(dip), ddi_get_instance(dip)));
 792 
 793         /* if no establish weight, establish nexus weight */
 794         if (eweight < 0) {
 795                 if (nweight > 0)
 796                         (void) i_ddi_set_intr_weight(dip, nweight);
 797                 else
 798                         nweight = 0;
 799         } else
 800                 nweight = eweight;      /* use established weight */
 801 
 802         /* Establish exclusion for cpu_intr_weight manipulation */
 803         mutex_enter(&intr_dist_cpu_lock);
 804         cpu[cpuid]->cpu_intr_weight += nweight;
 805 
 806         /* update intr_dist_weight_max */
 807         if (nweight > intr_dist_weight_max)
 808                 intr_dist_weight_max = nweight;
 809         mutex_exit(&intr_dist_cpu_lock);
 810 }
 811 
 812 void
 813 intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
 814 {
 815         struct cpu      *cp;
 816         int32_t         weight;
 817 
 818         ASSERT(dip);
 819         if (intr_policy != INTR_WEIGHTED_DIST)
 820                 return;
 821 
 822         /* remove weight of device from cpu */
 823         weight = i_ddi_get_intr_weight(dip);
 824         if (weight < 0)
 825                 weight = 0;
 826         INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
 827             "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
 828             ddi_driver_name(ddi_get_parent(dip)),
 829             ddi_get_instance(ddi_get_parent(dip)),
 830             ddi_driver_name(dip), ddi_get_instance(dip)));
 831 
 832         /* Establish exclusion for cpu_intr_weight manipulation */
 833         mutex_enter(&intr_dist_cpu_lock);
 834         cp = cpu[cpuid];
 835         cp->cpu_intr_weight -= weight;
 836         if (cp->cpu_intr_weight < 0)
 837                 cp->cpu_intr_weight = 0;     /* sanity */
 838         mutex_exit(&intr_dist_cpu_lock);
 839 }
 840 
 841 ulong_t
 842 create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1)
 843 {
 844         uint64_t inum;
 845 
 846         inum = add_softintr(pil, func, arg1, SOFTINT_MT);
 847         return ((ulong_t)inum);
 848 }
 849 
 850 void
 851 invoke_softint(processorid_t cpuid, ulong_t hdl)
 852 {
 853         uint64_t inum = hdl;
 854 
 855         if (cpuid == CPU->cpu_id)
 856                 setsoftint(inum);
 857         else
 858                 xt_one(cpuid, setsoftint_tl1, inum, 0);
 859 }
 860 
 861 void
 862 remove_softint(ulong_t hdl)
 863 {
 864         uint64_t inum = hdl;
 865 
 866         (void) rem_softintr(inum);
 867 }
 868 
 869 void
 870 sync_softint(cpuset_t set)
 871 {
 872         xt_sync(set);
 873 }