/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

/*
 * PSMI 1.1 extensions are supported only in 2.6 and later versions.
 * PSMI 1.2 extensions are supported only in 2.7 and later versions.
 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
 * PSMI 1.5 extensions are supported in Solaris Nevada.
 * PSMI 1.6 extensions are supported in Solaris Nevada.
 * PSMI 1.7 extensions are supported in Solaris Nevada.
 */
#define PSMI_1_7

#include <sys/processor.h>
#include <sys/time.h>
#include <sys/psm.h>
#include <sys/smp_impldefs.h>
#include <sys/cram.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/psm_common.h>
#include <sys/apic.h>
#include <sys/apic_common.h>
#include <sys/pit.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/pci.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/cpc_impl.h>
#include <sys/uadmin.h>
#include <sys/panic.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/trap.h>
#include <sys/machsystm.h>
#include <sys/cpuvar.h>
#include <sys/rm_platter.h>
#include <sys/privregs.h>
#include <sys/cyclic.h>
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/sunndi.h>


/*
 *      Local Function Prototypes
 */
static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
static void apic_xlate_vector_free_timeout_handler(void *arg);
static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int apicindex, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep);
static int apic_setup_irq_table(dev_info_t *dip, int irqno,
    struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
    int type);
static void apic_try_deferred_reprogram(int ipl, int vect);
static void delete_defer_repro_ent(int which_irq);
static void apic_ioapic_wait_pending_clear(int ioapicindex,
    int intin_no);

extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
    int ipin, int *pci_irqp, iflag_t *intr_flagp);
extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
    int child_ipin, struct apic_io_intr **intrp);
extern uchar_t acpi_find_ioapic(int irq);
extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
extern int apic_find_bus_id(int bustype);
extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);

extern  int apic_sci_vect;
extern  iflag_t apic_sci_flags;
extern  int     apic_intr_policy;
extern  char *psm_name;
/*
 * Maximum value of an unsigned char.  NBBY, the number of bits per
 * byte, comes from <sys/param.h>.
 */
#define UCHAR_MAX       ((1 << NBBY) - 1)

/* Max wait time (in repetitions) for flags to clear in an RDT entry. */
extern int apic_max_reps_clear_pending;

/*
 * The irq # is implicit in the array index.  APIC_MAX_VECTOR + 1 is the
 * maximum # of IRQs as well; ioapic_reprogram_info is indexed by IRQ
 * number, NOT by vector number.
 */
struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];

extern  int     apic_int_busy_mark;
extern  int     apic_int_free_mark;
extern  int     apic_diff_for_redistribution;
extern  int     apic_sample_factor_redistribution;
extern  int     apic_redist_cpu_skip;
extern  int     apic_num_imbalance;
extern  int     apic_num_rebind;

/* timeout for xlate_vector, mark_vector */
int     apic_revector_timeout = 16 * 10000; /* 160 millisec */

extern int      apic_defconf;
extern int      apic_irq_translate;

extern int      apic_use_acpi_madt_only;        /* 1=ONLY use MADT from ACPI */

extern  uchar_t apic_io_vectbase[MAX_IO_APIC];

extern  boolean_t ioapic_mask_workaround[MAX_IO_APIC];

/*
 * First available slot to be used as IRQ index into the apic_irq_table
 * for those interrupts (like MSI/X) that don't have a physical IRQ.
 */
extern int apic_first_avail_irq;

/*
 * apic_defer_reprogram_lock ensures that only one processor is handling
 * deferred interrupt programming at *_intr_exit time.
 */
static  lock_t  apic_defer_reprogram_lock;

/*
 * The current number of deferred reprogrammings outstanding
 */
uint_t  apic_reprogram_outstanding = 0;

#ifdef DEBUG
/*
 * Counters that keep track of deferred reprogramming stats
 */
uint_t  apic_intr_deferrals = 0;
uint_t  apic_intr_deliver_timeouts = 0;
uint_t  apic_last_ditch_reprogram_failures = 0;
uint_t  apic_deferred_setup_failures = 0;
uint_t  apic_defer_repro_total_retries = 0;
uint_t  apic_defer_repro_successes = 0;
uint_t  apic_deferred_spurious_enters = 0;
#endif

extern  int     apic_io_max;
extern  struct apic_io_intr *apic_io_intrp;

uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1];

extern  uint32_t        eisa_level_intr_mask;
        /* At least MSB will be set if EISA bus */

extern  int     apic_pci_bus_total;
extern  uchar_t apic_single_pci_busid;

/*
 * Following declarations are for revectoring; used when ISRs at different
 * IPLs share an irq.
 */
static  lock_t  apic_revector_lock;
int     apic_revector_pending = 0;
static  uchar_t *apic_oldvec_to_newvec;
static  uchar_t *apic_newvec_to_oldvec;

/* ACPI Interrupt Source Override Structure ptr */
ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop;
extern  int acpi_iso_cnt;

/*
 * Auto-configuration routines
 */

/*
 * Initialise the vector->ipl and ipl->pri arrays.  level_intr and
 * irqtable are also set to NULL.  Each vector->irq entry is set to a
 * value which cannot map to a real irq, to show that it is free.
 */
void
apic_init_common(void)
{
        int     i, j, indx;
        int     *iptr;

        /*
         * Initialize apic_ipls from apic_vectortoipl.  This array is
         * used in apic_intr_enter to determine the IPL to use for the
         * corresponding vector.  On some systems, due to hardware errata
         * and interrupt sharing, the IPL may not correspond to the IPL listed
         * in apic_vectortoipl (see apic_addspl and apic_delspl).
         */
        for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) {
                indx = i * APIC_VECTOR_PER_IPL;

                for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++)
                        apic_ipls[indx] = apic_vectortoipl[i];
        }

        /* cpu 0 is always up (for now) */
        apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE;

        iptr = (int *)&apic_irq_table[0];
        for (i = 0; i <= APIC_MAX_VECTOR; i++) {
                apic_level_intr[i] = 0;
                *iptr++ = NULL;
                apic_vector_to_irq[i] = APIC_RESV_IRQ;

                /* These *must* be initted to B_TRUE! */
                apic_reprogram_info[i].done = B_TRUE;
                apic_reprogram_info[i].irqp = NULL;
                apic_reprogram_info[i].tries = 0;
                apic_reprogram_info[i].bindcpu = 0;
        }

        /*
         * Allocate a dummy irq table entry for the reserved entry.
         * This takes care of the race between removing an irq and
         * clock detecting a CPU in that irq during interrupt load
         * sampling.
         */
        apic_irq_table[APIC_RESV_IRQ] =
            kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);

        mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
}

void
ioapic_init_intr(int mask_apic)
{
        int ioapic_ix;
        struct intrspec ispec;
        apic_irq_t *irqptr;
        int i, j;
        ulong_t iflag;

        LOCK_INIT_CLEAR(&apic_revector_lock);
        LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);

        /* mask interrupt vectors */
        for (j = 0; j < apic_io_max && mask_apic; j++) {
                int intin_max;

                ioapic_ix = j;
                /* Bits 23-16 define the maximum redirection entries */
                intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
                    & 0xff;
                for (i = 0; i <= intin_max; i++)
                        ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
        }

        /*
         * Hack alert: deal with ACPI SCI interrupt chicken/egg here
         */
        if (apic_sci_vect > 0) {
                /*
                 * acpica has already done add_avintr(); we just need
                 * to finish the job by mimicking translate_irq().
                 *
                 * Fake up an intrspec and set up the tables.
                 */
                ispec.intrspec_vec = apic_sci_vect;
                ispec.intrspec_pri = SCI_IPL;

                if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
                    &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
                        cmn_err(CE_WARN, "!apic: SCI setup failed");
                        return;
                }
                irqptr = apic_irq_table[apic_sci_vect];

                iflag = intr_clear();
                lock_set(&apic_ioapic_lock);

                /* Program I/O APIC */
                (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);

                lock_clear(&apic_ioapic_lock);
                intr_restore(iflag);

                irqptr->airq_share++;
        }
}
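
/*
 * Illustrative sketch (compiled out): ioapic_init_intr() above derives
 * the size of an I/O APIC's redirection table from bits 23-16 of the
 * version register.  A minimal stand-alone decode, assuming only that
 * register layout, would look like this:
 */
#if 0
static int
ex_ioapic_rdt_entries(uint32_t vers_reg)
{
        /* bits 23-16 hold the index of the last redirection entry */
        int intin_max = (vers_reg >> 16) & 0xff;

        return (intin_max + 1);         /* number of entries */
}
#endif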

/*
 * Add mask bits to disable the interrupt vector from happening
 * at or above the given IPL.  In addition, remove mask bits to enable
 * interrupt vectors below the given IPL.
 *
 * Both addspl and delspl are complicated by the fact that different
 * interrupts may share IRQs.  This can happen in two ways.
 * 1. The same H/W line is shared by more than 1 device
 * 1a. with interrupts at different IPLs
 * 1b. with interrupts at the same IPL
 * 2. We ran out of vectors at a given IPL and started sharing vectors.
 * 1b and 2 should be handled gracefully, except for the fact that some
 * ISRs will get called often when no interrupt is pending for the device.
 * For 1a, we handle it at the higher IPL.
 * (A compiled-out sketch of the virtual-IRQ encoding these routines rely
 * on follows apic_addspl_common() below.)
 */
/*ARGSUSED*/
int
apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
        uchar_t vector;
        ulong_t iflag;
        apic_irq_t *irqptr, *irqheadptr;
        int irqindex;

        ASSERT(max_ipl <= UCHAR_MAX);
        irqindex = IRQINDEX(irqno);

        if ((irqindex == -1) || (!apic_irq_table[irqindex]))
                return (PSM_FAILURE);

        mutex_enter(&airq_mutex);
        irqptr = irqheadptr = apic_irq_table[irqindex];

        DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
            "vector=0x%x\n", (void *)irqptr->airq_dip,
            irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

        while (irqptr) {
                if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
                        break;
                irqptr = irqptr->airq_next;
        }
        irqptr->airq_share++;

        mutex_exit(&airq_mutex);

        /* return if it is not a hardware interrupt */
        if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
                return (PSM_SUCCESS);
        /* Or if there are more interrupts at a higher IPL */
        if (ipl != max_ipl)
                return (PSM_SUCCESS);

        /*
         * if apic_picinit() has not been called yet, just return.
         * At the end of apic_picinit(), we will call setup_io_intr().
         */

        if (!apic_picinit_called)
                return (PSM_SUCCESS);

        /*
         * Upgrade the vector if max_ipl is not the same as the current
         * ipl.  If we cannot allocate, return failure.
         */
        if (irqptr->airq_ipl != max_ipl &&
            !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

                vector = apic_allocate_vector(max_ipl, irqindex, 1);
                if (vector == 0) {
                        irqptr->airq_share--;
                        return (PSM_FAILURE);
                }
                irqptr = irqheadptr;
                apic_mark_vector(irqptr->airq_vector, vector);
                while (irqptr) {
                        irqptr->airq_vector = vector;
                        irqptr->airq_ipl = (uchar_t)max_ipl;
                        /*
                         * reprogram the irq being added and everyone else
                         * that is not in the UNINIT state
                         */
                        if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
                            irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
                                apic_record_rdt_entry(irqptr, irqindex);

                                iflag = intr_clear();
                                lock_set(&apic_ioapic_lock);

                                (void) apic_setup_io_intr(irqptr, irqindex,
                                    B_FALSE);

                                lock_clear(&apic_ioapic_lock);
                                intr_restore(iflag);
                        }
                        irqptr = irqptr->airq_next;
                }
                return (PSM_SUCCESS);

        } else if (irqptr->airq_ipl != max_ipl &&
            ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
                /*
                 * We cannot upgrade the vector, but we can change
                 * the IPL that this vector induces.
                 *
                 * Note that we subtract APIC_BASE_VECT from the vector
                 * here because this array is used in apic_intr_enter
                 * (no need to add APIC_BASE_VECT in that hot code
                 * path since we can do it in the rarely-executed path
                 * here).
                 */
                apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
                    (uchar_t)max_ipl;

                irqptr = irqheadptr;
                while (irqptr) {
                        irqptr->airq_ipl = (uchar_t)max_ipl;
                        irqptr = irqptr->airq_next;
                }

                return (PSM_SUCCESS);
        }

        ASSERT(irqptr);

        iflag = intr_clear();
        lock_set(&apic_ioapic_lock);

        (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);

        lock_clear(&apic_ioapic_lock);
        intr_restore(iflag);

        return (PSM_SUCCESS);
}
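
/*
 * Illustrative sketch (compiled out) of the virtual-IRQ encoding that
 * addspl/delspl rely on.  EX_VIRTIRQ/EX_IRQINDEX are hypothetical
 * stand-ins for VIRTIRQ()/IRQINDEX(), assuming the low byte carries the
 * irq table index and the next byte carries the share id.
 */
#if 0
#define EX_IRQINDEX(virq)       ((virq) & 0xff)
#define EX_VIRTIRQ(irqno, sid)  ((irqno) | ((sid) << 8))

static int
ex_virtirq_roundtrip(void)
{
        int virq = EX_VIRTIRQ(0x10, 2);         /* slot 0x10, share id 2 */

        return (EX_IRQINDEX(virq) == 0x10);     /* recovers the slot */
}
#endif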

/*
 * Recompute mask bits for the given interrupt vector.
 * If there is no interrupt servicing routine for this
 * vector, this function should disable the interrupt vector
 * from happening at all IPLs.  If there are still
 * handlers using the given vector, this function should
 * disable the given vector from happening below the lowest
 * IPL of the remaining handlers.
 */
/*ARGSUSED*/
int
apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
        uchar_t vector;
        uint32_t bind_cpu;
        int intin, irqindex;
        int ioapic_ix;
        apic_irq_t      *irqptr, *preirqptr, *irqheadptr, *irqp;
        ulong_t iflag;

        mutex_enter(&airq_mutex);
        irqindex = IRQINDEX(irqno);
        irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];

        DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
            "vector=0x%x\n", (void *)irqptr->airq_dip,
            irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

        while (irqptr) {
                if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
                        break;
                preirqptr = irqptr;
                irqptr = irqptr->airq_next;
        }
        ASSERT(irqptr);

        irqptr->airq_share--;

        mutex_exit(&airq_mutex);

        /*
         * If there are more interrupts at a higher IPL, we don't need
         * to disable anything.
         */
        if (ipl < max_ipl)
                return (PSM_SUCCESS);

        /* return if it is not a hardware interrupt */
        if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
                return (PSM_SUCCESS);

        if (!apic_picinit_called) {
                /*
                 * Clear irq_struct.  If two devices shared an interrupt
                 * line and one unloaded before picinit, we are hosed; we
                 * can only hope the machine survives.
                 */
                irqptr->airq_mps_intr_index = FREE_INDEX;
                irqptr->airq_temp_cpu = IRQ_UNINIT;
                apic_free_vector(irqptr->airq_vector);
                return (PSM_SUCCESS);
        }
        /*
         * Downgrade the vector to the new max_ipl if needed.  If we cannot
         * allocate, use the old IPL.  Not very elegant, but it should work.
         */
        if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
            !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
                apic_irq_t      *irqp;
                if ((vector = apic_allocate_vector(max_ipl, irqno, 1)) != 0) {
                        apic_mark_vector(irqheadptr->airq_vector, vector);
                        irqp = irqheadptr;
                        while (irqp) {
                                irqp->airq_vector = vector;
                                irqp->airq_ipl = (uchar_t)max_ipl;
                                if (irqp->airq_temp_cpu != IRQ_UNINIT) {
                                        apic_record_rdt_entry(irqp, irqindex);

                                        iflag = intr_clear();
                                        lock_set(&apic_ioapic_lock);

                                        (void) apic_setup_io_intr(irqp,
                                            irqindex, B_FALSE);

                                        lock_clear(&apic_ioapic_lock);
                                        intr_restore(iflag);
                                }
                                irqp = irqp->airq_next;
                        }
                }

        } else if (irqptr->airq_ipl != max_ipl &&
            max_ipl != PSM_INVALID_IPL &&
            ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

        /*
         * We cannot downgrade the IPL of the vector below the vector's
         * hardware priority.  If we did, it would be possible for a
         * higher-priority hardware vector to interrupt a CPU running at an IPL
         * lower than the hardware priority of the interrupting vector (but
         * higher than the soft IPL of this IRQ).  When this happens, we would
         * then try to drop the IPL BELOW what it was (effectively dropping
         * below base_spl), which would be potentially catastrophic.
         *
         * (e.g. Suppose the hardware vector associated with this IRQ is 0x40
         * (hardware IPL of 4).  Further assume that the old IPL of this IRQ
         * was 4, but the new IPL is 1.  If we forced vector 0x40 to result in
         * an IPL of 1, it would be possible for the processor to be executing
         * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting
         * the currently-executing ISR.  When apic_intr_enter consults
         * apic_ipls[], it will return 1, bringing the IPL of the CPU down to 1
         * so even though the processor was running at IPL 4, an IPL 1
         * interrupt will have interrupted it, which must not happen.)
         *
         * Effectively, this means that the hardware priority corresponding to
         * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's
         * hardware priority.
         *
         * (In the above example, then, after removal of the IPL 4 device's
         * interrupt handler, the new IPL will continue to be 4 because the
         * hardware priority that IPL 1 implies is lower than the hardware
         * priority of the vector used.)
         *
         * (A compiled-out arithmetic sketch of this clamp follows this
         * function.)
         */
                /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
                const int apic_ipls_index = irqptr->airq_vector -
                    APIC_BASE_VECT;
                const int vect_inherent_hwpri = irqptr->airq_vector >>
                    APIC_IPL_SHIFT;

                /*
                 * If there are still devices using this IRQ, determine the
                 * new ipl to use.
                 */
                if (irqptr->airq_share) {
                        int vect_desired_hwpri, hwpri;

                        ASSERT(max_ipl < MAXIPL);
                        vect_desired_hwpri = apic_ipltopri[max_ipl] >>
                            APIC_IPL_SHIFT;

                        /*
                         * If the desired IPL's hardware priority is lower
                         * than that of the vector, use the hardware priority
                         * of the vector to determine the new IPL.
                         */
                        hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
                            vect_inherent_hwpri : vect_desired_hwpri;

                        /*
                         * Now, to get the right index for apic_vectortoipl,
                         * we need to subtract APIC_BASE_VECT from the
                         * hardware-vector-equivalent (in hwpri).  Since hwpri
                         * is already shifted, we shift APIC_BASE_VECT before
                         * doing the subtraction.
                         */
                        hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);

                        ASSERT(hwpri >= 0);
                        ASSERT(hwpri < MAXIPL);
                        max_ipl = apic_vectortoipl[hwpri];
                        apic_ipls[apic_ipls_index] = max_ipl;

                        irqp = irqheadptr;
                        while (irqp) {
                                irqp->airq_ipl = (uchar_t)max_ipl;
                                irqp = irqp->airq_next;
                        }
                } else {
                        /*
                         * No more devices on this IRQ, so reset this vector's
                         * element in apic_ipls to the original IPL for this
                         * vector
                         */
                        apic_ipls[apic_ipls_index] =
                            apic_vectortoipl[vect_inherent_hwpri];
                }
        }

        /*
         * If there are still active interrupts, we are done.
         */
        if (irqptr->airq_share)
                return (PSM_SUCCESS);

        iflag = intr_clear();
        lock_set(&apic_ioapic_lock);

        if (irqptr->airq_mps_intr_index == MSI_INDEX) {
                /*
                 * Disable the MSI vector.
                 * Make sure we only disable it on the last of the
                 * multi-MSI vectors.
                 */
                if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
                        apic_pci_msi_disable_mode(irqptr->airq_dip,
                            DDI_INTR_TYPE_MSI);
                }
        } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
                /*
                 * Disable the MSI-X vector; we need to clear the mask
                 * and addr/data for each MSI-X vector.
                 */
                apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
                    irqptr->airq_origirq);
                /*
                 * Make sure we only disable on the last MSI-X
                 */
                if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
                        apic_pci_msi_disable_mode(irqptr->airq_dip,
                            DDI_INTR_TYPE_MSIX);
                }
        } else {
                /*
                 * The assumption here is that this is safe, even for
                 * systems with IOAPICs that suffer from the hardware
                 * erratum because all devices have been quiesced before
                 * they unregister their interrupt handlers.  If that
                 * assumption turns out to be false, this mask operation
                 * can induce the same erratum result we're trying to
                 * avoid.
                 */
                ioapic_ix = irqptr->airq_ioapicindex;
                intin = irqptr->airq_intin_no;
                ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
        }

        /*
         * This irq entry is the only one in the chain.
         */
        if (irqheadptr->airq_next == NULL) {
                ASSERT(irqheadptr == irqptr);
                bind_cpu = irqptr->airq_temp_cpu;
                if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
                    ((uint32_t)bind_cpu != IRQ_UNINIT)) {
                        ASSERT(apic_cpu_in_range(bind_cpu));
                        if (bind_cpu & IRQ_USER_BOUND) {
                                /* If hardbound, temp_cpu == cpu */
                                bind_cpu &= ~IRQ_USER_BOUND;
                                apic_cpus[bind_cpu].aci_bound--;
                        } else
                                apic_cpus[bind_cpu].aci_temp_bound--;
                }
                irqptr->airq_temp_cpu = IRQ_UNINIT;
                irqptr->airq_mps_intr_index = FREE_INDEX;
                lock_clear(&apic_ioapic_lock);
                intr_restore(iflag);
                apic_free_vector(irqptr->airq_vector);
                return (PSM_SUCCESS);
        }

        /*
         * If we get here, we are sharing the vector and there is more than
         * one active irq entry in the chain.
         */
        lock_clear(&apic_ioapic_lock);
        intr_restore(iflag);

        mutex_enter(&airq_mutex);
        /* Remove the irq entry from the chain */
        if (irqptr == irqheadptr) { /* The irq entry is at the head */
                apic_irq_table[irqindex] = irqptr->airq_next;
        } else {
                preirqptr->airq_next = irqptr->airq_next;
        }
        /* Free the irq entry */
        kmem_free(irqptr, sizeof (apic_irq_t));
        mutex_exit(&airq_mutex);

        return (PSM_SUCCESS);
}
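
/*
 * Worked sketch (compiled out) of the hardware-priority clamp in
 * apic_delspl_common() above, using the example from the block comment
 * there: vector 0x40 has inherent hardware priority 0x40 >> 4 == 4, so
 * a desired priority below 4 is clamped up to 4.  The shift of 4 is an
 * assumed stand-in for APIC_IPL_SHIFT.
 */
#if 0
static int
ex_clamp_hwpri(int vector, int desired_hwpri)
{
        int inherent_hwpri = vector >> 4;       /* e.g. 0x40 >> 4 == 4 */

        return ((desired_hwpri < inherent_hwpri) ?
            inherent_hwpri : desired_hwpri);
}
#endif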

/*
 * apic_introp_xlate() replaces apic_translate_irq() and is
 * called only from apic_intr_ops().  With the new ADII framework,
 * the priority can no longer be retrieved through i_ddi_get_intrspec().
 * It has to be passed in from the caller.
 *
 * Return value:
 *      Success: irqno for the given device
 *      Failure: -1
 */
int
apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
{
        char dev_type[16];
        int dev_len, pci_irq, newirq, bustype, devid, busid, i;
        int irqno = ispec->intrspec_vec;
        ddi_acc_handle_t cfg_handle;
        uchar_t ipin;
        struct apic_io_intr *intrp;
        iflag_t intr_flag;
        ACPI_SUBTABLE_HEADER    *hp;
        ACPI_MADT_INTERRUPT_OVERRIDE *isop;
        apic_irq_t *airqp;
        int parent_is_pci_or_pciex = 0;
        int child_is_pciex = 0;

        DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
            "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
            irqno));

        dev_len = sizeof (dev_type);
        if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
            DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
            &dev_len) == DDI_PROP_SUCCESS) {
                if ((strcmp(dev_type, "pci") == 0) ||
                    (strcmp(dev_type, "pciex") == 0))
                        parent_is_pci_or_pciex = 1;
        }

        if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
            DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
            &dev_len) == DDI_PROP_SUCCESS) {
                if (strstr(dev_type, "pciex"))
                        child_is_pciex = 1;
        }

        if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
                if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
                        airqp->airq_iflag.bustype =
                            child_is_pciex ? BUS_PCIE : BUS_PCI;
                        return (apic_vector_to_irq[airqp->airq_vector]);
                }
                return (apic_setup_irq_table(dip, irqno, NULL, ispec,
                    NULL, type));
        }

        bustype = 0;

        /* check if we have already translated this irq */
        mutex_enter(&airq_mutex);
        newirq = apic_min_device_irq;
        for (; newirq <= apic_max_device_irq; newirq++) {
                airqp = apic_irq_table[newirq];
                while (airqp) {
                        if ((airqp->airq_dip == dip) &&
                            (airqp->airq_origirq == irqno) &&
                            (airqp->airq_mps_intr_index != FREE_INDEX)) {

                                mutex_exit(&airq_mutex);
                                return (VIRTIRQ(newirq, airqp->airq_share_id));
                        }
                        airqp = airqp->airq_next;
                }
        }
        mutex_exit(&airq_mutex);

        if (apic_defconf)
                goto defconf;

        if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi))
                goto nonpci;

        if (parent_is_pci_or_pciex) {
                /* pci device */
                if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0)
                        goto nonpci;
                if (busid == 0 && apic_pci_bus_total == 1)
                        busid = (int)apic_single_pci_busid;

                if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS)
                        return (-1);
                ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA;
                pci_config_teardown(&cfg_handle);
                if (apic_enable_acpi && !apic_use_acpi_madt_only) {
                        if (apic_acpi_translate_pci_irq(dip, busid, devid,
                            ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS)
                                return (-1);

                        intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI;
                        return (apic_setup_irq_table(dip, pci_irq, NULL, ispec,
                            &intr_flag, type));
                } else {
                        pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3);
                        if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid))
                            == NULL) {
                                if ((pci_irq = apic_handle_pci_pci_bridge(dip,
                                    devid, ipin, &intrp)) == -1)
                                        return (-1);
                        }
                        return (apic_setup_irq_table(dip, pci_irq, intrp, ispec,
                            NULL, type));
                }
        } else if (strcmp(dev_type, "isa") == 0)
                bustype = BUS_ISA;
        else if (strcmp(dev_type, "eisa") == 0)
                bustype = BUS_EISA;

nonpci:
        if (apic_enable_acpi && !apic_use_acpi_madt_only) {
                /* search iso entries first */
                if (acpi_iso_cnt != 0) {
                        hp = (ACPI_SUBTABLE_HEADER *)acpi_isop;
                        i = 0;
                        while (i < acpi_iso_cnt) {
                                if (hp->Type ==
                                    ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) {
                                        isop =
                                            (ACPI_MADT_INTERRUPT_OVERRIDE *)hp;
                                        if (isop->Bus == 0 &&
                                            isop->SourceIrq == irqno) {
                                                newirq = isop->GlobalIrq;
                                                intr_flag.intr_po =
                                                    isop->IntiFlags &
                                                    ACPI_MADT_POLARITY_MASK;
                                                intr_flag.intr_el =
                                                    (isop->IntiFlags &
                                                    ACPI_MADT_TRIGGER_MASK)
                                                    >> 2;
                                                intr_flag.bustype = BUS_ISA;

                                                return (apic_setup_irq_table(
                                                    dip, newirq, NULL, ispec,
                                                    &intr_flag, type));

                                        }
                                        i++;
                                }
                                hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) +
                                    hp->Length);
                        }
                }
                intr_flag.intr_po = INTR_PO_ACTIVE_HIGH;
                intr_flag.intr_el = INTR_EL_EDGE;
                intr_flag.bustype = BUS_ISA;
                return (apic_setup_irq_table(dip, irqno, NULL, ispec,
                    &intr_flag, type));
        } else {
                if (bustype == 0)       /* not initialized */
                        bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA;
                for (i = 0; i < 2; i++) {
                        if (((busid = apic_find_bus_id(bustype)) != -1) &&
                            ((intrp = apic_find_io_intr_w_busid(irqno, busid))
                            != NULL)) {
                                if ((newirq = apic_setup_irq_table(dip, irqno,
                                    intrp, ispec, NULL, type)) != -1) {
                                        return (newirq);
                                }
                                goto defconf;
                        }
                        bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA;
                }
        }

/* MPS default configuration */
defconf:
        newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type);
        if (newirq == -1)
                return (-1);
        ASSERT(IRQINDEX(newirq) == irqno);
        ASSERT(apic_irq_table[irqno]);
        return (newirq);
}
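
/*
 * Illustrative sketch (compiled out) of the variable-length subtable
 * walk used in apic_introp_xlate() above: each MADT entry begins with a
 * {Type, Length} header, and the next entry starts Length bytes on.
 * ex_subtable_t is a hypothetical stand-in for ACPI_SUBTABLE_HEADER.
 */
#if 0
typedef struct {
        uint8_t Type;
        uint8_t Length;
} ex_subtable_t;

static int
ex_count_type(const uint8_t *buf, size_t buflen, uint8_t type)
{
        size_t off = 0;
        int count = 0;

        while (off + sizeof (ex_subtable_t) <= buflen) {
                const ex_subtable_t *hp = (const ex_subtable_t *)(buf + off);

                if (hp->Length == 0)
                        break;                  /* malformed table */
                if (hp->Type == type)
                        count++;
                off += hp->Length;              /* advance by entry size */
        }
        return (count);
}
#endif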

/*
 * Attempt to share vector with someone else
 */
static int
apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl,
        uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp)
{
#ifdef DEBUG
        apic_irq_t *tmpirqp = NULL;
#endif /* DEBUG */
        apic_irq_t *irqptr, dummyirq;
        int     newirq, chosen_irq = -1, share = 127;
        int     lowest, highest, i;
        uchar_t share_id;

        DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x "
            "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl));

        highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK;
        lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL;

        if (highest < lowest) /* Both ipl and ipl-1 map to same pri */
                lowest -= APIC_VECTOR_PER_IPL;
        dummyirq.airq_mps_intr_index = intr_index;
        dummyirq.airq_ioapicindex = ioapicindex;
        dummyirq.airq_intin_no = ipin;
        if (intr_flagp)
                dummyirq.airq_iflag = *intr_flagp;
        apic_record_rdt_entry(&dummyirq, irqno);
        for (i = lowest; i <= highest; i++) {
                newirq = apic_vector_to_irq[i];
                if (newirq == APIC_RESV_IRQ)
                        continue;
                irqptr = apic_irq_table[newirq];

                if ((dummyirq.airq_rdt_entry & 0xFF00) !=
                    (irqptr->airq_rdt_entry & 0xFF00))
                        /* not compatible */
                        continue;

                if (irqptr->airq_share < share) {
                        share = irqptr->airq_share;
                        chosen_irq = newirq;
                }
        }
        if (chosen_irq != -1) {
                /*
                 * Assign a share id which is free or which is larger
                 * than the largest one.
                 */
                share_id = 1;
                mutex_enter(&airq_mutex);
                irqptr = apic_irq_table[chosen_irq];
                while (irqptr) {
                        if (irqptr->airq_mps_intr_index == FREE_INDEX) {
                                share_id = irqptr->airq_share_id;
                                break;
                        }
                        if (share_id <= irqptr->airq_share_id)
                                share_id = irqptr->airq_share_id + 1;
#ifdef DEBUG
                        tmpirqp = irqptr;
#endif /* DEBUG */
                        irqptr = irqptr->airq_next;
                }
                if (!irqptr) {
                        irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
                        irqptr->airq_temp_cpu = IRQ_UNINIT;
                        irqptr->airq_next =
                            apic_irq_table[chosen_irq]->airq_next;
                        apic_irq_table[chosen_irq]->airq_next = irqptr;
#ifdef  DEBUG
                        tmpirqp = apic_irq_table[chosen_irq];
#endif /* DEBUG */
                }
                irqptr->airq_mps_intr_index = intr_index;
                irqptr->airq_ioapicindex = ioapicindex;
                irqptr->airq_intin_no = ipin;
                if (intr_flagp)
                        irqptr->airq_iflag = *intr_flagp;
                irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector;
                irqptr->airq_share_id = share_id;
                apic_record_rdt_entry(irqptr, irqno);
                *irqptrp = irqptr;
#ifdef  DEBUG
                /* shuffle the pointers to test apic_delspl path */
                if (tmpirqp) {
                        tmpirqp->airq_next = irqptr->airq_next;
                        irqptr->airq_next = apic_irq_table[chosen_irq];
                        apic_irq_table[chosen_irq] = irqptr;
                }
#endif /* DEBUG */
                mutex_exit(&airq_mutex);
                return (VIRTIRQ(chosen_irq, share_id));
        }
        return (-1);
}
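
/*
 * Illustrative sketch (compiled out) of the per-IPL vector range that
 * apic_share_vector() scans.  The constants 16 and 0x0f are assumed
 * stand-ins for APIC_VECTOR_PER_IPL and APIC_VECTOR_MASK, and ipltopri
 * for apic_ipltopri[].
 */
#if 0
static void
ex_ipl_vector_range(const int *ipltopri, int ipl, int *lowp, int *highp)
{
        int highest = ipltopri[ipl] + 0x0f;
        int lowest = ipltopri[ipl - 1] + 16;

        if (highest < lowest)           /* ipl and ipl-1 share a priority */
                lowest -= 16;
        *lowp = lowest;
        *highp = highest;
}
#endif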

/*
 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry
 * is used already, we will try to allocate a new irqno.
 *
 * Return value:
 *      Success: irqno
 *      Failure: -1
 */
static int
apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
    struct intrspec *ispec, iflag_t *intr_flagp, int type)
{
        int origirq = ispec->intrspec_vec;
        uchar_t ipl = ispec->intrspec_pri;
        int     newirq, intr_index;
        uchar_t ipin, ioapic, ioapicindex, vector;
        apic_irq_t *irqptr;
        major_t major;
        dev_info_t      *sdip;

        DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
            "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));

        ASSERT(ispec != NULL);

        major = (dip != NULL) ? ddi_driver_major(dip) : 0;

        if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
                /* MSI/X interrupts don't need any I/O APIC setup */
                ioapicindex = 0xff;
                ioapic = 0xff;
                ipin = (uchar_t)0xff;
                intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
                    MSIX_INDEX;
                mutex_enter(&airq_mutex);
                if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
                        mutex_exit(&airq_mutex);
                        /* need an irq for MSI/X to index into autovect[] */
                        cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
                            ddi_get_name(dip), ddi_get_instance(dip));
                        return (-1);
                }
                mutex_exit(&airq_mutex);

        } else if (intrp != NULL) {
                intr_index = (int)(intrp - apic_io_intrp);
                ioapic = intrp->intr_destid;
                ipin = intrp->intr_destintin;
                /* Find ioapicindex. If destid was ALL, we will exit with 0. */
                for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
                        if (apic_io_id[ioapicindex] == ioapic)
                                break;
                ASSERT((ioapic == apic_io_id[ioapicindex]) ||
                    (ioapic == INTR_ALL_APIC));

                /* check whether this intin# has been used by another irqno */
                if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
                        return (newirq);
                }

        } else if (intr_flagp != NULL) {
                /* ACPI case */
                intr_index = ACPI_INDEX;
                ioapicindex = acpi_find_ioapic(irqno);
                ASSERT(ioapicindex != 0xFF);
                ioapic = apic_io_id[ioapicindex];
                ipin = irqno - apic_io_vectbase[ioapicindex];
                if (apic_irq_table[irqno] &&
                    apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
                        ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
                            apic_irq_table[irqno]->airq_ioapicindex ==
                            ioapicindex);
                        return (irqno);
                }

        } else {
                /* default configuration */
                ioapicindex = 0;
                ioapic = apic_io_id[ioapicindex];
                ipin = (uchar_t)irqno;
                intr_index = DEFAULT_INDEX;
        }

        if (ispec == NULL) {
                APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n",
                    irqno));
        } else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
                if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
                    ipl, ioapicindex, ipin, &irqptr)) != -1) {
                        irqptr->airq_ipl = ipl;
                        irqptr->airq_origirq = (uchar_t)origirq;
                        irqptr->airq_dip = dip;
                        irqptr->airq_major = major;
                        sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
                        /* A NULL sdip is fine here; it means the SCI */
                        if (sdip == NULL) {
                                cmn_err(CE_WARN, "Sharing vectors: %s"
                                    " instance %d and SCI",
                                    ddi_get_name(dip), ddi_get_instance(dip));
                        } else {
                                cmn_err(CE_WARN, "Sharing vectors: %s"
                                    " instance %d and %s instance %d",
                                    ddi_get_name(sdip), ddi_get_instance(sdip),
                                    ddi_get_name(dip), ddi_get_instance(dip));
                        }
                        return (newirq);
                }
                /* try high priority allocation now that sharing has failed */
                if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
                        cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
                            ddi_get_name(dip), ddi_get_instance(dip));
                        return (-1);
                }
        }

        mutex_enter(&airq_mutex);
        if (apic_irq_table[irqno] == NULL) {
                irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
                irqptr->airq_temp_cpu = IRQ_UNINIT;
                apic_irq_table[irqno] = irqptr;
        } else {
                irqptr = apic_irq_table[irqno];
                if (irqptr->airq_mps_intr_index != FREE_INDEX) {
                        /*
                         * The slot is used by another irqno, so allocate
                         * a free irqno for this interrupt
                         */
                        newirq = apic_allocate_irq(apic_first_avail_irq);
                        if (newirq == -1) {
                                mutex_exit(&airq_mutex);
                                return (-1);
                        }
                        irqno = newirq;
                        irqptr = apic_irq_table[irqno];
                        if (irqptr == NULL) {
                                irqptr = kmem_zalloc(sizeof (apic_irq_t),
                                    KM_SLEEP);
                                irqptr->airq_temp_cpu = IRQ_UNINIT;
                                apic_irq_table[irqno] = irqptr;
                        }
                        vector = apic_modify_vector(vector, newirq);
                }
        }
        apic_max_device_irq = max(irqno, apic_max_device_irq);
        apic_min_device_irq = min(irqno, apic_min_device_irq);
        mutex_exit(&airq_mutex);
        irqptr->airq_ioapicindex = ioapicindex;
        irqptr->airq_intin_no = ipin;
        irqptr->airq_ipl = ipl;
        irqptr->airq_vector = vector;
        irqptr->airq_origirq = (uchar_t)origirq;
        irqptr->airq_share_id = 0;
        irqptr->airq_mps_intr_index = (short)intr_index;
        irqptr->airq_dip = dip;
        irqptr->airq_major = major;
        irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
        if (intr_flagp)
                irqptr->airq_iflag = *intr_flagp;

        if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
                /* setup I/O APIC entry for non-MSI/X interrupts */
                apic_record_rdt_entry(irqptr, irqno);
        }
        return (irqno);
}

/*
 * Return the cpu to which this intr should be bound.
 * Check properties or any other mechanism to see if the user wants it
 * bound to a specific CPU.  If so, return the cpu id with the high bit set.
 * If not, use the policy to choose a cpu and return the id.
 */
uint32_t
apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin)
{
        int     instance, instno, prop_len, bind_cpu, count;
        uint_t  i, rc;
        uint32_t cpu;
        major_t major;
        char    *name, *drv_name, *prop_val, *cptr;
        char    prop_name[32];
        ulong_t iflag;


        if (apic_intr_policy == INTR_LOWEST_PRIORITY)
                return (IRQ_UNBOUND);

        if (apic_nproc == 1)
                return (0);

        drv_name = NULL;
        rc = DDI_PROP_NOT_FOUND;
        major = (major_t)-1;
        if (dip != NULL) {
                name = ddi_get_name(dip);
                major = ddi_name_to_major(name);
                drv_name = ddi_major_to_name(major);
                instance = ddi_get_instance(dip);
                if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
                        i = apic_min_device_irq;
                        for (; i <= apic_max_device_irq; i++) {

                                if ((i == irq) || (apic_irq_table[i] == NULL) ||
                                    (apic_irq_table[i]->airq_mps_intr_index
                                    == FREE_INDEX))
                                        continue;

                                if ((apic_irq_table[i]->airq_major == major) &&
                                    (!(apic_irq_table[i]->airq_cpu &
                                    IRQ_USER_BOUND))) {

                                        cpu = apic_irq_table[i]->airq_cpu;

                                        cmn_err(CE_CONT,
                                            "!%s: %s (%s) instance #%d "
                                            "irq 0x%x vector 0x%x ioapic 0x%x "
                                            "intin 0x%x is bound to cpu %d\n",
                                            psm_name,
                                            name, drv_name, instance, irq,
                                            apic_irq_table[irq]->airq_vector,
                                            ioapicid, intin, cpu);
                                        return (cpu);
                                }
                        }
                }
                /*
                 * Search for the "drvname"_intpt_bind_cpus property first.
                 * The syntax of the property should be "a[,b,c,...]" where
                 * instance 0 binds to cpu a, instance 1 binds to cpu b,
                 * instance 2 binds to cpu c, and so on.
                 * ddi_getlongprop() will search /option first, then /.
                 * If "drvname"_intpt_bind_cpus doesn't exist, then look for
                 * the intpt_bind_cpus property.  The syntax is the same, and
                 * it applies to all devices whose "drvname"-specific
                 * property doesn't exist.
                 * (A compiled-out sketch of this selection follows the
                 * function.)
                 */
                (void) strcpy(prop_name, drv_name);
                (void) strcat(prop_name, "_intpt_bind_cpus");
                rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name,
                    (caddr_t)&prop_val, &prop_len);
                if (rc != DDI_PROP_SUCCESS) {
                        rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0,
                            "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len);
                }
        }
1240         if (rc == DDI_PROP_SUCCESS) {
1241                 for (i = count = 0; i < (prop_len - 1); i++)
1242                         if (prop_val[i] == ',')
1243                                 count++;
1244                 if (prop_val[i-1] != ',')
1245                         count++;
1246                 /*
1247                  * if somehow the binding instances defined in the
1248                  * property are not enough for this instno., then
1249                  * reuse the pattern for the next instance until
1250                  * it reaches the requested instno
1251                  */
1252                 instno = instance % count;
1253                 i = 0;
1254                 cptr = prop_val;
1255                 while (i < instno)
1256                         if (*cptr++ == ',')
1257                                 i++;
1258                 bind_cpu = stoi(&cptr);
1259                 /* if the specified CPU is bogus, then default to next cpu */
1260                 if (!apic_cpu_in_range(bind_cpu)) {
1261                         cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present",
1262                             psm_name, prop_name, prop_val, bind_cpu);
1263                         rc = DDI_PROP_NOT_FOUND;
1264                 } else {
1265                         /* indicate that we are bound at user request */
1266                         bind_cpu |= IRQ_USER_BOUND;
1267                 }
1268                 kmem_free(prop_val, prop_len);	/* free only after last use */
1269                 /*
1270                  * No need to check apic_cpus[].aci_status; if the specified
1271                  * CPU is not up yet, post_cpu_start will handle it.
1272                  */
1273         }
1274         if (rc != DDI_PROP_SUCCESS) {
1275                 iflag = intr_clear();
1276                 lock_set(&apic_ioapic_lock);
1277                 bind_cpu = apic_get_next_bind_cpu();
1278                 lock_clear(&apic_ioapic_lock);
1279                 intr_restore(iflag);
1280         }
1281 
1282         if (drv_name != NULL)
1283                 cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x "
1284                     "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1285                     psm_name, name, drv_name, instance, irq,
1286                     apic_irq_table[irq]->airq_vector, ioapicid, intin,
1287                     bind_cpu & ~IRQ_USER_BOUND);
1288         else
1289                 cmn_err(CE_CONT, "!%s: irq 0x%x "
1290                     "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1291                     psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid,
1292                     intin, bind_cpu & ~IRQ_USER_BOUND);
1293 
1294         return ((uint32_t)bind_cpu);
1295 }
1296 
1297 /*
1298  * Mark a vector as being in the process of being deleted. Interrupts
1299  * may still come in on some CPU. The moment an interrupt arrives on
1300  * the new vector, we know we can free the old one. Called only from
1301  * addspl and delspl with interrupts disabled. Because an interrupt
1302  * can be shared and yet no interrupt from any of the sharing devices
1303  * may arrive, we also use a timeout mechanism, arbitrarily set to
1304  * apic_revector_timeout microseconds.
1305  */
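     /*
      * A sketch of the mechanism (vector numbers hypothetical): if vector
      * 0x60 is being replaced by vector 0x71, apic_mark_vector() records
      *
      *	apic_oldvec_to_newvec[0x60] = 0x71;
      *	apic_newvec_to_oldvec[0x71] = 0x60;
      *
      * The first interrupt that then arrives on 0x71 lets apic_xlate_vector()
      * free 0x60; if neither vector ever fires, the timeout handler frees it
      * after apic_revector_timeout microseconds.
      */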
1306 static void
1307 apic_mark_vector(uchar_t oldvector, uchar_t newvector)
1308 {
1309         ulong_t iflag;
1310 
1311         iflag = intr_clear();
1312         lock_set(&apic_revector_lock);
1313         if (!apic_oldvec_to_newvec) {
1314                 apic_oldvec_to_newvec =
1315                     kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2,
1316                     KM_NOSLEEP);
1317 
1318                 if (!apic_oldvec_to_newvec) {
1319                         /*
1320                          * This failure is not catastrophic.
1321                          * But, the oldvec will never be freed.
1322                          */
1323                         apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
1324                         lock_clear(&apic_revector_lock);
1325                         intr_restore(iflag);
1326                         return;
1327                 }
1328                 apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
1329         }
1330 
1331         /* See if we already did this for drivers which do double addintrs */
1332         if (apic_oldvec_to_newvec[oldvector] != newvector) {
1333                 apic_oldvec_to_newvec[oldvector] = newvector;
1334                 apic_newvec_to_oldvec[newvector] = oldvector;
1335                 apic_revector_pending++;
1336         }
1337         lock_clear(&apic_revector_lock);
1338         intr_restore(iflag);
1339         (void) timeout(apic_xlate_vector_free_timeout_handler,
1340             (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
1341 }
1342 
1343 /*
1344  * apic_xlate_vector is called from intr_enter if apic_revector_pending is
1345  * set.  It translates the vector if needed and marks the old vector free.
1346  */
1347 uchar_t
1348 apic_xlate_vector(uchar_t vector)
1349 {
1350         uchar_t newvector, oldvector = 0;
1351 
1352         lock_set(&apic_revector_lock);
1353         /* Do we really need to do this? */
1354         if (!apic_revector_pending) {
1355                 lock_clear(&apic_revector_lock);
1356                 return (vector);
1357         }
1358         if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
1359                 oldvector = vector;
1360         else {
1361                 /*
1362                  * The incoming vector is new.  See if a stale entry
1363                  * remains for it.
1364                  */
1365                 if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
1366                         newvector = vector;
1367         }
1368 
1369         if (oldvector) {
1370                 apic_revector_pending--;
1371                 apic_oldvec_to_newvec[oldvector] = 0;
1372                 apic_newvec_to_oldvec[newvector] = 0;
1373                 apic_free_vector(oldvector);
1374                 lock_clear(&apic_revector_lock);
1375                 /* There could have been more than one reprogramming! */
1376                 return (apic_xlate_vector(newvector));
1377         }
1378         lock_clear(&apic_revector_lock);
1379         return (vector);
1380 }
1381 
1382 void
1383 apic_xlate_vector_free_timeout_handler(void *arg)
1384 {
1385         ulong_t iflag;
1386         uchar_t oldvector, newvector;
1387 
1388         oldvector = (uchar_t)(uintptr_t)arg;
1389         iflag = intr_clear();
1390         lock_set(&apic_revector_lock);
1391         if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
1392                 apic_free_vector(oldvector);
1393                 apic_oldvec_to_newvec[oldvector] = 0;
1394                 apic_newvec_to_oldvec[newvector] = 0;
1395                 apic_revector_pending--;
1396         }
1397 
1398         lock_clear(&apic_revector_lock);
1399         intr_restore(iflag);
1400 }
1401 
1402 /*
1403  * Bind interrupt corresponding to irq_ptr to bind_cpu.
1404  * Must be called with interrupts disabled and apic_ioapic_lock held
1405  */
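     /*
      * Typical calling pattern (a sketch only; it mirrors what
      * apic_intr_redistribute() and apic_rebind_all() do below):
      *
      *	iflag = intr_clear();
      *	lock_set(&apic_ioapic_lock);
      *	(void) apic_rebind(irq_ptr, bind_cpu, NULL);
      *	lock_clear(&apic_ioapic_lock);
      *	intr_restore(iflag);
      */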
1406 int
1407 apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
1408     struct ioapic_reprogram_data *drep)
1409 {
1410         int                     ioapicindex, intin_no;
1411         uint32_t                airq_temp_cpu;
1412         apic_cpus_info_t        *cpu_infop;
1413         uint32_t                rdt_entry;
1414         int                     which_irq;
1415         ioapic_rdt_t            irdt;
1416 
1417         which_irq = apic_vector_to_irq[irq_ptr->airq_vector];
1418 
1419         intin_no = irq_ptr->airq_intin_no;
1420         ioapicindex = irq_ptr->airq_ioapicindex;
1421         airq_temp_cpu = irq_ptr->airq_temp_cpu;
1422         if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
1423                 if (airq_temp_cpu & IRQ_USER_BOUND)
1424                         /* Mask off high bit so it can be used as array index */
1425                         airq_temp_cpu &= ~IRQ_USER_BOUND;
1426 
1427                 ASSERT(apic_cpu_in_range(airq_temp_cpu));
1428         }
1429 
1430         /*
1431          * Can't bind to a CPU that's not accepting interrupts:
1432          */
1433         cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
1434         if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
1435                 return (1);
1436 
1437         /*
1438          * If we are about to change the interrupt vector for this interrupt,
1439          * and this interrupt is level-triggered, attached to an IOAPIC,
1440          * has been delivered to a CPU and that CPU has not handled it
1441          * yet, we cannot reprogram the IOAPIC now.
1442          */
1443         if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1444 
1445                 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex,
1446                     intin_no);
1447 
1448                 if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) &&
1449                     apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu,
1450                     bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) {
1451 
1452                         return (0);
1453                 }
1454 
1455                 /*
1456                  * NOTE: We do not unmask the RDT here, as an interrupt MAY
1457                  * still come in before we have a chance to reprogram it below.
1458                  * The reprogramming below will simultaneously change and
1459                  * unmask the RDT entry.
1460                  */
1461 
1462                 if ((uint32_t)bind_cpu == IRQ_UNBOUND) {
1463                         irdt.ir_lo = AV_LDEST | AV_LOPRI |
1464                             irq_ptr->airq_rdt_entry;
1465 
1466                         WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1467                             AV_TOALL);
1468 
1469                         if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu !=
1470                             IRQ_UNBOUND)
1471                                 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1472 
1473                         /*
1474                          * Write the vector, trigger, and polarity portion of
1475                          * the RDT
1476                          */
1477                         WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1478                             irdt.ir_lo);
1479 
1480                         irq_ptr->airq_temp_cpu = IRQ_UNBOUND;
1481                         return (0);
1482                 }
1483         }
1484 
1485         if (bind_cpu & IRQ_USER_BOUND) {
1486                 cpu_infop->aci_bound++;
1487         } else {
1488                 cpu_infop->aci_temp_bound++;
1489         }
1490         ASSERT(apic_cpu_in_range(bind_cpu & ~IRQ_USER_BOUND));
1491 
1492         if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) {
1493                 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1494         }
1495         if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1496 
1497                 irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry;
1498                 irdt.ir_hi = cpu_infop->aci_local_id;
1499 
1500                 /* Write the RDT entry -- bind to a specific CPU: */
1501                 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1502                     irdt.ir_hi << APIC_ID_BIT_OFFSET);
1503 
1504                 /* Write the vector, trigger, and polarity portion of the RDT */
1505                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1506                     irdt.ir_lo);
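                     /*
                      * Illustrative encoding (IDs and vector hypothetical):
                      * binding vector 0x71 to the CPU whose local APIC ID is
                      * 3 puts 3 << APIC_ID_BIT_OFFSET in the high dword
                      * (physical destination) and AV_PDEST | AV_FIXED |
                      * trigger/polarity bits | 0x71 in the low dword.  The
                      * destination is written first; the low-dword write then
                      * sets the vector and leaves the entry unmasked.
                      */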
1507 
1508         } else {
1509                 int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ?
1510                     DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX;
1511                 if (type == DDI_INTR_TYPE_MSI) {
1512                         if (irq_ptr->airq_ioapicindex ==
1513                             irq_ptr->airq_origirq) {
1514                                 /* first one */
1515                                 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1516                                     "apic_pci_msi_enable_vector\n"));
1517                                 apic_pci_msi_enable_vector(irq_ptr,
1518                                     type, which_irq, irq_ptr->airq_vector,
1519                                     irq_ptr->airq_intin_no,
1520                                     cpu_infop->aci_local_id);
1521                         }
1522                         if ((irq_ptr->airq_ioapicindex +
1523                             irq_ptr->airq_intin_no - 1) ==
1524                             irq_ptr->airq_origirq) { /* last one */
1525                                 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1526                                     "apic_pci_msi_enable_mode\n"));
1527                                 apic_pci_msi_enable_mode(irq_ptr->airq_dip,
1528                                     type, which_irq);
1529                         }
1530                 } else { /* MSI-X */
1531                         apic_pci_msi_enable_vector(irq_ptr, type,
1532                             irq_ptr->airq_origirq, irq_ptr->airq_vector, 1,
1533                             cpu_infop->aci_local_id);
1534                         apic_pci_msi_enable_mode(irq_ptr->airq_dip, type,
1535                             irq_ptr->airq_origirq);
1536                 }
1537         }
1538         irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu;
1539         apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND));
1540         return (0);
1541 }
1542 
1543 static void
1544 apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no)
1545 {
1546         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no)
1547             & AV_REMOTE_IRR) != 0) {
1548                 /*
1549                  * Trying to clear the bit through normal
1550                  * channels has failed.  So as a last-ditch
1551                  * effort, try to set the trigger mode to
1552                  * edge, then to level.  This has been
1553                  * observed to work on many systems.
1554                  */
1555                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1556                     intin_no,
1557                     READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1558                     intin_no) & ~AV_LEVEL);
1559 
1560                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1561                     intin_no,
1562                     READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1563                     intin_no) | AV_LEVEL);
1564 
1565                 /*
1566                  * If the bit's STILL set, this interrupt may
1567                  * be hosed.
1568                  */
1569                 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1570                     intin_no) & AV_REMOTE_IRR) != 0) {
1571 
1572                         prom_printf("%s: Remote IRR still "
1573                             "not clear for IOAPIC %d intin %d.\n"
1574                             "\tInterrupts to this pin may cease "
1575                             "functioning.\n", psm_name, ioapic_ix,
1576                             intin_no);
1577 #ifdef DEBUG
1578                         apic_last_ditch_reprogram_failures++;
1579 #endif
1580                 }
1581         }
1582 }
1583 
1584 /*
1585  * This function is protected by apic_ioapic_lock coupled with the
1586  * fact that interrupts are disabled.
1587  */
1588 static void
1589 delete_defer_repro_ent(int which_irq)
1590 {
1591         ASSERT(which_irq >= 0);
1592         ASSERT(which_irq <= 255);
1593         ASSERT(LOCK_HELD(&apic_ioapic_lock));
1594 
1595         if (apic_reprogram_info[which_irq].done)
1596                 return;
1597 
1598         apic_reprogram_info[which_irq].done = B_TRUE;
1599 
1600 #ifdef DEBUG
1601         apic_defer_repro_total_retries +=
1602             apic_reprogram_info[which_irq].tries;
1603 
1604         apic_defer_repro_successes++;
1605 #endif
1606 
1607         if (--apic_reprogram_outstanding == 0) {
1608 
1609                 setlvlx = psm_intr_exit_fn();
1610         }
1611 }
1612 
1613 
1614 /*
1615  * Interrupts must be disabled during this function to prevent
1616  * self-deadlock.  Interrupts are disabled because this function
1617  * is called from apic_check_stuck_interrupt(), which is called
1618  * from apic_rebind(), which requires its caller to disable interrupts.
1619  */
1620 static void
1621 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu)
1622 {
1623         ASSERT(which_irq >= 0);
1624         ASSERT(which_irq <= 255);
1625         ASSERT(!interrupts_enabled());
1626 
1627         /*
1628          * On the off-chance that there's already a deferred
1629          * reprogramming pending on this irq, just update the target
1630          * CPU and the irq pointer, then return.
1631          */
1632         if (!apic_reprogram_info[which_irq].done) {
1633                 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1634                 apic_reprogram_info[which_irq].irqp = irq_ptr;
1635                 return;
1636         }
1637 
1638         apic_reprogram_info[which_irq].irqp = irq_ptr;
1639         apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1640         apic_reprogram_info[which_irq].tries = 0;
1641         /*
1642          * This must be the last thing set, since we're not
1643          * grabbing any locks, apic_try_deferred_reprogram() will
1644          * make its decision about using this entry iff done
1645          * is false.
1646          */
1647         apic_reprogram_info[which_irq].done = B_FALSE;
1648 
1649         /*
1650          * If there were previously no deferred reprogrammings, change
1651          * setlvlx to call apic_try_deferred_reprogram()
1652          */
1653         if (++apic_reprogram_outstanding == 1) {
1654 
1655                 setlvlx = apic_try_deferred_reprogram;
1656         }
1657 }
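     /*
      * A sketch of the deferral handshake in terms of the names above: while
      * any apic_reprogram_info[] entry has done == B_FALSE, the interrupt-
      * exit hook setlvlx points at apic_try_deferred_reprogram(), so every
      * interrupt exit retries the stalled reprogramming.  Once
      * apic_reprogram_outstanding drops back to zero,
      * delete_defer_repro_ent() restores the stock psm_intr_exit_fn() hook.
      */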
1658 
1659 static void
1660 apic_try_deferred_reprogram(int prev_ipl, int irq)
1661 {
1662         int reproirq;
1663         ulong_t iflag;
1664         struct ioapic_reprogram_data *drep;
1665 
1666         (*psm_intr_exit_fn())(prev_ipl, irq);
1667 
1668         if (!lock_try(&apic_defer_reprogram_lock)) {
1669                 return;
1670         }
1671 
1672         /*
1673          * Acquire the apic_ioapic_lock so that any other operations that
1674          * may affect the apic_reprogram_info state are serialized.
1675          * It's still possible for the last deferred reprogramming to clear
1676          * between the time we entered this function and the time we get to
1677          * the for loop below.  In that case, *setlvlx will have been set
1678          * back to *_intr_exit and drep will be NULL. (There's no way to
1679          * stop that from happening -- we would need to grab a lock before
1680          * calling *setlvlx, which is neither realistic nor prudent).
1681          */
1682         iflag = intr_clear();
1683         lock_set(&apic_ioapic_lock);
1684 
1685         /*
1686          * For each deferred RDT entry, try to reprogram it now.  Note that
1687          * there is no lock acquisition to read apic_reprogram_info because
1688          * '.done' is set only after the other fields in the structure are set.
1689          */
1690 
1691         drep = NULL;
1692         for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) {
1693                 if (apic_reprogram_info[reproirq].done == B_FALSE) {
1694                         drep = &apic_reprogram_info[reproirq];
1695                         break;
1696                 }
1697         }
1698 
1699         /*
1700          * Either we found a deferred action to perform, or
1701          * we entered this function spuriously, after *setlvlx
1702          * was restored to point to *_intr_exit.  Any other
1703          * permutation is invalid.
1704          */
1705         ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn());
1706 
1707         /*
1708          * Though we can't really do anything about errors
1709          * at this point, keep track of them for reporting.
1710          * Note that it is very possible for apic_setup_io_intr
1711          * to re-register this very deferral if the Remote IRR bit
1712          * has not yet cleared.
1713          */
1714 
1715 #ifdef DEBUG
1716         if (drep != NULL) {
1717                 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) {
1718                         apic_deferred_setup_failures++;
1719                 }
1720         } else {
1721                 apic_deferred_spurious_enters++;
1722         }
1723 #else
1724         if (drep != NULL)
1725                 (void) apic_setup_io_intr(drep, reproirq, B_TRUE);
1726 #endif
1727 
1728         lock_clear(&apic_ioapic_lock);
1729         intr_restore(iflag);
1730 
1731         lock_clear(&apic_defer_reprogram_lock);
1732 }
1733 
1734 static void
1735 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no)
1736 {
1737         int waited;
1738 
1739         /*
1740          * Wait for the delivery pending bit to clear.
1741          */
1742         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1743             (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) {
1744 
1745                 /*
1746                  * If we're still waiting on the delivery of this interrupt,
1747                  * continue to wait here until it is delivered (this should be
1748                  * a very small amount of time, but include a timeout just in
1749                  * case).
1750                  */
1751                 for (waited = 0; waited < apic_max_reps_clear_pending;
1752                     waited++) {
1753                         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1754                             intin_no) & AV_PENDING) == 0) {
1755                                 break;
1756                         }
1757                 }
1758         }
1759 }
1760 
1761 
1762 /*
1763  * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
1764  * bit set.  Calls functions that modify the function that setlvlx points to,
1765  * so that the reprogramming can be retried very shortly.
1766  *
1767  * This function will mask the RDT entry if the interrupt is level-triggered.
1768  * (The caller is responsible for unmasking the RDT entry.)
1769  *
1770  * Returns non-zero if the caller should defer IOAPIC reprogramming.
1771  */
1772 static int
1773 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
1774     int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq,
1775     struct ioapic_reprogram_data *drep)
1776 {
1777         int32_t                 rdt_entry;
1778         int                     waited;
1779         int                     reps = 0;
1780 
1781         /*
1782          * Wait for the delivery pending bit to clear.
1783          */
1784         do {
1785                 ++reps;
1786 
1787                 apic_ioapic_wait_pending_clear(ioapic_ix, intin_no);
1788 
1789                 /*
1790                  * Mask the RDT entry, but only if it's a level-triggered
1791                  * interrupt
1792                  */
1793                 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1794                     intin_no);
1795                 if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) {
1796 
1797                         /* Mask it */
1798                         WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
1799                             AV_MASK | rdt_entry);
1800                 }
1801 
1802                 if ((rdt_entry & AV_LEVEL) == AV_LEVEL) {
1803                         /*
1804                          * If there was a race and an interrupt was injected
1805                          * just before we masked, check for that case here.
1806                          * Then, unmask the RDT entry and try again.  If we're
1807                          * on our last try, don't unmask (because we want the
1808                          * RDT entry to remain masked for the rest of the
1809                          * function).
1810                          */
1811                         rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1812                             intin_no);
1813                         if ((rdt_entry & AV_PENDING) &&
1814                             (reps < apic_max_reps_clear_pending)) {
1815                                 /* Unmask it */
1816                                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1817                                     intin_no, rdt_entry & ~AV_MASK);
1818                         }
1819                 }
1820 
1821         } while ((rdt_entry & AV_PENDING) &&
1822             (reps < apic_max_reps_clear_pending));
1823 
1824 #ifdef DEBUG
1825         if (rdt_entry & AV_PENDING)
1826                 apic_intr_deliver_timeouts++;
1827 #endif
1828 
1829         /*
1830          * If the remote IRR bit is set, then the interrupt has been sent
1831          * to a CPU for processing.  We have no choice but to wait for
1832          * that CPU to process the interrupt, at which point the remote IRR
1833          * bit will be cleared.
1834          */
1835         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1836             (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {
1837 
1838                 /*
1839                  * If the CPU that this RDT is bound to is NOT the current
1840                  * CPU, wait until that CPU handles the interrupt and ACKs
1841                  * it.  If this interrupt is not bound to any CPU (that is,
1842                  * if it's bound to the logical destination of "anyone"), it
1843                  * may have been delivered to the current CPU so handle that
1844                  * case by deferring the reprogramming (below).
1845                  */
1846                 if ((old_bind_cpu != IRQ_UNBOUND) &&
1847                     (old_bind_cpu != IRQ_UNINIT) &&
1848                     (old_bind_cpu != psm_get_cpu_id())) {
1849                         for (waited = 0; waited < apic_max_reps_clear_pending;
1850                             waited++) {
1851                                 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1852                                     intin_no) & AV_REMOTE_IRR) == 0) {
1853 
1854                                         delete_defer_repro_ent(which_irq);
1855 
1856                                         /* Remote IRR has cleared! */
1857                                         return (0);
1858                                 }
1859                         }
1860                 }
1861 
1862                 /*
1863                  * If we waited and the Remote IRR bit is still not cleared,
1864                  * AND if we've retried this APIC_REPROGRAM_MAX_TRIES times
1865                  * for this interrupt, try the last-ditch workaround:
1866                  */
1867                 if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {
1868 
1869                         apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);
1870 
1871                         /* Mark this one as reprogrammed: */
1872                         delete_defer_repro_ent(which_irq);
1873 
1874                         return (0);
1875                 } else {
1876 #ifdef DEBUG
1877                         apic_intr_deferrals++;
1878 #endif
1879 
1880                         /*
1881                          * If waiting for the Remote IRR bit (above) didn't
1882                          * allow it to clear, defer the reprogramming.
1883                          * Add a new deferred-programming entry if the
1884                          * caller passed a NULL one (and update the existing one
1885                          * in case anything changed).
1886                          */
1887                         add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
1888                         if (drep)
1889                                 drep->tries++;
1890 
1891                         /* Inform caller to defer IOAPIC programming: */
1892                         return (1);
1893                 }
1894 
1895         }
1896 
1897         /* Remote IRR is clear */
1898         delete_defer_repro_ent(which_irq);
1899 
1900         return (0);
1901 }
1902 
1903 /*
1904  * Called to migrate all interrupts at an irq to another cpu.
1905  * Must be called with interrupts disabled and apic_ioapic_lock held
1906  */
1907 int
1908 apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
1909 {
1910         apic_irq_t      *irqptr = irq_ptr;
1911         int             retval = 0;
1912 
1913         while (irqptr) {
1914                 if (irqptr->airq_temp_cpu != IRQ_UNINIT)
1915                         retval |= apic_rebind(irqptr, bind_cpu, NULL);
1916                 irqptr = irqptr->airq_next;
1917         }
1918 
1919         return (retval);
1920 }
1921 
1922 /*
1923  * apic_intr_redistribute does all the messy computations for identifying
1924  * which interrupt to move to which CPU. Currently we move just one
1925  * interrupt at a time; this bounds the time spent here within the clock
1926  * interrupt. If this were done from idle, we could move more than one.
1927  * First we find the busiest and the most free CPU (counting time spent
1928  * in ISRs only), skipping CPUs already identified as ineligible
1929  * (apic_redist_cpu_skip). Then we look for the IRQ whose load is closest
1930  * to the difference between the busiest CPU and the average ISR load.
1931  * We prefer one whose load is less than that difference; if none exists,
1932  * we choose one larger than the difference, provided it does not make
1933  * the most idle CPU worse off than the busiest one. Finally, we clear
1934  * the busy fields: for CPUs at the end, for IRQs as they are scanned.
1935  */
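     /*
      * A worked example (all numbers hypothetical): suppose the busiest CPU
      * has load 60, the average is 40 and the most-free CPU has load 10, so
      * diff = 60 - 40 = 20.  An IRQ with airq_busy 15 (< diff) is preferred.
      * Failing that, an IRQ with airq_busy 25 still qualifies because
      * 25 < diff + average_busy - min_free = 20 + 40 - 10 = 50; equivalently,
      * 10 + 25 = 35 is still below the old maximum of 60, so the most-free
      * CPU does not end up worse off than the busiest one was.
      */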
1936 void
1937 apic_intr_redistribute(void)
1938 {
1939         int busiest_cpu, most_free_cpu;
1940         int cpu_free, cpu_busy, max_busy, min_busy;
1941         int min_free, diff;
1942         int average_busy, cpus_online;
1943         int i, busy;
1944         ulong_t iflag;
1945         apic_cpus_info_t *cpu_infop;
1946         apic_irq_t *min_busy_irq = NULL;
1947         apic_irq_t *max_busy_irq = NULL;
1948 
1949         busiest_cpu = most_free_cpu = -1;
1950         cpu_free = cpu_busy = max_busy = average_busy = 0;
1951         min_free = apic_sample_factor_redistribution;
1952         cpus_online = 0;
1953         /*
1954          * Below we check CPU_INTR_ENABLE, bound, temp_bound, and temp_cpu
1955          * without holding ioapic_lock. That is OK: we are just doing
1956          * statistical sampling, and any inaccuracy now will get corrected
1957          * the next time around. The call to rebind, which actually changes
1958          * things, will make sure we are consistent.
1959          */
1960         for (i = 0; i < apic_nproc; i++) {
1961                 if (apic_cpu_in_range(i) &&
1962                     !(apic_redist_cpu_skip & (1 << i)) &&
1963                     (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {
1964 
1965                         cpu_infop = &apic_cpus[i];
1966                         /*
1967                          * If no unbound interrupts or only 1 total on this
1968                          * CPU, skip
1969                          */
1970                         if (!cpu_infop->aci_temp_bound ||
1971                             (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
1972                             == 1) {
1973                                 apic_redist_cpu_skip |= 1 << i;
1974                                 continue;
1975                         }
1976 
1977                         busy = cpu_infop->aci_busy;
1978                         average_busy += busy;
1979                         cpus_online++;
1980                         if (max_busy < busy) {
1981                                 max_busy = busy;
1982                                 busiest_cpu = i;
1983                         }
1984                         if (min_free > busy) {
1985                                 min_free = busy;
1986                                 most_free_cpu = i;
1987                         }
1988                         if (busy > apic_int_busy_mark) {
1989                                 cpu_busy |= 1 << i;
1990                         } else {
1991                                 if (busy < apic_int_free_mark)
1992                                         cpu_free |= 1 << i;
1993                         }
1994                 }
1995         }
1996         if ((cpu_busy && cpu_free) ||
1997             (max_busy >= (min_free + apic_diff_for_redistribution))) {
1998 
1999                 apic_num_imbalance++;
2000 #ifdef  DEBUG
2001                 if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2002                         prom_printf(
2003                             "redistribute busy=%x free=%x max=%x min=%x",
2004                             cpu_busy, cpu_free, max_busy, min_free);
2005                 }
2006 #endif /* DEBUG */
2007 
2008 
2009                 average_busy /= cpus_online;
2010 
2011                 diff = max_busy - average_busy;
2012                 min_busy = max_busy; /* start with the max possible value */
2013                 max_busy = 0;
2014                 min_busy_irq = max_busy_irq = NULL;
2015                 i = apic_min_device_irq;
2016                 for (; i <= apic_max_device_irq; i++) {
2017                         apic_irq_t *irq_ptr;
2018                         /* Change to a linked list per CPU? */
2019                         if ((irq_ptr = apic_irq_table[i]) == NULL)
2020                                 continue;
2021                         /* Check for irq_busy & decide which one to move */
2022                         /* Also zero them for next round */
2023                         if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
2024                             irq_ptr->airq_busy) {
2025                                 if (irq_ptr->airq_busy < diff) {
2026                                         /*
2027                                          * Track the busiest IRQ that
2028                                          * still fits under the diff.
2029                                          */
2030                                         if (max_busy < irq_ptr->airq_busy) {
2031                                                 /*
2032                                                  * Most busy within the
2033                                                  * required differential
2034                                                  */
2035                                                 max_busy = irq_ptr->airq_busy;
2036                                                 max_busy_irq = irq_ptr;
2037                                         }
2038                                 } else {
2039                                         if (min_busy > irq_ptr->airq_busy) {
2040                                                 /*
2041                                                  * Least busy, but more
2042                                                  * than the required diff.
2043                                                  */
2044                                                 if (min_busy <
2045                                                     (diff + average_busy -
2046                                                     min_free)) {
2047                                                         /*
2048                                                          * Making sure new cpu
2049                                                          * will not end up
2050                                                          * worse
2051                                                          */
2052                                                         min_busy =
2053                                                             irq_ptr->airq_busy;
2054 
2055                                                         min_busy_irq = irq_ptr;
2056                                                 }
2057                                         }
2058                                 }
2059                         }
2060                         irq_ptr->airq_busy = 0;
2061                 }
2062 
2063                 if (max_busy_irq != NULL) {
2064 #ifdef  DEBUG
2065                         if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2066                                 prom_printf("rebinding %x to %x",
2067                                     max_busy_irq->airq_vector, most_free_cpu);
2068                         }
2069 #endif /* DEBUG */
2070                         iflag = intr_clear();
2071                         if (lock_try(&apic_ioapic_lock)) {
2072                                 if (apic_rebind_all(max_busy_irq,
2073                                     most_free_cpu) == 0) {
2074                                         /* Make the change permanent */
2075                                         max_busy_irq->airq_cpu =
2076                                             (uint32_t)most_free_cpu;
2077                                 }
2078                                 lock_clear(&apic_ioapic_lock);
2079                         }
2080                         intr_restore(iflag);
2081 
2082                 } else if (min_busy_irq != NULL) {
2083 #ifdef  DEBUG
2084                         if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2085                                 prom_printf("rebinding %x to %x",
2086                                     min_busy_irq->airq_vector, most_free_cpu);
2087                         }
2088 #endif /* DEBUG */
2089 
2090                         iflag = intr_clear();
2091                         if (lock_try(&apic_ioapic_lock)) {
2092                                 if (apic_rebind_all(min_busy_irq,
2093                                     most_free_cpu) == 0) {
2094                                         /* Make the change permanent */
2095                                         min_busy_irq->airq_cpu =
2096                                             (uint32_t)most_free_cpu;
2097                                 }
2098                                 lock_clear(&apic_ioapic_lock);
2099                         }
2100                         intr_restore(iflag);
2101 
2102                 } else {
2103                         if (cpu_busy != (1 << busiest_cpu)) {
2104                                 apic_redist_cpu_skip |= 1 << busiest_cpu;
2105                                 /*
2106                                  * We leave cpu_skip set so that next time we
2107                                  * can choose another cpu
2108                                  */
2109                         }
2110                 }
2111                 apic_num_rebind++;
2112         } else {
2113                 /*
2114                  * Found nothing: either we skipped over valid CPUs
2115                  * or everything is already balanced. If we had a variable
2116                  * ticks_for_redistribution, it could be increased here.
2117                  * apic_int_busy, int_free, etc. would also need to be
2118                  * changed.
2119                  */
2120                 if (apic_redist_cpu_skip)
2121                         apic_redist_cpu_skip = 0;
2122         }
2123         for (i = 0; i < apic_nproc; i++) {
2124                 if (apic_cpu_in_range(i)) {
2125                         apic_cpus[i].aci_busy = 0;
2126                 }
2127         }
2128 }
2129 
2130 void
2131 apic_cleanup_busy(void)
2132 {
2133         int i;
2134         apic_irq_t *irq_ptr;
2135 
2136         for (i = 0; i < apic_nproc; i++) {
2137                 if (apic_cpu_in_range(i)) {
2138                         apic_cpus[i].aci_busy = 0;
2139                 }
2140         }
2141 
2142         for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
2143                 if ((irq_ptr = apic_irq_table[i]) != NULL)
2144                         irq_ptr->airq_busy = 0;
2145         }
2146 }
2147 
2148 int
2149 apic_ioapic_method_probe(void)
2150 {
2151         return (PSM_SUCCESS);
2152 }