1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* 25 * Copyright (c) 2010, Intel Corporation. 26 * All rights reserved. 27 */ 28 29 /* 30 * PSMI 1.1 extensions are supported only in 2.6 and later versions. 31 * PSMI 1.2 extensions are supported only in 2.7 and later versions. 32 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. 33 * PSMI 1.5 extensions are supported in Solaris Nevada. 34 * PSMI 1.6 extensions are supported in Solaris Nevada. 35 * PSMI 1.7 extensions are supported in Solaris Nevada. 
 */
#define	PSMI_1_7

#include <sys/processor.h>
#include <sys/time.h>
#include <sys/psm.h>
#include <sys/smp_impldefs.h>
#include <sys/cram.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/psm_common.h>
#include <sys/apic.h>
#include <sys/apic_common.h>
#include <sys/pit.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/pci.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/cpc_impl.h>
#include <sys/uadmin.h>
#include <sys/panic.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/trap.h>
#include <sys/machsystm.h>
#include <sys/cpuvar.h>
#include <sys/rm_platter.h>
#include <sys/privregs.h>
#include <sys/cyclic.h>
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/sunndi.h>


/*
 * Local Function Prototypes
 */
static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
static void apic_xlate_vector_free_timeout_handler(void *arg);
static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int apicindex, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep);
static int apic_setup_irq_table(dev_info_t *dip, int irqno,
    struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
    int type);
static void apic_try_deferred_reprogram(int ipl, int vect);
static void delete_defer_repro_ent(int which_irq);
static void apic_ioapic_wait_pending_clear(int ioapicindex,
    int intin_no);

extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
    int ipin, int *pci_irqp, iflag_t *intr_flagp);
extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
    int child_ipin, struct apic_io_intr **intrp);
extern uchar_t acpi_find_ioapic(int irq);
extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
extern int apic_find_bus_id(int bustype);
extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);

extern int apic_sci_vect;
extern iflag_t apic_sci_flags;
extern int apic_intr_policy;
extern char *psm_name;

/*
 * Largest value representable in an unsigned char, computed from NBBY
 * (number of bits per byte, from <sys/param.h>).
 *
 * NOTE(review): this shadows the standard UCHAR_MAX from <limits.h>;
 * the value is the same (255 for NBBY == 8), but confirm no conflicting
 * definition is pulled in by the headers above.
 */
#define	UCHAR_MAX	((1 << NBBY) - 1)

/* Max wait time (in repetitions) for flags to clear in an RDT entry. */
extern int apic_max_reps_clear_pending;

/* The irq # is implicit in the array index: */
struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];
/*
 * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. ioapic_reprogram_info
 * is indexed by IRQ number, NOT by vector number.
 */

/* Interrupt-load balancing tunables and counters (defined elsewhere). */
extern int apic_int_busy_mark;
extern int apic_int_free_mark;
extern int apic_diff_for_redistribution;
extern int apic_sample_factor_redistribution;
extern int apic_redist_cpu_skip;
extern int apic_num_imbalance;
extern int apic_num_rebind;

/* timeout for xlate_vector, mark_vector */
int apic_revector_timeout = 16 * 10000;	/* 160 millisec */

extern int apic_defconf;
extern int apic_irq_translate;

extern int apic_use_acpi_madt_only;	/* 1=ONLY use MADT from ACPI */

/* Lowest interrupt input line (vector base) of each I/O APIC. */
extern uchar_t apic_io_vectbase[MAX_IO_APIC];

/* Per-I/O-APIC flag: apply the mask/unmask erratum workaround. */
extern boolean_t ioapic_mask_workaround[MAX_IO_APIC];

/*
 * First available slot to be used as IRQ index into the apic_irq_table
 * for those interrupts (like MSI/X) that don't have a physical IRQ.
 */
extern int apic_first_avail_irq;

/*
 * apic_defer_reprogram_lock ensures that only one processor is handling
 * deferred interrupt programming at *_intr_exit time.
147 */ 148 static lock_t apic_defer_reprogram_lock; 149 150 /* 151 * The current number of deferred reprogrammings outstanding 152 */ 153 uint_t apic_reprogram_outstanding = 0; 154 155 #ifdef DEBUG 156 /* 157 * Counters that keep track of deferred reprogramming stats 158 */ 159 uint_t apic_intr_deferrals = 0; 160 uint_t apic_intr_deliver_timeouts = 0; 161 uint_t apic_last_ditch_reprogram_failures = 0; 162 uint_t apic_deferred_setup_failures = 0; 163 uint_t apic_defer_repro_total_retries = 0; 164 uint_t apic_defer_repro_successes = 0; 165 uint_t apic_deferred_spurious_enters = 0; 166 #endif 167 168 extern int apic_io_max; 169 extern struct apic_io_intr *apic_io_intrp; 170 171 uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1]; 172 173 extern uint32_t eisa_level_intr_mask; 174 /* At least MSB will be set if EISA bus */ 175 176 extern int apic_pci_bus_total; 177 extern uchar_t apic_single_pci_busid; 178 179 /* 180 * Following declarations are for revectoring; used when ISRs at different 181 * IPLs share an irq. 182 */ 183 static lock_t apic_revector_lock; 184 int apic_revector_pending = 0; 185 static uchar_t *apic_oldvec_to_newvec; 186 static uchar_t *apic_newvec_to_oldvec; 187 188 /* ACPI Interrupt Source Override Structure ptr */ 189 ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop; 190 extern int acpi_iso_cnt; 191 192 /* 193 * Auto-configuration routines 194 */ 195 196 /* 197 * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable 198 * are also set to NULL. vector->irq is set to a value which cannot map 199 * to a real irq to show that it is free. 200 */ 201 void 202 apic_init_common(void) 203 { 204 int i, j, indx; 205 int *iptr; 206 207 /* 208 * Initialize apic_ipls from apic_vectortoipl. This array is 209 * used in apic_intr_enter to determine the IPL to use for the 210 * corresponding vector. 
On some systems, due to hardware errata 211 * and interrupt sharing, the IPL may not correspond to the IPL listed 212 * in apic_vectortoipl (see apic_addspl and apic_delspl). 213 */ 214 for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) { 215 indx = i * APIC_VECTOR_PER_IPL; 216 217 for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++) 218 apic_ipls[indx] = apic_vectortoipl[i]; 219 } 220 221 /* cpu 0 is always up (for now) */ 222 apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; 223 224 iptr = (int *)&apic_irq_table[0]; 225 for (i = 0; i <= APIC_MAX_VECTOR; i++) { 226 apic_level_intr[i] = 0; 227 *iptr++ = NULL; 228 apic_vector_to_irq[i] = APIC_RESV_IRQ; 229 230 /* These *must* be initted to B_TRUE! */ 231 apic_reprogram_info[i].done = B_TRUE; 232 apic_reprogram_info[i].irqp = NULL; 233 apic_reprogram_info[i].tries = 0; 234 apic_reprogram_info[i].bindcpu = 0; 235 } 236 237 /* 238 * Allocate a dummy irq table entry for the reserved entry. 239 * This takes care of the race between removing an irq and 240 * clock detecting a CPU in that irq during interrupt load 241 * sampling. 
242 */ 243 apic_irq_table[APIC_RESV_IRQ] = 244 kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 245 246 mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL); 247 } 248 249 void 250 ioapic_init_intr(int mask_apic) 251 { 252 int ioapic_ix; 253 struct intrspec ispec; 254 apic_irq_t *irqptr; 255 int i, j; 256 ulong_t iflag; 257 258 LOCK_INIT_CLEAR(&apic_revector_lock); 259 LOCK_INIT_CLEAR(&apic_defer_reprogram_lock); 260 261 /* mask interrupt vectors */ 262 for (j = 0; j < apic_io_max && mask_apic; j++) { 263 int intin_max; 264 265 ioapic_ix = j; 266 /* Bits 23-16 define the maximum redirection entries */ 267 intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16) 268 & 0xff; 269 for (i = 0; i <= intin_max; i++) 270 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK); 271 } 272 273 /* 274 * Hack alert: deal with ACPI SCI interrupt chicken/egg here 275 */ 276 if (apic_sci_vect > 0) { 277 /* 278 * acpica has already done add_avintr(); we just 279 * to finish the job by mimicing translate_irq() 280 * 281 * Fake up an intrspec and setup the tables 282 */ 283 ispec.intrspec_vec = apic_sci_vect; 284 ispec.intrspec_pri = SCI_IPL; 285 286 if (apic_setup_irq_table(NULL, apic_sci_vect, NULL, 287 &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) { 288 cmn_err(CE_WARN, "!apic: SCI setup failed"); 289 return; 290 } 291 irqptr = apic_irq_table[apic_sci_vect]; 292 293 iflag = intr_clear(); 294 lock_set(&apic_ioapic_lock); 295 296 /* Program I/O APIC */ 297 (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE); 298 299 lock_clear(&apic_ioapic_lock); 300 intr_restore(iflag); 301 302 irqptr->airq_share++; 303 } 304 } 305 306 /* 307 * Add mask bits to disable interrupt vector from happening 308 * at or above IPL. In addition, it should remove mask bits 309 * to enable interrupt vectors below the given IPL. 310 * 311 * Both add and delspl are complicated by the fact that different interrupts 312 * may share IRQs. This can happen in two ways. 313 * 1. 
 *    The same H/W line is shared by more than 1 device
 * 1a. with interrupts at different IPLs
 * 1b. with interrupts at same IPL
 * 2. We ran out of vectors at a given IPL and started sharing vectors.
 * 1b and 2 should be handled gracefully, except for the fact some ISRs
 * will get called often when no interrupt is pending for the device.
 * For 1a, we handle it at the higher IPL.
 */
/*ARGSUSED*/
int
apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
	uchar_t vector;
	ulong_t iflag;
	apic_irq_t *irqptr, *irqheadptr;
	int irqindex;

	ASSERT(max_ipl <= UCHAR_MAX);
	irqindex = IRQINDEX(irqno);

	if ((irqindex == -1) || (!apic_irq_table[irqindex]))
		return (PSM_FAILURE);

	mutex_enter(&airq_mutex);
	irqptr = irqheadptr = apic_irq_table[irqindex];

	DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
	    "vector=0x%x\n", (void *)irqptr->airq_dip,
	    irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

	/* Find the chain entry whose share id matches this virtual irq. */
	while (irqptr) {
		if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
			break;
		irqptr = irqptr->airq_next;
	}
	/*
	 * NOTE(review): if no chain entry matched, irqptr is NULL here and
	 * the increment below would dereference it.  The walk is expected
	 * to always find a match for a valid irqno — confirm callers
	 * guarantee this.
	 */
	irqptr->airq_share++;

	mutex_exit(&airq_mutex);

	/* return if it is not hardware interrupt */
	if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
		return (PSM_SUCCESS);

	/* Or if there are more interrupts at a higher IPL */
	if (ipl != max_ipl)
		return (PSM_SUCCESS);

	/*
	 * if apic_picinit() has not been called yet, just return.
	 * At the end of apic_picinit(), we will call setup_io_intr().
	 */

	if (!apic_picinit_called)
		return (PSM_SUCCESS);

	/*
	 * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate,
	 * return failure.
	 */
	if (irqptr->airq_ipl != max_ipl &&
	    !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

		vector = apic_allocate_vector(max_ipl, irqindex, 1);
		if (vector == 0) {
			/* undo the share count taken above */
			irqptr->airq_share--;
			return (PSM_FAILURE);
		}
		irqptr = irqheadptr;
		apic_mark_vector(irqptr->airq_vector, vector);
		while (irqptr) {
			irqptr->airq_vector = vector;
			irqptr->airq_ipl = (uchar_t)max_ipl;
			/*
			 * reprogram irq being added and every one else
			 * who is not in the UNINIT state
			 */
			if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
			    irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
				apic_record_rdt_entry(irqptr, irqindex);

				iflag = intr_clear();
				lock_set(&apic_ioapic_lock);

				(void) apic_setup_io_intr(irqptr, irqindex,
				    B_FALSE);

				lock_clear(&apic_ioapic_lock);
				intr_restore(iflag);
			}
			irqptr = irqptr->airq_next;
		}
		return (PSM_SUCCESS);

	} else if (irqptr->airq_ipl != max_ipl &&
	    ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
		/*
		 * We cannot upgrade the vector, but we can change
		 * the IPL that this vector induces.
		 *
		 * Note that we subtract APIC_BASE_VECT from the vector
		 * here because this array is used in apic_intr_enter
		 * (no need to add APIC_BASE_VECT in that hot code
		 * path since we can do it in the rarely-executed path
		 * here).
		 */
		apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
		    (uchar_t)max_ipl;

		irqptr = irqheadptr;
		while (irqptr) {
			irqptr->airq_ipl = (uchar_t)max_ipl;
			irqptr = irqptr->airq_next;
		}

		return (PSM_SUCCESS);
	}

	ASSERT(irqptr);

	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	(void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);

	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	return (PSM_SUCCESS);
}

/*
 * Recompute mask bits for the given interrupt vector.
 * If there is no interrupt servicing routine for this
 * vector, this function should disable interrupt vector
 * from happening at all IPLs. If there are still
 * handlers using the given vector, this function should
 * disable the given vector from happening below the lowest
 * IPL of the remaining handlers.
 */
/*ARGSUSED*/
int
apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
	uchar_t vector;
	uint32_t bind_cpu;
	int intin, irqindex;
	int ioapic_ix;
	apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp;
	ulong_t iflag;

	mutex_enter(&airq_mutex);
	irqindex = IRQINDEX(irqno);
	irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];

	DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
	    "vector=0x%x\n", (void *)irqptr->airq_dip,
	    irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

	/*
	 * Find the chain entry for this virtual irq, remembering its
	 * predecessor so the entry can be unlinked later if needed.
	 */
	while (irqptr) {
		if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
			break;
		preirqptr = irqptr;
		irqptr = irqptr->airq_next;
	}
	ASSERT(irqptr);

	irqptr->airq_share--;

	mutex_exit(&airq_mutex);

	/*
	 * If there are more interrupts at a higher IPL, we don't need
	 * to disable anything.
	 */
	if (ipl < max_ipl)
		return (PSM_SUCCESS);

	/* return if it is not hardware interrupt */
	if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
		return (PSM_SUCCESS);

	if (!apic_picinit_called) {
		/*
		 * Clear irq_struct. If two devices shared an intpt
		 * line & 1 unloaded before picinit, we are hosed. But, then
		 * we hope the machine survives.
		 */
		irqptr->airq_mps_intr_index = FREE_INDEX;
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		apic_free_vector(irqptr->airq_vector);
		return (PSM_SUCCESS);
	}
	/*
	 * Downgrade vector to new max_ipl if needed. If we cannot allocate,
	 * use old IPL. Not very elegant, but it should work.
	 */
	if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
	    !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
		apic_irq_t *irqp;
		/* Note: intentional assignment-in-condition. */
		if (vector = apic_allocate_vector(max_ipl, irqno, 1)) {
			apic_mark_vector(irqheadptr->airq_vector, vector);
			irqp = irqheadptr;
			while (irqp) {
				irqp->airq_vector = vector;
				irqp->airq_ipl = (uchar_t)max_ipl;
				if (irqp->airq_temp_cpu != IRQ_UNINIT) {
					apic_record_rdt_entry(irqp, irqindex);

					iflag = intr_clear();
					lock_set(&apic_ioapic_lock);

					(void) apic_setup_io_intr(irqp,
					    irqindex, B_FALSE);

					lock_clear(&apic_ioapic_lock);
					intr_restore(iflag);
				}
				irqp = irqp->airq_next;
			}
		}

	} else if (irqptr->airq_ipl != max_ipl &&
	    max_ipl != PSM_INVALID_IPL &&
	    ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

		/*
		 * We cannot downgrade the IPL of the vector below the vector's
		 * hardware priority. If we did, it would be possible for a
		 * higher-priority hardware vector to interrupt a CPU running
		 * at an IPL lower than the hardware priority of the
		 * interrupting vector (but higher than the soft IPL of this
		 * IRQ). When this happens, we would then try to drop the IPL
		 * BELOW what it was (effectively dropping below base_spl)
		 * which would be potentially catastrophic.
		 *
		 * (e.g. Suppose the hardware vector associated with this IRQ
		 * is 0x40 (hardware IPL of 4). Further assume that the old
		 * IPL of this IRQ was 4, but the new IPL is 1. If we forced
		 * vector 0x40 to result in an IPL of 1, it would be possible
		 * for the processor to be executing at IPL 3 and for an
		 * interrupt to come in on vector 0x40, interrupting the
		 * currently-executing ISR. When apic_intr_enter consults
		 * apic_irqs[], it will return 1, bringing the IPL of the CPU
		 * down to 1 so even though the processor was running at IPL 4,
		 * an IPL 1 interrupt will have interrupted it, which must not
		 * happen)).
		 *
		 * Effectively, this means that the hardware priority
		 * corresponding to the IRQ's IPL (in apic_ipls[]) cannot be
		 * lower than the vector's hardware priority.
		 *
		 * (In the above example, then, after removal of the IPL 4
		 * device's interrupt handler, the new IPL will continue to be
		 * 4 because the hardware priority that IPL 1 implies is lower
		 * than the hardware priority of the vector used.)
		 */
		/* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
		const int apic_ipls_index = irqptr->airq_vector -
		    APIC_BASE_VECT;
		const int vect_inherent_hwpri = irqptr->airq_vector >>
		    APIC_IPL_SHIFT;

		/*
		 * If there are still devices using this IRQ, determine the
		 * new ipl to use.
		 */
		if (irqptr->airq_share) {
			int vect_desired_hwpri, hwpri;

			ASSERT(max_ipl < MAXIPL);
			vect_desired_hwpri = apic_ipltopri[max_ipl] >>
			    APIC_IPL_SHIFT;

			/*
			 * If the desired IPL's hardware priority is lower
			 * than that of the vector, use the hardware priority
			 * of the vector to determine the new IPL.
			 */
			hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
			    vect_inherent_hwpri : vect_desired_hwpri;

			/*
			 * Now, to get the right index for apic_vectortoipl,
			 * we need to subtract APIC_BASE_VECT from the
			 * hardware-vector-equivalent (in hwpri). Since hwpri
			 * is already shifted, we shift APIC_BASE_VECT before
			 * doing the subtraction.
			 */
			hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);

			ASSERT(hwpri >= 0);
			ASSERT(hwpri < MAXIPL);
			max_ipl = apic_vectortoipl[hwpri];
			apic_ipls[apic_ipls_index] = max_ipl;

			irqp = irqheadptr;
			while (irqp) {
				irqp->airq_ipl = (uchar_t)max_ipl;
				irqp = irqp->airq_next;
			}
		} else {
			/*
			 * No more devices on this IRQ, so reset this vector's
			 * element in apic_ipls to the original IPL for this
			 * vector
			 */
			apic_ipls[apic_ipls_index] =
			    apic_vectortoipl[vect_inherent_hwpri];
		}
	}

	/*
	 * If there are still active interrupts, we are done.
	 */
	if (irqptr->airq_share)
		return (PSM_SUCCESS);

	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	if (irqptr->airq_mps_intr_index == MSI_INDEX) {
		/*
		 * Disable the MSI vector
		 * Make sure we only disable on the last
		 * of the multi-MSI support
		 */
		if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
			apic_pci_msi_disable_mode(irqptr->airq_dip,
			    DDI_INTR_TYPE_MSI);
		}
	} else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
		/*
		 * Disable the MSI-X vector
		 * needs to clear its mask and addr/data for each MSI-X
		 */
		apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
		    irqptr->airq_origirq);
		/*
		 * Make sure we only disable on the last MSI-X
		 */
		if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
			apic_pci_msi_disable_mode(irqptr->airq_dip,
			    DDI_INTR_TYPE_MSIX);
		}
	} else {
		/*
		 * The assumption here is that this is safe, even for
		 * systems with IOAPICs that suffer from the hardware
		 * erratum because all devices have been quiesced before
		 * they unregister their interrupt handlers. If that
		 * assumption turns out to be false, this mask operation
		 * can induce the same erratum result we're trying to
		 * avoid.
		 */
		ioapic_ix = irqptr->airq_ioapicindex;
		intin = irqptr->airq_intin_no;
		ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
	}

	/*
	 * This irq entry is the only one in the chain.
	 */
	if (irqheadptr->airq_next == NULL) {
		ASSERT(irqheadptr == irqptr);
		bind_cpu = irqptr->airq_temp_cpu;
		if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
		    ((uint32_t)bind_cpu != IRQ_UNINIT)) {
			ASSERT(apic_cpu_in_range(bind_cpu));
			if (bind_cpu & IRQ_USER_BOUND) {
				/* If hardbound, temp_cpu == cpu */
				bind_cpu &= ~IRQ_USER_BOUND;
				apic_cpus[bind_cpu].aci_bound--;
			} else
				apic_cpus[bind_cpu].aci_temp_bound--;
		}
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		irqptr->airq_mps_intr_index = FREE_INDEX;
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
		apic_free_vector(irqptr->airq_vector);
		return (PSM_SUCCESS);
	}

	/*
	 * If we get here, we are sharing the vector and there are more than
	 * one active irq entries in the chain.
	 */
	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	mutex_enter(&airq_mutex);
	/* Remove the irq entry from the chain */
	if (irqptr == irqheadptr) { /* The irq entry is at the head */
		apic_irq_table[irqindex] = irqptr->airq_next;
	} else {
		preirqptr->airq_next = irqptr->airq_next;
	}
	/* Free the irq entry */
	kmem_free(irqptr, sizeof (apic_irq_t));
	mutex_exit(&airq_mutex);

	return (PSM_SUCCESS);
}

/*
 * apic_introp_xlate() replaces apic_translate_irq() and is
 * called only from apic_intr_ops(). With the new ADII framework,
 * the priority can no longer be retrieved through i_ddi_get_intrspec().
 * It has to be passed in from the caller.
 *
 * Return value:
 *	Success: irqno for the given device
 *	Failure: -1
 */
int
apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
{
	char dev_type[16];
	int dev_len, pci_irq, newirq, bustype, devid, busid, i;
	int irqno = ispec->intrspec_vec;
	ddi_acc_handle_t cfg_handle;
	uchar_t ipin;
	struct apic_io_intr *intrp;
	iflag_t intr_flag;
	ACPI_SUBTABLE_HEADER *hp;
	ACPI_MADT_INTERRUPT_OVERRIDE *isop;
	apic_irq_t *airqp;
	int parent_is_pci_or_pciex = 0;
	int child_is_pciex = 0;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
	    "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
	    irqno));

	/* Classify the parent node: PCI/PCIe buses get BDF-based lookup. */
	dev_len = sizeof (dev_type);
	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
	    DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
	    &dev_len) == DDI_PROP_SUCCESS) {
		if ((strcmp(dev_type, "pci") == 0) ||
		    (strcmp(dev_type, "pciex") == 0))
			parent_is_pci_or_pciex = 1;
	}

	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
	    &dev_len) == DDI_PROP_SUCCESS) {
		if (strstr(dev_type, "pciex"))
			child_is_pciex = 1;
	}

	if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
		if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
			airqp->airq_iflag.bustype =
			    child_is_pciex ? BUS_PCIE : BUS_PCI;
			return (apic_vector_to_irq[airqp->airq_vector]);
		}
		return (apic_setup_irq_table(dip, irqno, NULL, ispec,
		    NULL, type));
	}

	bustype = 0;

	/* check if we have already translated this irq */
	mutex_enter(&airq_mutex);
	newirq = apic_min_device_irq;
	for (; newirq <= apic_max_device_irq; newirq++) {
		airqp = apic_irq_table[newirq];
		while (airqp) {
			if ((airqp->airq_dip == dip) &&
			    (airqp->airq_origirq == irqno) &&
			    (airqp->airq_mps_intr_index != FREE_INDEX)) {

				mutex_exit(&airq_mutex);
				return (VIRTIRQ(newirq, airqp->airq_share_id));
			}
			airqp = airqp->airq_next;
		}
	}
	mutex_exit(&airq_mutex);

	if (apic_defconf)
		goto defconf;

	if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi))
		goto nonpci;

	if (parent_is_pci_or_pciex) {
		/* pci device */
		if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0)
			goto nonpci;
		if (busid == 0 && apic_pci_bus_total == 1)
			busid = (int)apic_single_pci_busid;

		/* Read the interrupt pin (INTA..INTD) from config space. */
		if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS)
			return (-1);
		ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA;
		pci_config_teardown(&cfg_handle);
		if (apic_enable_acpi && !apic_use_acpi_madt_only) {
			if (apic_acpi_translate_pci_irq(dip, busid, devid,
			    ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS)
				return (-1);

			intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI;
			return (apic_setup_irq_table(dip, pci_irq, NULL, ispec,
			    &intr_flag, type));
		} else {
			/* MPS table: encode device# and pin as the bus irq */
			pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3);
			if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid))
			    == NULL) {
				if ((pci_irq = apic_handle_pci_pci_bridge(dip,
				    devid, ipin, &intrp)) == -1)
					return (-1);
			}
			return (apic_setup_irq_table(dip, pci_irq, intrp, ispec,
			    NULL, type));
		}
	} else if (strcmp(dev_type, "isa") == 0)
		bustype = BUS_ISA;
	else if (strcmp(dev_type, "eisa") == 0)
		bustype = BUS_EISA;

nonpci:
	if (apic_enable_acpi && !apic_use_acpi_madt_only) {
		/* search iso entries first */
		if (acpi_iso_cnt != 0) {
			hp = (ACPI_SUBTABLE_HEADER *)acpi_isop;
			i = 0;
			/*
			 * Walk the MADT subtables; i counts only Interrupt
			 * Source Override entries, hence the increment
			 * inside the Type check.
			 */
			while (i < acpi_iso_cnt) {
				if (hp->Type ==
				    ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) {
					isop =
					    (ACPI_MADT_INTERRUPT_OVERRIDE *) hp;
					if (isop->Bus == 0 &&
					    isop->SourceIrq == irqno) {
						newirq = isop->GlobalIrq;
						intr_flag.intr_po =
						    isop->IntiFlags &
						    ACPI_MADT_POLARITY_MASK;
						intr_flag.intr_el =
						    (isop->IntiFlags &
						    ACPI_MADT_TRIGGER_MASK)
						    >> 2;
						intr_flag.bustype = BUS_ISA;

						return (apic_setup_irq_table(
						    dip, newirq, NULL, ispec,
						    &intr_flag, type));

					}
					i++;
				}
				hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) +
				    hp->Length);
			}
		}
		/* No override entry: assume ISA defaults (edge, active-high) */
		intr_flag.intr_po = INTR_PO_ACTIVE_HIGH;
		intr_flag.intr_el = INTR_EL_EDGE;
		intr_flag.bustype = BUS_ISA;
		return (apic_setup_irq_table(dip, irqno, NULL, ispec,
		    &intr_flag, type));
	} else {
		if (bustype == 0)	/* not initialized */
			bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA;
		/* Try the guessed bus type first, then the other one. */
		for (i = 0; i < 2; i++) {
			if (((busid = apic_find_bus_id(bustype)) != -1) &&
			    ((intrp = apic_find_io_intr_w_busid(irqno, busid))
			    != NULL)) {
				if ((newirq = apic_setup_irq_table(dip, irqno,
				    intrp, ispec, NULL, type)) != -1) {
					return (newirq);
				}
				goto defconf;
			}
			bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA;
		}
	}

	/* MPS default configuration */
defconf:
	newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type);
	if (newirq == -1)
		return (-1);
	ASSERT(IRQINDEX(newirq) == irqno);
	ASSERT(apic_irq_table[irqno]);
	return (newirq);
}

/*
 * Attempt to share vector with someone else
 */
static int
apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl,
    uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp)
{
#ifdef DEBUG
	apic_irq_t *tmpirqp = NULL;
#endif /* DEBUG */
	apic_irq_t *irqptr, dummyirq;
	int newirq, chosen_irq = -1, share = 127;
	int lowest, highest, i;
	uchar_t share_id;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x "
	    "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl));

	/* Candidate vectors are those within the requested IPL's band. */
	highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK;
	lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL;

	if (highest < lowest) /* Both ipl and ipl-1 map to same pri */
		lowest -= APIC_VECTOR_PER_IPL;
	/* Build a throwaway entry just to compute the RDT flags to match. */
	dummyirq.airq_mps_intr_index = intr_index;
	dummyirq.airq_ioapicindex = ioapicindex;
	dummyirq.airq_intin_no = ipin;
	if (intr_flagp)
		dummyirq.airq_iflag = *intr_flagp;
	apic_record_rdt_entry(&dummyirq, irqno);
	/* Pick the compatible in-band irq with the fewest sharers. */
	for (i = lowest; i <= highest; i++) {
		newirq = apic_vector_to_irq[i];
		if (newirq == APIC_RESV_IRQ)
			continue;
		irqptr = apic_irq_table[newirq];

		if ((dummyirq.airq_rdt_entry & 0xFF00) !=
		    (irqptr->airq_rdt_entry & 0xFF00))
			/* not compatible */
			continue;

		if (irqptr->airq_share < share) {
share = irqptr->airq_share; 939 chosen_irq = newirq; 940 } 941 } 942 if (chosen_irq != -1) { 943 /* 944 * Assign a share id which is free or which is larger 945 * than the largest one. 946 */ 947 share_id = 1; 948 mutex_enter(&airq_mutex); 949 irqptr = apic_irq_table[chosen_irq]; 950 while (irqptr) { 951 if (irqptr->airq_mps_intr_index == FREE_INDEX) { 952 share_id = irqptr->airq_share_id; 953 break; 954 } 955 if (share_id <= irqptr->airq_share_id) 956 share_id = irqptr->airq_share_id + 1; 957 #ifdef DEBUG 958 tmpirqp = irqptr; 959 #endif /* DEBUG */ 960 irqptr = irqptr->airq_next; 961 } 962 if (!irqptr) { 963 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 964 irqptr->airq_temp_cpu = IRQ_UNINIT; 965 irqptr->airq_next = 966 apic_irq_table[chosen_irq]->airq_next; 967 apic_irq_table[chosen_irq]->airq_next = irqptr; 968 #ifdef DEBUG 969 tmpirqp = apic_irq_table[chosen_irq]; 970 #endif /* DEBUG */ 971 } 972 irqptr->airq_mps_intr_index = intr_index; 973 irqptr->airq_ioapicindex = ioapicindex; 974 irqptr->airq_intin_no = ipin; 975 if (intr_flagp) 976 irqptr->airq_iflag = *intr_flagp; 977 irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector; 978 irqptr->airq_share_id = share_id; 979 apic_record_rdt_entry(irqptr, irqno); 980 *irqptrp = irqptr; 981 #ifdef DEBUG 982 /* shuffle the pointers to test apic_delspl path */ 983 if (tmpirqp) { 984 tmpirqp->airq_next = irqptr->airq_next; 985 irqptr->airq_next = apic_irq_table[chosen_irq]; 986 apic_irq_table[chosen_irq] = irqptr; 987 } 988 #endif /* DEBUG */ 989 mutex_exit(&airq_mutex); 990 return (VIRTIRQ(chosen_irq, share_id)); 991 } 992 return (-1); 993 } 994 995 /* 996 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry 997 * is used already, we will try to allocate a new irqno. 
 *
 * Return value:
 *	Success: irqno (possibly a freshly allocated one if the requested
 *	    slot was already in use, or a shared/existing one)
 *	Failure: -1
 */
static int
apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
    struct intrspec *ispec, iflag_t *intr_flagp, int type)
{
	/*
	 * NOTE(review): ispec is dereferenced here before the ASSERT below,
	 * and before the (dead) "ispec == NULL" check further down — the
	 * function effectively requires ispec != NULL.
	 */
	int origirq = ispec->intrspec_vec;
	uchar_t ipl = ispec->intrspec_pri;
	int newirq, intr_index;
	uchar_t ipin, ioapic, ioapicindex, vector;
	apic_irq_t *irqptr;
	major_t major;
	dev_info_t *sdip;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
	    "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));

	ASSERT(ispec != NULL);

	major = (dip != NULL) ? ddi_driver_major(dip) : 0;

	if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
		/* MSI/X doesn't need to setup ioapic stuffs */
		ioapicindex = 0xff;
		ioapic = 0xff;
		ipin = (uchar_t)0xff;
		intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
		    MSIX_INDEX;
		mutex_enter(&airq_mutex);
		if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
			mutex_exit(&airq_mutex);
			/* need an irq for MSI/X to index into autovect[] */
			cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
			    ddi_get_name(dip), ddi_get_instance(dip));
			return (-1);
		}
		mutex_exit(&airq_mutex);

	} else if (intrp != NULL) {
		/* MP-table supplied routing entry */
		intr_index = (int)(intrp - apic_io_intrp);
		ioapic = intrp->intr_destid;
		ipin = intrp->intr_destintin;
		/* Find ioapicindex. If destid was ALL, we will exit with 0. */
		for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
			if (apic_io_id[ioapicindex] == ioapic)
				break;
		ASSERT((ioapic == apic_io_id[ioapicindex]) ||
		    (ioapic == INTR_ALL_APIC));

		/* check whether this intin# has been used by another irqno */
		if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
			return (newirq);
		}

	} else if (intr_flagp != NULL) {
		/* ACPI case */
		intr_index = ACPI_INDEX;
		ioapicindex = acpi_find_ioapic(irqno);
		ASSERT(ioapicindex != 0xFF);
		ioapic = apic_io_id[ioapicindex];
		ipin = irqno - apic_io_vectbase[ioapicindex];
		/* If the slot already carries this ACPI routing, reuse it. */
		if (apic_irq_table[irqno] &&
		    apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
			ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
			    apic_irq_table[irqno]->airq_ioapicindex ==
			    ioapicindex);
			return (irqno);
		}

	} else {
		/* default configuration */
		ioapicindex = 0;
		ioapic = apic_io_id[ioapicindex];
		ipin = (uchar_t)irqno;
		intr_index = DEFAULT_INDEX;
	}

	/*
	 * NOTE(review): this NULL check is unreachable given the ASSERT and
	 * the dereference of ispec at the top of the function.
	 */
	if (ispec == NULL) {
		APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n",
		    irqno));
	} else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
		/* No free vector at this ipl: try sharing one. */
		if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
		    ipl, ioapicindex, ipin, &irqptr)) != -1) {
			irqptr->airq_ipl = ipl;
			irqptr->airq_origirq = (uchar_t)origirq;
			irqptr->airq_dip = dip;
			irqptr->airq_major = major;
			sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
			/* This is OK to do really */
			if (sdip == NULL) {
				cmn_err(CE_WARN, "Sharing vectors: %s"
				    " instance %d and SCI",
				    ddi_get_name(dip), ddi_get_instance(dip));
			} else {
				cmn_err(CE_WARN, "Sharing vectors: %s"
				    " instance %d and %s instance %d",
				    ddi_get_name(sdip), ddi_get_instance(sdip),
				    ddi_get_name(dip), ddi_get_instance(dip));
			}
			return (newirq);
		}
		/* try high priority allocation now that share has failed */
		if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
			cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
			    ddi_get_name(dip), ddi_get_instance(dip));
			return (-1);
		}
	}

	mutex_enter(&airq_mutex);
	if (apic_irq_table[irqno] == NULL) {
		irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		apic_irq_table[irqno] = irqptr;
	} else {
		irqptr = apic_irq_table[irqno];
		if (irqptr->airq_mps_intr_index != FREE_INDEX) {
			/*
			 * The slot is used by another irqno, so allocate
			 * a free irqno for this interrupt
			 */
			newirq = apic_allocate_irq(apic_first_avail_irq);
			if (newirq == -1) {
				mutex_exit(&airq_mutex);
				return (-1);
			}
			irqno = newirq;
			irqptr = apic_irq_table[irqno];
			if (irqptr == NULL) {
				irqptr = kmem_zalloc(sizeof (apic_irq_t),
				    KM_SLEEP);
				irqptr->airq_temp_cpu = IRQ_UNINIT;
				apic_irq_table[irqno] = irqptr;
			}
			/* keep vector<->irq mapping consistent with new irq */
			vector = apic_modify_vector(vector, newirq);
		}
	}
	apic_max_device_irq = max(irqno, apic_max_device_irq);
	apic_min_device_irq = min(irqno, apic_min_device_irq);
	mutex_exit(&airq_mutex);
	/* Populate the (now private-to-us) entry. */
	irqptr->airq_ioapicindex = ioapicindex;
	irqptr->airq_intin_no = ipin;
	irqptr->airq_ipl = ipl;
	irqptr->airq_vector = vector;
	irqptr->airq_origirq = (uchar_t)origirq;
	irqptr->airq_share_id = 0;
	irqptr->airq_mps_intr_index = (short)intr_index;
	irqptr->airq_dip = dip;
	irqptr->airq_major = major;
	irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
	if (intr_flagp)
		irqptr->airq_iflag = *intr_flagp;

	if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
		/* setup I/O APIC entry for non-MSI/X interrupts */
		apic_record_rdt_entry(irqptr, irqno);
	}
	return (irqno);
}

/*
 * return the cpu to which this intr should be bound.
1163 * Check properties or any other mechanism to see if user wants it 1164 * bound to a specific CPU. If so, return the cpu id with high bit set. 1165 * If not, use the policy to choose a cpu and return the id. 1166 */ 1167 uint32_t 1168 apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin) 1169 { 1170 int instance, instno, prop_len, bind_cpu, count; 1171 uint_t i, rc; 1172 uint32_t cpu; 1173 major_t major; 1174 char *name, *drv_name, *prop_val, *cptr; 1175 char prop_name[32]; 1176 ulong_t iflag; 1177 1178 1179 if (apic_intr_policy == INTR_LOWEST_PRIORITY) 1180 return (IRQ_UNBOUND); 1181 1182 if (apic_nproc == 1) 1183 return (0); 1184 1185 drv_name = NULL; 1186 rc = DDI_PROP_NOT_FOUND; 1187 major = (major_t)-1; 1188 if (dip != NULL) { 1189 name = ddi_get_name(dip); 1190 major = ddi_name_to_major(name); 1191 drv_name = ddi_major_to_name(major); 1192 instance = ddi_get_instance(dip); 1193 if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) { 1194 i = apic_min_device_irq; 1195 for (; i <= apic_max_device_irq; i++) { 1196 1197 if ((i == irq) || (apic_irq_table[i] == NULL) || 1198 (apic_irq_table[i]->airq_mps_intr_index 1199 == FREE_INDEX)) 1200 continue; 1201 1202 if ((apic_irq_table[i]->airq_major == major) && 1203 (!(apic_irq_table[i]->airq_cpu & 1204 IRQ_USER_BOUND))) { 1205 1206 cpu = apic_irq_table[i]->airq_cpu; 1207 1208 cmn_err(CE_CONT, 1209 "!%s: %s (%s) instance #%d " 1210 "irq 0x%x vector 0x%x ioapic 0x%x " 1211 "intin 0x%x is bound to cpu %d\n", 1212 psm_name, 1213 name, drv_name, instance, irq, 1214 apic_irq_table[irq]->airq_vector, 1215 ioapicid, intin, cpu); 1216 return (cpu); 1217 } 1218 } 1219 } 1220 /* 1221 * search for "drvname"_intpt_bind_cpus property first, the 1222 * syntax of the property should be "a[,b,c,...]" where 1223 * instance 0 binds to cpu a, instance 1 binds to cpu b, 1224 * instance 3 binds to cpu c... 
1225 * ddi_getlongprop() will search /option first, then / 1226 * if "drvname"_intpt_bind_cpus doesn't exist, then find 1227 * intpt_bind_cpus property. The syntax is the same, and 1228 * it applies to all the devices if its "drvname" specific 1229 * property doesn't exist 1230 */ 1231 (void) strcpy(prop_name, drv_name); 1232 (void) strcat(prop_name, "_intpt_bind_cpus"); 1233 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name, 1234 (caddr_t)&prop_val, &prop_len); 1235 if (rc != DDI_PROP_SUCCESS) { 1236 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, 1237 "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len); 1238 } 1239 } 1240 if (rc == DDI_PROP_SUCCESS) { 1241 for (i = count = 0; i < (prop_len - 1); i++) 1242 if (prop_val[i] == ',') 1243 count++; 1244 if (prop_val[i-1] != ',') 1245 count++; 1246 /* 1247 * if somehow the binding instances defined in the 1248 * property are not enough for this instno., then 1249 * reuse the pattern for the next instance until 1250 * it reaches the requested instno 1251 */ 1252 instno = instance % count; 1253 i = 0; 1254 cptr = prop_val; 1255 while (i < instno) 1256 if (*cptr++ == ',') 1257 i++; 1258 bind_cpu = stoi(&cptr); 1259 kmem_free(prop_val, prop_len); 1260 /* if specific CPU is bogus, then default to next cpu */ 1261 if (!apic_cpu_in_range(bind_cpu)) { 1262 cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present", 1263 psm_name, prop_name, prop_val, bind_cpu); 1264 rc = DDI_PROP_NOT_FOUND; 1265 } else { 1266 /* indicate that we are bound at user request */ 1267 bind_cpu |= IRQ_USER_BOUND; 1268 } 1269 /* 1270 * no need to check apic_cpus[].aci_status, if specific CPU is 1271 * not up, then post_cpu_start will handle it. 
1272 */ 1273 } 1274 if (rc != DDI_PROP_SUCCESS) { 1275 iflag = intr_clear(); 1276 lock_set(&apic_ioapic_lock); 1277 bind_cpu = apic_get_next_bind_cpu(); 1278 lock_clear(&apic_ioapic_lock); 1279 intr_restore(iflag); 1280 } 1281 1282 if (drv_name != NULL) 1283 cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x " 1284 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", 1285 psm_name, name, drv_name, instance, irq, 1286 apic_irq_table[irq]->airq_vector, ioapicid, intin, 1287 bind_cpu & ~IRQ_USER_BOUND); 1288 else 1289 cmn_err(CE_CONT, "!%s: irq 0x%x " 1290 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", 1291 psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid, 1292 intin, bind_cpu & ~IRQ_USER_BOUND); 1293 1294 return ((uint32_t)bind_cpu); 1295 } 1296 1297 /* 1298 * Mark vector as being in the process of being deleted. Interrupts 1299 * may still come in on some CPU. The moment an interrupt comes with 1300 * the new vector, we know we can free the old one. Called only from 1301 * addspl and delspl with interrupts disabled. Because an interrupt 1302 * can be shared, but no interrupt from either device may come in, 1303 * we also use a timeout mechanism, which we arbitrarily set to 1304 * apic_revector_timeout microseconds. 1305 */ 1306 static void 1307 apic_mark_vector(uchar_t oldvector, uchar_t newvector) 1308 { 1309 ulong_t iflag; 1310 1311 iflag = intr_clear(); 1312 lock_set(&apic_revector_lock); 1313 if (!apic_oldvec_to_newvec) { 1314 apic_oldvec_to_newvec = 1315 kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2, 1316 KM_NOSLEEP); 1317 1318 if (!apic_oldvec_to_newvec) { 1319 /* 1320 * This failure is not catastrophic. 1321 * But, the oldvec will never be freed. 
			 */
			apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
			lock_clear(&apic_revector_lock);
			intr_restore(iflag);
			return;
		}
		/* new->old map is the second half of the same allocation */
		apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
	}

	/* See if we already did this for drivers which do double addintrs */
	if (apic_oldvec_to_newvec[oldvector] != newvector) {
		apic_oldvec_to_newvec[oldvector] = newvector;
		apic_newvec_to_oldvec[newvector] = oldvector;
		apic_revector_pending++;
	}
	lock_clear(&apic_revector_lock);
	intr_restore(iflag);
	/* safety net: free the old vector even if no interrupt ever arrives */
	(void) timeout(apic_xlate_vector_free_timeout_handler,
	    (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
}

/*
 * xlate_vector is called from intr_enter if revector_pending is set.
 * It will xlate it if needed and mark the old vector as free.
 *
 * Returns the (possibly translated) vector the caller should use.
 */
uchar_t
apic_xlate_vector(uchar_t vector)
{
	uchar_t	newvector, oldvector = 0;

	lock_set(&apic_revector_lock);
	/* Do we really need to do this ? */
	if (!apic_revector_pending) {
		lock_clear(&apic_revector_lock);
		return (vector);
	}
	if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
		oldvector = vector;
	else {
		/*
		 * The incoming vector is new . See if a stale entry is
		 * remaining
		 */
		if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
			newvector = vector;
	}

	if (oldvector) {
		apic_revector_pending--;
		apic_oldvec_to_newvec[oldvector] = 0;
		apic_newvec_to_oldvec[newvector] = 0;
		apic_free_vector(oldvector);
		lock_clear(&apic_revector_lock);
		/* There could have been more than one reprogramming! */
		return (apic_xlate_vector(newvector));
	}
	lock_clear(&apic_revector_lock);
	return (vector);
}

/*
 * Timeout handler armed by apic_mark_vector(): frees the old vector if no
 * interrupt on the new vector ever arrived to trigger the free path above.
 */
void
apic_xlate_vector_free_timeout_handler(void *arg)
{
	ulong_t iflag;
	uchar_t oldvector, newvector;

	oldvector = (uchar_t)(uintptr_t)arg;
	iflag = intr_clear();
	lock_set(&apic_revector_lock);
	if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
		apic_free_vector(oldvector);
		apic_oldvec_to_newvec[oldvector] = 0;
		apic_newvec_to_oldvec[newvector] = 0;
		apic_revector_pending--;
	}

	lock_clear(&apic_revector_lock);
	intr_restore(iflag);
}

/*
 * Bind interrupt corresponding to irq_ptr to bind_cpu.
 * Must be called with interrupts disabled and apic_ioapic_lock held
 *
 * Returns 0 on success (or deferral), 1 if bind_cpu is not accepting
 * interrupts.
 */
int
apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
    struct ioapic_reprogram_data *drep)
{
	int			ioapicindex, intin_no;
	uint32_t		airq_temp_cpu;
	apic_cpus_info_t	*cpu_infop;
	uint32_t		rdt_entry;
	int			which_irq;
	ioapic_rdt_t		irdt;

	which_irq = apic_vector_to_irq[irq_ptr->airq_vector];

	intin_no = irq_ptr->airq_intin_no;
	ioapicindex = irq_ptr->airq_ioapicindex;
	airq_temp_cpu = irq_ptr->airq_temp_cpu;
	if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
		if (airq_temp_cpu & IRQ_USER_BOUND)
			/* Mask off high bit so it can be used as array index */
			airq_temp_cpu &= ~IRQ_USER_BOUND;

		ASSERT(apic_cpu_in_range(airq_temp_cpu));
	}

	/*
	 * Can't bind to a CPU that's not accepting interrupts:
	 */
	cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
	if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
		return (1);

	/*
	 * If we are about to change the interrupt vector for this interrupt,
	 * and this interrupt is level-triggered, attached to an IOAPIC,
	 * has been delivered to a CPU and that CPU has not handled it
	 * yet, we cannot reprogram the IOAPIC now.
	 */
	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {

		rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex,
		    intin_no);

		/* defer if a stale vector is still in flight on some CPU */
		if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) &&
		    apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu,
		    bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) {

			return (0);
		}

		/*
		 * NOTE: We do not unmask the RDT here, as an interrupt MAY
		 * still come in before we have a chance to reprogram it below.
		 * The reprogramming below will simultaneously change and
		 * unmask the RDT entry.
		 */

		if ((uint32_t)bind_cpu == IRQ_UNBOUND) {
			/* logical destination, lowest priority, any CPU */
			irdt.ir_lo = AV_LDEST | AV_LOPRI |
			    irq_ptr->airq_rdt_entry;

			WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
			    AV_TOALL);

			if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu !=
			    IRQ_UNBOUND)
				apic_cpus[airq_temp_cpu].aci_temp_bound--;

			/*
			 * Write the vector, trigger, and polarity portion of
			 * the RDT
			 */
			WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
			    irdt.ir_lo);

			irq_ptr->airq_temp_cpu = IRQ_UNBOUND;
			return (0);
		}
	}

	/* account the binding on the target CPU */
	if (bind_cpu & IRQ_USER_BOUND) {
		cpu_infop->aci_bound++;
	} else {
		cpu_infop->aci_temp_bound++;
	}
	ASSERT(apic_cpu_in_range(bind_cpu));

	if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) {
		apic_cpus[airq_temp_cpu].aci_temp_bound--;
	}
	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {

		/* physical destination, fixed delivery to one CPU */
		irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry;
		irdt.ir_hi = cpu_infop->aci_local_id;

		/* Write the RDT entry -- bind to a specific CPU: */
		WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
		    irdt.ir_hi << APIC_ID_BIT_OFFSET);

		/* Write the vector, trigger, and polarity portion of the RDT */
		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
		    irdt.ir_lo);

	} else {
		int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ?
		    DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX;
		if (type == DDI_INTR_TYPE_MSI) {
			/*
			 * NOTE(review): for MSI, airq_ioapicindex appears to
			 * be reused as the base of the MSI vector block —
			 * these origirq comparisons detect the first/last
			 * vector of the block; confirm against the MSI
			 * allocation path.
			 */
			if (irq_ptr->airq_ioapicindex ==
			    irq_ptr->airq_origirq) {
				/* first one */
				DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
				    "apic_pci_msi_enable_vector\n"));
				apic_pci_msi_enable_vector(irq_ptr,
				    type, which_irq, irq_ptr->airq_vector,
				    irq_ptr->airq_intin_no,
				    cpu_infop->aci_local_id);
			}
			if ((irq_ptr->airq_ioapicindex +
			    irq_ptr->airq_intin_no - 1) ==
			    irq_ptr->airq_origirq) { /* last one */
				DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
				    "apic_pci_msi_enable_mode\n"));
				apic_pci_msi_enable_mode(irq_ptr->airq_dip,
				    type, which_irq);
			}
		} else { /* MSI-X */
			apic_pci_msi_enable_vector(irq_ptr, type,
			    irq_ptr->airq_origirq, irq_ptr->airq_vector, 1,
			    cpu_infop->aci_local_id);
			apic_pci_msi_enable_mode(irq_ptr->airq_dip, type,
			    irq_ptr->airq_origirq);
		}
	}
	irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu;
	/* this CPU is carrying an interrupt again; make it eligible */
	apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND));
	return (0);
}

/*
 * Last-resort attempt to clear a stuck Remote IRR bit on an IOAPIC pin by
 * toggling the trigger mode edge->level.
 */
static void
apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no)
{
	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no)
	    & AV_REMOTE_IRR) != 0) {
		/*
		 * Trying to clear the bit through normal
		 * channels has failed.  So as a last-ditch
		 * effort, try to set the trigger mode to
		 * edge, then to level.  This has been
		 * observed to work on many systems.
		 */
		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no,
		    READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) & ~AV_LEVEL);

		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no,
		    READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) | AV_LEVEL);

		/*
		 * If the bit's STILL set, this interrupt may
		 * be hosed.
		 */
		if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) & AV_REMOTE_IRR) != 0) {

			prom_printf("%s: Remote IRR still "
			    "not clear for IOAPIC %d intin %d.\n"
			    "\tInterrupts to this pin may cease "
			    "functioning.\n", psm_name, ioapic_ix,
			    intin_no);
#ifdef DEBUG
			apic_last_ditch_reprogram_failures++;
#endif
		}
	}
}

/*
 * Mark the deferred-reprogramming entry for which_irq as completed.
 * This function is protected by apic_ioapic_lock coupled with the
 * fact that interrupts are disabled.
 */
static void
delete_defer_repro_ent(int which_irq)
{
	ASSERT(which_irq >= 0);
	ASSERT(which_irq <= 255);
	ASSERT(LOCK_HELD(&apic_ioapic_lock));

	if (apic_reprogram_info[which_irq].done)
		return;

	apic_reprogram_info[which_irq].done = B_TRUE;

#ifdef DEBUG
	apic_defer_repro_total_retries +=
	    apic_reprogram_info[which_irq].tries;

	apic_defer_repro_successes++;
#endif

	/* last outstanding entry: restore the normal interrupt-exit hook */
	if (--apic_reprogram_outstanding == 0) {

		setlvlx = psm_intr_exit_fn();
	}
}


/*
 * Record (or refresh) a deferred reprogramming of which_irq to new_bind_cpu.
 * Interrupts must be disabled during this function to prevent
 * self-deadlock.  Interrupts are disabled because this function
 * is called from apic_check_stuck_interrupt(), which is called
 * from apic_rebind(), which requires its caller to disable interrupts.
 */
static void
add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu)
{
	ASSERT(which_irq >= 0);
	ASSERT(which_irq <= 255);
	ASSERT(!interrupts_enabled());

	/*
	 * On the off-chance that there's already a deferred
	 * reprogramming on this irq, check, and if so, just update the
	 * CPU and irq pointer to which the interrupt is targeted, then return.
	 */
	if (!apic_reprogram_info[which_irq].done) {
		apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
		apic_reprogram_info[which_irq].irqp = irq_ptr;
		return;
	}

	apic_reprogram_info[which_irq].irqp = irq_ptr;
	apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
	apic_reprogram_info[which_irq].tries = 0;
	/*
	 * This must be the last thing set, since we're not
	 * grabbing any locks, apic_try_deferred_reprogram() will
	 * make its decision about using this entry iff done
	 * is false.
	 */
	apic_reprogram_info[which_irq].done = B_FALSE;

	/*
	 * If there were previously no deferred reprogrammings, change
	 * setlvlx to call apic_try_deferred_reprogram()
	 */
	if (++apic_reprogram_outstanding == 1) {

		setlvlx = apic_try_deferred_reprogram;
	}
}

/*
 * Interrupt-exit hook installed while deferred reprogrammings are
 * outstanding; retries at most one deferred entry per invocation.
 */
static void
apic_try_deferred_reprogram(int prev_ipl, int irq)
{
	int reproirq;
	ulong_t iflag;
	struct ioapic_reprogram_data *drep;

	/* perform the normal interrupt-exit processing first */
	(*psm_intr_exit_fn())(prev_ipl, irq);

	if (!lock_try(&apic_defer_reprogram_lock)) {
		return;
	}

	/*
	 * Acquire the apic_ioapic_lock so that any other operations that
	 * may affect the apic_reprogram_info state are serialized.
	 * It's still possible for the last deferred reprogramming to clear
	 * between the time we entered this function and the time we get to
	 * the for loop below.  In that case, *setlvlx will have been set
	 * back to *_intr_exit and drep will be NULL. (There's no way to
	 * stop that from happening -- we would need to grab a lock before
	 * calling *setlvlx, which is neither realistic nor prudent).
	 */
	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	/*
	 * For each deferred RDT entry, try to reprogram it now.  Note that
	 * there is no lock acquisition to read apic_reprogram_info because
	 * '.done' is set only after the other fields in the structure are set.
	 */

	drep = NULL;
	for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) {
		if (apic_reprogram_info[reproirq].done == B_FALSE) {
			drep = &apic_reprogram_info[reproirq];
			break;
		}
	}

	/*
	 * Either we found a deferred action to perform, or
	 * we entered this function spuriously, after *setlvlx
	 * was restored to point to *_intr_exit.  Any other
	 * permutation is invalid.
	 */
	ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn());

	/*
	 * Though we can't really do anything about errors
	 * at this point, keep track of them for reporting.
	 * Note that it is very possible for apic_setup_io_intr
	 * to re-register this very timeout if the Remote IRR bit
	 * has not yet cleared.
	 */

#ifdef DEBUG
	if (drep != NULL) {
		if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) {
			apic_deferred_setup_failures++;
		}
	} else {
		apic_deferred_spurious_enters++;
	}
#else
	if (drep != NULL)
		(void) apic_setup_io_intr(drep, reproirq, B_TRUE);
#endif

	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	lock_clear(&apic_defer_reprogram_lock);
}

/*
 * Spin (bounded by apic_max_reps_clear_pending iterations) until the
 * delivery-pending bit of a level-triggered IOAPIC pin clears.
 */
static void
apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no)
{
	int waited;

	/*
	 * Wait for the delivery pending bit to clear.
	 */
	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
	    (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) {

		/*
		 * If we're still waiting on the delivery of this interrupt,
		 * continue to wait here until it is delivered (this should be
		 * a very small amount of time, but include a timeout just in
		 * case).
		 */
		for (waited = 0; waited < apic_max_reps_clear_pending;
		    waited++) {
			if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
			    intin_no) & AV_PENDING) == 0) {
				break;
			}
		}
	}
}


/*
 * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
 * bit set.  Calls functions that modify the function that setlvlx points to,
 * so that the reprogramming can be retried very shortly.
 *
 * This function will mask the RDT entry if the interrupt is level-triggered.
 * (The caller is responsible for unmasking the RDT entry.)
 *
 * Returns non-zero if the caller should defer IOAPIC reprogramming.
 */
static int
apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep)
{
	int32_t			rdt_entry;
	int			waited;
	int			reps = 0;

	/*
	 * Wait for the delivery pending bit to clear.
	 */
	do {
		++reps;

		apic_ioapic_wait_pending_clear(ioapic_ix, intin_no);

		/*
		 * Mask the RDT entry, but only if it's a level-triggered
		 * interrupt
		 */
		rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no);
		if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) {

			/* Mask it */
			WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
			    AV_MASK | rdt_entry);
		}

		if ((rdt_entry & AV_LEVEL) == AV_LEVEL) {
			/*
			 * If there was a race and an interrupt was injected
			 * just before we masked, check for that case here.
			 * Then, unmask the RDT entry and try again.  If we're
			 * on our last try, don't unmask (because we want the
			 * RDT entry to remain masked for the rest of the
			 * function).
			 */
			rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
			    intin_no);
			if ((rdt_entry & AV_PENDING) &&
			    (reps < apic_max_reps_clear_pending)) {
				/* Unmask it */
				WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
				    intin_no, rdt_entry & ~AV_MASK);
			}
		}

	} while ((rdt_entry & AV_PENDING) &&
	    (reps < apic_max_reps_clear_pending));

#ifdef DEBUG
	if (rdt_entry & AV_PENDING)
		apic_intr_deliver_timeouts++;
#endif

	/*
	 * If the remote IRR bit is set, then the interrupt has been sent
	 * to a CPU for processing.  We have no choice but to wait for
	 * that CPU to process the interrupt, at which point the remote IRR
	 * bit will be cleared.
	 */
	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
	    (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {

		/*
		 * If the CPU that this RDT is bound to is NOT the current
		 * CPU, wait until that CPU handles the interrupt and ACKs
		 * it.  If this interrupt is not bound to any CPU (that is,
		 * if it's bound to the logical destination of "anyone"), it
		 * may have been delivered to the current CPU so handle that
		 * case by deferring the reprogramming (below).
		 */
		if ((old_bind_cpu != IRQ_UNBOUND) &&
		    (old_bind_cpu != IRQ_UNINIT) &&
		    (old_bind_cpu != psm_get_cpu_id())) {
			for (waited = 0; waited < apic_max_reps_clear_pending;
			    waited++) {
				if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
				    intin_no) & AV_REMOTE_IRR) == 0) {

					delete_defer_repro_ent(which_irq);

					/* Remote IRR has cleared! */
					return (0);
				}
			}
		}

		/*
		 * If we waited and the Remote IRR bit is still not cleared,
		 * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS
		 * times for this interrupt, try the last-ditch workaround:
		 */
		if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {

			apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);

			/* Mark this one as reprogrammed: */
			delete_defer_repro_ent(which_irq);

			return (0);
		} else {
#ifdef DEBUG
			apic_intr_deferrals++;
#endif

			/*
			 * If waiting for the Remote IRR bit (above) didn't
			 * allow it to clear, defer the reprogramming.
			 * Add a new deferred-programming entry if the
			 * caller passed a NULL one (and update the existing one
			 * in case anything changed).
			 */
			add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
			if (drep)
				drep->tries++;

			/* Inform caller to defer IOAPIC programming: */
			return (1);
		}

	}

	/* Remote IRR is clear */
	delete_defer_repro_ent(which_irq);

	return (0);
}

/*
 * Called to migrate all interrupts at an irq to another cpu.
 * Must be called with interrupts disabled and apic_ioapic_lock held
 *
 * Returns the OR of the apic_rebind() results for every entry in the
 * share chain (non-zero if any rebind failed).
 */
int
apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
{
	apic_irq_t	*irqptr = irq_ptr;
	int		retval = 0;

	/* walk the shared-vector chain and rebind each initialized entry */
	while (irqptr) {
		if (irqptr->airq_temp_cpu != IRQ_UNINIT)
			retval |= apic_rebind(irqptr, bind_cpu, NULL);
		irqptr = irqptr->airq_next;
	}

	return (retval);
}

/*
 * apic_intr_redistribute does all the messy computations for identifying
 * which interrupt to move to which CPU. Currently we do just one interrupt
 * at a time. This reduces the time we spent doing all this within clock
 * interrupt. When it is done in idle, we could do more than 1.
 * First we find the most busy and the most free CPU (time in ISR only)
 * skipping those CPUs that has been identified as being ineligible (cpu_skip)
 * Then we look for IRQs which are closest to the difference between the
 * most busy CPU and the average ISR load. We try to find one whose load
 * is less than difference.If none exists, then we chose one larger than the
 * difference, provided it does not make the most idle CPU worse than the
 * most busy one. In the end, we clear all the busy fields for CPUs. For
 * IRQs, they are cleared as they are scanned.
 */
void
apic_intr_redistribute(void)
{
	int busiest_cpu, most_free_cpu;
	int cpu_free, cpu_busy, max_busy, min_busy;
	int min_free, diff;
	int average_busy, cpus_online;
	int i, busy;
	ulong_t iflag;
	apic_cpus_info_t *cpu_infop;
	apic_irq_t *min_busy_irq = NULL;
	apic_irq_t *max_busy_irq = NULL;

	busiest_cpu = most_free_cpu = -1;
	cpu_free = cpu_busy = max_busy = average_busy = 0;
	min_free = apic_sample_factor_redistribution;
	cpus_online = 0;
	/*
	 * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
	 * without ioapic_lock. That is OK as we are just doing statistical
	 * sampling anyway and any inaccuracy now will get corrected next time
	 * The call to rebind which actually changes things will make sure
	 * we are consistent.
	 */
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i) &&
		    !(apic_redist_cpu_skip & (1 << i)) &&
		    (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {

			cpu_infop = &apic_cpus[i];
			/*
			 * If no unbound interrupts or only 1 total on this
			 * CPU, skip
			 */
			if (!cpu_infop->aci_temp_bound ||
			    (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
			    == 1) {
				apic_redist_cpu_skip |= 1 << i;
				continue;
			}

			busy = cpu_infop->aci_busy;
			average_busy += busy;
			cpus_online++;
			if (max_busy < busy) {
				max_busy = busy;
				busiest_cpu = i;
			}
			if (min_free > busy) {
				min_free = busy;
				most_free_cpu = i;
			}
			if (busy > apic_int_busy_mark) {
				cpu_busy |= 1 << i;
			} else {
				if (busy < apic_int_free_mark)
					cpu_free |= 1 << i;
			}
		}
	}
	/* rebalance only when both a busy and a free CPU exist, or the gap */
	/* between busiest and freest exceeds the redistribution threshold  */
	if ((cpu_busy && cpu_free) ||
	    (max_busy >= (min_free + apic_diff_for_redistribution))) {

		apic_num_imbalance++;
#ifdef DEBUG
		if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
			prom_printf(
			    "redistribute busy=%x free=%x max=%x min=%x",
			    cpu_busy, cpu_free, max_busy, min_free);
		}
#endif /* DEBUG */


		average_busy /= cpus_online;

		diff = max_busy - average_busy;
		min_busy = max_busy; /* start with the max possible value */
		max_busy = 0;
		min_busy_irq = max_busy_irq = NULL;
		i = apic_min_device_irq;
		for (; i <= apic_max_device_irq; i++) {
			apic_irq_t *irq_ptr;
			/* Change to linked list per CPU ? */
			if ((irq_ptr = apic_irq_table[i]) == NULL)
				continue;
			/* Check for irq_busy & decide which one to move */
			/* Also zero them for next round */
			if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
			    irq_ptr->airq_busy) {
				if (irq_ptr->airq_busy < diff) {
					/*
					 * Check for least busy CPU,
					 * best fit or what ?
					 */
					if (max_busy < irq_ptr->airq_busy) {
						/*
						 * Most busy within the
						 * required differential
						 */
						max_busy = irq_ptr->airq_busy;
						max_busy_irq = irq_ptr;
					}
				} else {
					if (min_busy > irq_ptr->airq_busy) {
						/*
						 * least busy, but more than
						 * the reqd diff
						 */
						if (min_busy <
						    (diff + average_busy -
						    min_free)) {
							/*
							 * Making sure new cpu
							 * will not end up
							 * worse
							 */
							min_busy =
							    irq_ptr->airq_busy;

							min_busy_irq = irq_ptr;
						}
					}
				}
			}
			irq_ptr->airq_busy = 0;
		}

		if (max_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    max_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */
			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(max_busy_irq,
				    most_free_cpu) == 0) {
					/* Make change permanent */
					max_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else if (min_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    min_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */

			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(min_busy_irq,
				    most_free_cpu) == 0) {
					/* Make change permanent */
					min_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else {
			if (cpu_busy != (1 << busiest_cpu)) {
				apic_redist_cpu_skip |= 1 << busiest_cpu;
				/*
				 * We leave cpu_skip set so that next time we
				 * can choose another cpu
				 */
			}
		}
		apic_num_rebind++;
	} else {
		/*
		 * found nothing. Could be that we skipped over valid CPUs
		 * or we have balanced everything. If we had a variable
		 * ticks_for_redistribution, it could be increased here.
		 * apic_int_busy, int_free etc would also need to be
		 * changed.
		 */
		if (apic_redist_cpu_skip)
			apic_redist_cpu_skip = 0;
	}
	/* reset per-CPU busy accounting for the next sampling interval */
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}
}

/*
 * Clear all per-CPU and per-irq busy accounting (e.g. when redistribution
 * statistics must be restarted from scratch).
 */
void
apic_cleanup_busy(void)
{
	int i;
	apic_irq_t *irq_ptr;

	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}

	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
		if ((irq_ptr = apic_irq_table[i]) != NULL)
			irq_ptr->airq_busy = 0;
	}
}

/*
 * PSM hook: this implementation always reports the IOAPIC method usable.
 */
int
apic_ioapic_method_probe()
{
	return (PSM_SUCCESS);
}