/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 * Copyright 2016 PALO, Richard.
 */

/*
 * PSMI 1.1 extensions are supported only in 2.6 and later versions.
 * PSMI 1.2 extensions are supported only in 2.7 and later versions.
 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
 * PSMI 1.5 extensions are supported in Solaris Nevada.
 * PSMI 1.6 extensions are supported in Solaris Nevada.
 * PSMI 1.7 extensions are supported in Solaris Nevada.
 */
#define	PSMI_1_7

#include <sys/processor.h>
#include <sys/time.h>
#include <sys/psm.h>
#include <sys/smp_impldefs.h>
#include <sys/cram.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/psm_common.h>
#include <sys/apic.h>
#include <sys/apic_common.h>
#include <sys/pit.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/pci.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/cpc_impl.h>
#include <sys/uadmin.h>
#include <sys/panic.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/trap.h>
#include <sys/machsystm.h>
#include <sys/cpuvar.h>
#include <sys/rm_platter.h>
#include <sys/privregs.h>
#include <sys/cyclic.h>
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/sunndi.h>


/*
 * Local Function Prototypes
 */
static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
static void apic_xlate_vector_free_timeout_handler(void *arg);
static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int apicindex, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep);
static int apic_setup_irq_table(dev_info_t *dip, int irqno,
    struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
    int type);
static void apic_try_deferred_reprogram(int ipl, int vect);
static void delete_defer_repro_ent(int which_irq);
static void apic_ioapic_wait_pending_clear(int ioapicindex,
    int intin_no);

extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
    int ipin, int *pci_irqp, iflag_t *intr_flagp);
extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
    int child_ipin, struct apic_io_intr **intrp);
extern uchar_t acpi_find_ioapic(int irq);
extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
extern int apic_find_bus_id(int bustype);
extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);

extern int apic_sci_vect;
extern iflag_t apic_sci_flags;
extern int apic_intr_policy;
extern char *psm_name;

/*
 * Maximum value of an unsigned char, computed from NBBY (the number of
 * bits per byte, from <sys/param.h>).
 */
#define	UCHAR_MAX	((1 << NBBY) - 1)

/* Max wait time (in repetitions) for flags to clear in an RDT entry. */
extern int apic_max_reps_clear_pending;

/* The irq # is implicit in the array index: */
struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];
/*
 * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well.  ioapic_reprogram_info
 * is indexed by IRQ number, NOT by vector number.
 */

extern int	apic_int_busy_mark;
extern int	apic_int_free_mark;
extern int	apic_diff_for_redistribution;
extern int	apic_sample_factor_redistribution;
extern int	apic_redist_cpu_skip;
extern int	apic_num_imbalance;
extern int	apic_num_rebind;

/* timeout for xlate_vector, mark_vector */
int	apic_revector_timeout = 16 * 10000;	/* 160 millisec */

extern int	apic_defconf;
extern int	apic_irq_translate;

extern int	apic_use_acpi_madt_only;	/* 1=ONLY use MADT from ACPI */

extern uchar_t	apic_io_vectbase[MAX_IO_APIC];

extern boolean_t ioapic_mask_workaround[MAX_IO_APIC];

/*
 * First available slot to be used as IRQ index into the apic_irq_table
 * for those interrupts (like MSI/X) that don't have a physical IRQ.
 */
extern int apic_first_avail_irq;

/*
 * apic_defer_reprogram_lock ensures that only one processor is handling
 * deferred interrupt programming at *_intr_exit time.
 */
static	lock_t	apic_defer_reprogram_lock;

/*
 * The current number of deferred reprogrammings outstanding
 */
uint_t	apic_reprogram_outstanding = 0;

#ifdef DEBUG
/*
 * Counters that keep track of deferred reprogramming stats
 */
uint_t	apic_intr_deferrals = 0;
uint_t	apic_intr_deliver_timeouts = 0;
uint_t	apic_last_ditch_reprogram_failures = 0;
uint_t	apic_deferred_setup_failures = 0;
uint_t	apic_defer_repro_total_retries = 0;
uint_t	apic_defer_repro_successes = 0;
uint_t	apic_deferred_spurious_enters = 0;
#endif

extern int	apic_io_max;
extern struct apic_io_intr *apic_io_intrp;

uchar_t	apic_vector_to_irq[APIC_MAX_VECTOR+1];

extern uint32_t eisa_level_intr_mask;
	/* At least MSB will be set if EISA bus */

extern int	apic_pci_bus_total;
extern uchar_t	apic_single_pci_busid;

/*
 * Following declarations are for revectoring; used when ISRs at different
 * IPLs share an irq.
 */
static	lock_t	apic_revector_lock;
int	apic_revector_pending = 0;
static	uchar_t	*apic_oldvec_to_newvec;
static	uchar_t	*apic_newvec_to_oldvec;

/* ACPI Interrupt Source Override Structure ptr */
ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop;
extern int acpi_iso_cnt;

/*
 * Auto-configuration routines
 */

/*
 * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable
 * are also set to NULL. vector->irq is set to a value which cannot map
 * to a real irq to show that it is free.
 */
void
apic_init_common(void)
{
	int	i, j, indx;
	int	*iptr;

	/*
	 * Initialize apic_ipls from apic_vectortoipl.  This array is
	 * used in apic_intr_enter to determine the IPL to use for the
	 * corresponding vector.
	 * On some systems, due to hardware errata and interrupt sharing,
	 * the IPL may not correspond to the IPL listed in apic_vectortoipl
	 * (see apic_addspl and apic_delspl).
	 */
	for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) {
		indx = i * APIC_VECTOR_PER_IPL;

		for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++)
			apic_ipls[indx] = apic_vectortoipl[i];
	}

	/* cpu 0 is always up (for now) */
	apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE;

	iptr = (int *)&apic_irq_table[0];
	for (i = 0; i <= APIC_MAX_VECTOR; i++) {
		apic_level_intr[i] = 0;
		*iptr++ = 0;
		apic_vector_to_irq[i] = APIC_RESV_IRQ;

		/* These *must* be initted to B_TRUE! */
		apic_reprogram_info[i].done = B_TRUE;
		apic_reprogram_info[i].irqp = NULL;
		apic_reprogram_info[i].tries = 0;
		apic_reprogram_info[i].bindcpu = 0;
	}

	/*
	 * Allocate a dummy irq table entry for the reserved entry.
	 * This takes care of the race between removing an irq and
	 * clock detecting a CPU in that irq during interrupt load
	 * sampling.
	 */
	apic_irq_table[APIC_RESV_IRQ] =
	    kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);

	mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
}

void
ioapic_init_intr(int mask_apic)
{
	int ioapic_ix;
	struct intrspec ispec;
	apic_irq_t *irqptr;
	int i, j;
	ulong_t iflag;

	LOCK_INIT_CLEAR(&apic_revector_lock);
	LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);

	/* mask interrupt vectors */
	for (j = 0; j < apic_io_max && mask_apic; j++) {
		int intin_max;

		ioapic_ix = j;
		/* Bits 23-16 define the maximum redirection entries */
		intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
		    & 0xff;
		for (i = 0; i <= intin_max; i++)
			ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
	}

	/*
	 * Hack alert: deal with ACPI SCI interrupt chicken/egg here
	 */
	if (apic_sci_vect > 0) {
		/*
		 * acpica has already done add_avintr(); we just need
		 * to finish the job by mimicking translate_irq()
		 *
		 * Fake up an intrspec and setup the tables
		 */
		ispec.intrspec_vec = apic_sci_vect;
		ispec.intrspec_pri = SCI_IPL;

		if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
		    &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
			cmn_err(CE_WARN, "!apic: SCI setup failed");
			return;
		}
		irqptr = apic_irq_table[apic_sci_vect];

		iflag = intr_clear();
		lock_set(&apic_ioapic_lock);

		/* Program I/O APIC */
		(void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);

		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);

		irqptr->airq_share++;
	}
}

/*
 * Add mask bits to disable interrupt vector from happening
 * at or above IPL. In addition, it should remove mask bits
 * to enable interrupt vectors below the given IPL.
 *
 * Both add and delspl are complicated by the fact that different interrupts
 * may share IRQs. This can happen in two ways.
 * 1. The same H/W line is shared by more than 1 device
 * 1a. with interrupts at different IPLs
 * 1b. with interrupts at same IPL
 * 2. We ran out of vectors at a given IPL and started sharing vectors.
 * 1b and 2 should be handled gracefully, except for the fact some ISRs
 * will get called often when no interrupt is pending for the device.
 * For 1a, we handle it at the higher IPL.
 */
/*ARGSUSED*/
int
apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
	uchar_t vector;
	ulong_t iflag;
	apic_irq_t *irqptr, *irqheadptr;
	int irqindex;

	ASSERT(max_ipl <= UCHAR_MAX);
	irqindex = IRQINDEX(irqno);

	if ((irqindex == -1) || (!apic_irq_table[irqindex]))
		return (PSM_FAILURE);

	mutex_enter(&airq_mutex);
	irqptr = irqheadptr = apic_irq_table[irqindex];

	DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
	    "vector=0x%x\n", (void *)irqptr->airq_dip,
	    irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

	while (irqptr) {
		if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
			break;
		irqptr = irqptr->airq_next;
	}
	irqptr->airq_share++;

	mutex_exit(&airq_mutex);

	/* return if it is not hardware interrupt */
	if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
		return (PSM_SUCCESS);

	/* Or if there are more interrupts at a higher IPL */
	if (ipl != max_ipl)
		return (PSM_SUCCESS);

	/*
	 * if apic_picinit() has not been called yet, just return.
	 * At the end of apic_picinit(), we will call setup_io_intr().
	 */

	if (!apic_picinit_called)
		return (PSM_SUCCESS);

	/*
	 * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate,
	 * return failure.
	 */
	if (irqptr->airq_ipl != max_ipl &&
	    !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

		vector = apic_allocate_vector(max_ipl, irqindex, 1);
		if (vector == 0) {
			irqptr->airq_share--;
			return (PSM_FAILURE);
		}
		irqptr = irqheadptr;
		apic_mark_vector(irqptr->airq_vector, vector);
		while (irqptr) {
			irqptr->airq_vector = vector;
			irqptr->airq_ipl = (uchar_t)max_ipl;
			/*
			 * reprogram irq being added and every one else
			 * who is not in the UNINIT state
			 */
			if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
			    irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
				apic_record_rdt_entry(irqptr, irqindex);

				iflag = intr_clear();
				lock_set(&apic_ioapic_lock);

				(void) apic_setup_io_intr(irqptr, irqindex,
				    B_FALSE);

				lock_clear(&apic_ioapic_lock);
				intr_restore(iflag);
			}
			irqptr = irqptr->airq_next;
		}
		return (PSM_SUCCESS);

	} else if (irqptr->airq_ipl != max_ipl &&
	    ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
		/*
		 * We cannot upgrade the vector, but we can change
		 * the IPL that this vector induces.
		 *
		 * Note that we subtract APIC_BASE_VECT from the vector
		 * here because this array is used in apic_intr_enter
		 * (no need to add APIC_BASE_VECT in that hot code
		 * path since we can do it in the rarely-executed path
		 * here).
		 */
		apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
		    (uchar_t)max_ipl;

		irqptr = irqheadptr;
		while (irqptr) {
			irqptr->airq_ipl = (uchar_t)max_ipl;
			irqptr = irqptr->airq_next;
		}

		return (PSM_SUCCESS);
	}

	ASSERT(irqptr);

	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	(void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);

	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	return (PSM_SUCCESS);
}

/*
 * Recompute mask bits for the given interrupt vector.
 * If there is no interrupt servicing routine for this
 * vector, this function should disable interrupt vector
 * from happening at all IPLs.
 * If there are still
 * handlers using the given vector, this function should
 * disable the given vector from happening below the lowest
 * IPL of the remaining handlers.
 */
/*ARGSUSED*/
int
apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
	uchar_t vector;
	uint32_t bind_cpu;
	int intin, irqindex;
	int ioapic_ix;
	apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp;
	ulong_t iflag;

	mutex_enter(&airq_mutex);
	irqindex = IRQINDEX(irqno);
	irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];

	DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
	    "vector=0x%x\n", (void *)irqptr->airq_dip,
	    irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

	while (irqptr) {
		if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
			break;
		preirqptr = irqptr;
		irqptr = irqptr->airq_next;
	}
	ASSERT(irqptr);

	irqptr->airq_share--;

	mutex_exit(&airq_mutex);

	/*
	 * If there are more interrupts at a higher IPL, we don't need
	 * to disable anything.
	 */
	if (ipl < max_ipl)
		return (PSM_SUCCESS);

	/* return if it is not hardware interrupt */
	if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
		return (PSM_SUCCESS);

	if (!apic_picinit_called) {
		/*
		 * Clear irq_struct. If two devices shared an intpt
		 * line & 1 unloaded before picinit, we are hosed. But, then
		 * we hope the machine survives.
		 */
		irqptr->airq_mps_intr_index = FREE_INDEX;
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		apic_free_vector(irqptr->airq_vector);
		return (PSM_SUCCESS);
	}
	/*
	 * Downgrade vector to new max_ipl if needed. If we cannot allocate,
	 * use old IPL. Not very elegant, but it should work.
	 */
	if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
	    !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
		apic_irq_t	*irqp;
		if (vector = apic_allocate_vector(max_ipl, irqno, 1)) {
			apic_mark_vector(irqheadptr->airq_vector, vector);
			irqp = irqheadptr;
			while (irqp) {
				irqp->airq_vector = vector;
				irqp->airq_ipl = (uchar_t)max_ipl;
				if (irqp->airq_temp_cpu != IRQ_UNINIT) {
					apic_record_rdt_entry(irqp, irqindex);

					iflag = intr_clear();
					lock_set(&apic_ioapic_lock);

					(void) apic_setup_io_intr(irqp,
					    irqindex, B_FALSE);

					lock_clear(&apic_ioapic_lock);
					intr_restore(iflag);
				}
				irqp = irqp->airq_next;
			}
		}

	} else if (irqptr->airq_ipl != max_ipl &&
	    max_ipl != PSM_INVALID_IPL &&
	    ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

		/*
		 * We cannot downgrade the IPL of the vector below the
		 * vector's hardware priority. If we did, it would be possible
		 * for a higher-priority hardware vector to interrupt a CPU
		 * running at an IPL lower than the hardware priority of the
		 * interrupting vector (but higher than the soft IPL of this
		 * IRQ). When this happens, we would then try to drop the IPL
		 * BELOW what it was (effectively dropping below base_spl)
		 * which would be potentially catastrophic.
		 *
		 * (e.g. Suppose the hardware vector associated with this IRQ
		 * is 0x40 (hardware IPL of 4). Further assume that the old
		 * IPL of this IRQ was 4, but the new IPL is 1. If we forced
		 * vector 0x40 to result in an IPL of 1, it would be possible
		 * for the processor to be executing at IPL 3 and for an
		 * interrupt to come in on vector 0x40, interrupting
		 * the currently-executing ISR.
		 * When apic_intr_enter consults apic_ipls[], it will return
		 * 1, bringing the IPL of the CPU down to 1 so even though the
		 * processor was running at IPL 4, an IPL 1 interrupt will
		 * have interrupted it, which must not happen.)
		 *
		 * Effectively, this means that the hardware priority
		 * corresponding to the IRQ's IPL (in apic_ipls[]) cannot be
		 * lower than the vector's hardware priority.
		 *
		 * (In the above example, then, after removal of the IPL 4
		 * device's interrupt handler, the new IPL will continue to be
		 * 4 because the hardware priority that IPL 1 implies is lower
		 * than the hardware priority of the vector used.)
		 */
		/* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
		const int apic_ipls_index = irqptr->airq_vector -
		    APIC_BASE_VECT;
		const int vect_inherent_hwpri = irqptr->airq_vector >>
		    APIC_IPL_SHIFT;

		/*
		 * If there are still devices using this IRQ, determine the
		 * new ipl to use.
		 */
		if (irqptr->airq_share) {
			int vect_desired_hwpri, hwpri;

			ASSERT(max_ipl < MAXIPL);
			vect_desired_hwpri = apic_ipltopri[max_ipl] >>
			    APIC_IPL_SHIFT;

			/*
			 * If the desired IPL's hardware priority is lower
			 * than that of the vector, use the hardware priority
			 * of the vector to determine the new IPL.
			 */
			hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
			    vect_inherent_hwpri : vect_desired_hwpri;

			/*
			 * Now, to get the right index for apic_vectortoipl,
			 * we need to subtract APIC_BASE_VECT from the
			 * hardware-vector-equivalent (in hwpri). Since hwpri
			 * is already shifted, we shift APIC_BASE_VECT before
			 * doing the subtraction.
			 */
			hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);

			ASSERT(hwpri >= 0);
			ASSERT(hwpri < MAXIPL);
			max_ipl = apic_vectortoipl[hwpri];
			apic_ipls[apic_ipls_index] = max_ipl;

			irqp = irqheadptr;
			while (irqp) {
				irqp->airq_ipl = (uchar_t)max_ipl;
				irqp = irqp->airq_next;
			}
		} else {
			/*
			 * No more devices on this IRQ, so reset this vector's
			 * element in apic_ipls to the original IPL for this
			 * vector
			 */
			apic_ipls[apic_ipls_index] =
			    apic_vectortoipl[vect_inherent_hwpri];
		}
	}

	/*
	 * If there are still active interrupts, we are done.
	 */
	if (irqptr->airq_share)
		return (PSM_SUCCESS);

	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	if (irqptr->airq_mps_intr_index == MSI_INDEX) {
		/*
		 * Disable the MSI vector
		 * Make sure we only disable on the last
		 * of the multi-MSI support
		 */
		if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
			apic_pci_msi_disable_mode(irqptr->airq_dip,
			    DDI_INTR_TYPE_MSI);
		}
	} else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
		/*
		 * Disable the MSI-X vector
		 * needs to clear its mask and addr/data for each MSI-X
		 */
		apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
		    irqptr->airq_origirq);
		/*
		 * Make sure we only disable on the last MSI-X
		 */
		if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
			apic_pci_msi_disable_mode(irqptr->airq_dip,
			    DDI_INTR_TYPE_MSIX);
		}
	} else {
		/*
		 * The assumption here is that this is safe, even for
		 * systems with IOAPICs that suffer from the hardware
		 * erratum because all devices have been quiesced before
		 * they unregister their interrupt handlers.
		 * If that assumption turns out to be false, this mask
		 * operation can induce the same erratum result we're trying
		 * to avoid.
		 */
		ioapic_ix = irqptr->airq_ioapicindex;
		intin = irqptr->airq_intin_no;
		ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
	}

	/*
	 * This irq entry is the only one in the chain.
	 */
	if (irqheadptr->airq_next == NULL) {
		ASSERT(irqheadptr == irqptr);
		bind_cpu = irqptr->airq_temp_cpu;
		if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
		    ((uint32_t)bind_cpu != IRQ_UNINIT)) {
			ASSERT(apic_cpu_in_range(bind_cpu));
			if (bind_cpu & IRQ_USER_BOUND) {
				/* If hardbound, temp_cpu == cpu */
				bind_cpu &= ~IRQ_USER_BOUND;
				apic_cpus[bind_cpu].aci_bound--;
			} else
				apic_cpus[bind_cpu].aci_temp_bound--;
		}
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		irqptr->airq_mps_intr_index = FREE_INDEX;
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
		apic_free_vector(irqptr->airq_vector);
		return (PSM_SUCCESS);
	}

	/*
	 * If we get here, we are sharing the vector and there is more than
	 * one active irq entry in the chain.
	 */
	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	mutex_enter(&airq_mutex);
	/* Remove the irq entry from the chain */
	if (irqptr == irqheadptr) { /* The irq entry is at the head */
		apic_irq_table[irqindex] = irqptr->airq_next;
	} else {
		preirqptr->airq_next = irqptr->airq_next;
	}
	/* Free the irq entry */
	kmem_free(irqptr, sizeof (apic_irq_t));
	mutex_exit(&airq_mutex);

	return (PSM_SUCCESS);
}

/*
 * apic_introp_xlate() replaces apic_translate_irq() and is
 * called only from apic_intr_ops(). With the new ADII framework,
 * the priority can no longer be retrieved through i_ddi_get_intrspec().
 * It has to be passed in from the caller.
 *
 * Return value:
 *	Success: irqno for the given device
 *	Failure: -1
 */
int
apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
{
	char dev_type[16];
	int dev_len, pci_irq, newirq, bustype, devid, busid, i;
	int irqno = ispec->intrspec_vec;
	ddi_acc_handle_t cfg_handle;
	uchar_t ipin;
	struct apic_io_intr *intrp;
	iflag_t intr_flag;
	ACPI_SUBTABLE_HEADER *hp;
	ACPI_MADT_INTERRUPT_OVERRIDE *isop;
	apic_irq_t *airqp;
	int parent_is_pci_or_pciex = 0;
	int child_is_pciex = 0;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
	    "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
	    irqno));

	dev_len = sizeof (dev_type);
	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
	    DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
	    &dev_len) == DDI_PROP_SUCCESS) {
		if ((strcmp(dev_type, "pci") == 0) ||
		    (strcmp(dev_type, "pciex") == 0))
			parent_is_pci_or_pciex = 1;
	}

	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
	    &dev_len) == DDI_PROP_SUCCESS) {
		if (strstr(dev_type, "pciex"))
			child_is_pciex = 1;
	}

	if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
		if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
			airqp->airq_iflag.bustype = child_is_pciex ?
			    BUS_PCIE : BUS_PCI;
			return (apic_vector_to_irq[airqp->airq_vector]);
		}
		return (apic_setup_irq_table(dip, irqno, NULL, ispec,
		    NULL, type));
	}

	bustype = 0;

	/* check if we have already translated this irq */
	mutex_enter(&airq_mutex);
	newirq = apic_min_device_irq;
	for (; newirq <= apic_max_device_irq; newirq++) {
		airqp = apic_irq_table[newirq];
		while (airqp) {
			if ((airqp->airq_dip == dip) &&
			    (airqp->airq_origirq == irqno) &&
			    (airqp->airq_mps_intr_index != FREE_INDEX)) {

				mutex_exit(&airq_mutex);
				return (VIRTIRQ(newirq, airqp->airq_share_id));
			}
			airqp = airqp->airq_next;
		}
	}
	mutex_exit(&airq_mutex);

	if (apic_defconf)
		goto defconf;

	if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi))
		goto nonpci;

	if (parent_is_pci_or_pciex) {
		/* pci device */
		if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0)
			goto nonpci;
		if (busid == 0 && apic_pci_bus_total == 1)
			busid = (int)apic_single_pci_busid;

		if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS)
			return (-1);
		ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA;
		pci_config_teardown(&cfg_handle);
		if (apic_enable_acpi && !apic_use_acpi_madt_only) {
			if (apic_acpi_translate_pci_irq(dip, busid, devid,
			    ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS)
				return (-1);

			intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI;
			return (apic_setup_irq_table(dip, pci_irq, NULL, ispec,
			    &intr_flag, type));
		} else {
			pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3);
			if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid))
			    == NULL) {
				if ((pci_irq = apic_handle_pci_pci_bridge(dip,
				    devid, ipin, &intrp)) == -1)
					return (-1);
			}
			return (apic_setup_irq_table(dip, pci_irq, intrp, ispec,
			    NULL, type));
		}
	} else if (strcmp(dev_type, "isa") == 0)
		bustype = BUS_ISA;
	else if (strcmp(dev_type, "eisa") == 0)
		bustype = BUS_EISA;

nonpci:
	if (apic_enable_acpi && !apic_use_acpi_madt_only) {
		/* search iso entries first */
		if (acpi_iso_cnt != 0) {
			hp = (ACPI_SUBTABLE_HEADER *)acpi_isop;
			i = 0;
			while (i < acpi_iso_cnt) {
				if (hp->Type ==
				    ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) {
					isop =
					    (ACPI_MADT_INTERRUPT_OVERRIDE *) hp;
					if (isop->Bus == 0 &&
					    isop->SourceIrq == irqno) {
						newirq = isop->GlobalIrq;
						intr_flag.intr_po =
						    isop->IntiFlags &
						    ACPI_MADT_POLARITY_MASK;
						intr_flag.intr_el =
						    (isop->IntiFlags &
						    ACPI_MADT_TRIGGER_MASK)
						    >> 2;
						intr_flag.bustype = BUS_ISA;

						return (apic_setup_irq_table(
						    dip, newirq, NULL, ispec,
						    &intr_flag, type));

					}
					i++;
				}
				hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) +
				    hp->Length);
			}
		}
		intr_flag.intr_po = INTR_PO_ACTIVE_HIGH;
		intr_flag.intr_el = INTR_EL_EDGE;
		intr_flag.bustype = BUS_ISA;
		return (apic_setup_irq_table(dip, irqno, NULL, ispec,
		    &intr_flag, type));
	} else {
		if (bustype == 0)	/* not initialized */
			bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA;
		for (i = 0; i < 2; i++) {
			if (((busid = apic_find_bus_id(bustype)) != -1) &&
			    ((intrp = apic_find_io_intr_w_busid(irqno, busid))
			    != NULL)) {
				if ((newirq = apic_setup_irq_table(dip, irqno,
				    intrp, ispec, NULL, type)) != -1) {
					return (newirq);
				}
				goto defconf;
			}
			bustype = (bustype == BUS_EISA) ?
			    BUS_ISA : BUS_EISA;
		}
	}

	/* MPS default configuration */
defconf:
	newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type);
	if (newirq == -1)
		return (-1);
	ASSERT(IRQINDEX(newirq) == irqno);
	ASSERT(apic_irq_table[irqno]);
	return (newirq);
}

/*
 * Attempt to share vector with someone else
 */
static int
apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl,
    uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp)
{
#ifdef DEBUG
	apic_irq_t *tmpirqp = NULL;
#endif /* DEBUG */
	apic_irq_t *irqptr, dummyirq;
	int	newirq, chosen_irq = -1, share = 127;
	int	lowest, highest, i;
	uchar_t	share_id;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x "
	    "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl));

	highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK;
	lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL;

	if (highest < lowest) /* Both ipl and ipl-1 map to same pri */
		lowest -= APIC_VECTOR_PER_IPL;
	dummyirq.airq_mps_intr_index = intr_index;
	dummyirq.airq_ioapicindex = ioapicindex;
	dummyirq.airq_intin_no = ipin;
	if (intr_flagp)
		dummyirq.airq_iflag = *intr_flagp;
	apic_record_rdt_entry(&dummyirq, irqno);
	for (i = lowest; i <= highest; i++) {
		newirq = apic_vector_to_irq[i];
		if (newirq == APIC_RESV_IRQ)
			continue;
		irqptr = apic_irq_table[newirq];

		if ((dummyirq.airq_rdt_entry & 0xFF00) !=
		    (irqptr->airq_rdt_entry & 0xFF00))
			/* not compatible */
			continue;

		if (irqptr->airq_share < share) {
			share = irqptr->airq_share;
			chosen_irq = newirq;
		}
	}
	if (chosen_irq != -1) {
		/*
		 * Assign a share id which is free or which is larger
		 * than the largest one.
		 */
		share_id = 1;
		mutex_enter(&airq_mutex);
		irqptr = apic_irq_table[chosen_irq];
		while (irqptr) {
			if (irqptr->airq_mps_intr_index == FREE_INDEX) {
				share_id = irqptr->airq_share_id;
				break;
			}
			if (share_id <= irqptr->airq_share_id)
				share_id = irqptr->airq_share_id + 1;
#ifdef DEBUG
			tmpirqp = irqptr;
#endif /* DEBUG */
			irqptr = irqptr->airq_next;
		}
		if (!irqptr) {
			irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
			irqptr->airq_temp_cpu = IRQ_UNINIT;
			irqptr->airq_next =
			    apic_irq_table[chosen_irq]->airq_next;
			apic_irq_table[chosen_irq]->airq_next = irqptr;
#ifdef DEBUG
			tmpirqp = apic_irq_table[chosen_irq];
#endif /* DEBUG */
		}
		irqptr->airq_mps_intr_index = intr_index;
		irqptr->airq_ioapicindex = ioapicindex;
		irqptr->airq_intin_no = ipin;
		if (intr_flagp)
			irqptr->airq_iflag = *intr_flagp;
		irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector;
		irqptr->airq_share_id = share_id;
		apic_record_rdt_entry(irqptr, irqno);
		*irqptrp = irqptr;
#ifdef DEBUG
		/* shuffle the pointers to test apic_delspl path */
		if (tmpirqp) {
			tmpirqp->airq_next = irqptr->airq_next;
			irqptr->airq_next = apic_irq_table[chosen_irq];
			apic_irq_table[chosen_irq] = irqptr;
		}
#endif /* DEBUG */
		mutex_exit(&airq_mutex);
		return (VIRTIRQ(chosen_irq, share_id));
	}
	return (-1);
}

/*
 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry
 * is used already, we will try to allocate a new irqno.
 *
 * Return value:
 *	Success: irqno
 *	Failure: -1
 */
static int
apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
    struct intrspec *ispec, iflag_t *intr_flagp, int type)
{
	int origirq = ispec->intrspec_vec;
	uchar_t ipl = ispec->intrspec_pri;
	int	newirq, intr_index;
	uchar_t	ipin, ioapic, ioapicindex, vector;
	apic_irq_t *irqptr;
	major_t	major;
	dev_info_t	*sdip;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
	    "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));

	ASSERT(ispec != NULL);

	major = (dip != NULL) ? ddi_driver_major(dip) : 0;

	if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
		/* MSI/X doesn't need to set up any I/O APIC entries */
		ioapicindex = 0xff;
		ioapic = 0xff;
		ipin = (uchar_t)0xff;
		intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
		    MSIX_INDEX;
		mutex_enter(&airq_mutex);
		if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
			mutex_exit(&airq_mutex);
			/* need an irq for MSI/X to index into autovect[] */
			cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
			    ddi_get_name(dip), ddi_get_instance(dip));
			return (-1);
		}
		mutex_exit(&airq_mutex);

	} else if (intrp != NULL) {
		intr_index = (int)(intrp - apic_io_intrp);
		ioapic = intrp->intr_destid;
		ipin = intrp->intr_destintin;
		/* Find ioapicindex. If destid was ALL, we will exit with 0. */
		for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
			if (apic_io_id[ioapicindex] == ioapic)
				break;
		ASSERT((ioapic == apic_io_id[ioapicindex]) ||
		    (ioapic == INTR_ALL_APIC));

		/* check whether this intin# has been used by another irqno */
		if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
			return (newirq);
		}

	} else if (intr_flagp != NULL) {
		/* ACPI case */
		intr_index = ACPI_INDEX;
		ioapicindex = acpi_find_ioapic(irqno);
		ASSERT(ioapicindex != 0xFF);
		ioapic = apic_io_id[ioapicindex];
		ipin = irqno - apic_io_vectbase[ioapicindex];
		if (apic_irq_table[irqno] &&
		    apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
			ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
			    apic_irq_table[irqno]->airq_ioapicindex ==
			    ioapicindex);
			return (irqno);
		}

	} else {
		/* default configuration */
		ioapicindex = 0;
		ioapic = apic_io_id[ioapicindex];
		ipin = (uchar_t)irqno;
		intr_index = DEFAULT_INDEX;
	}

	if (ispec == NULL) {
		APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n",
		    irqno));
	} else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
		if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
		    ipl, ioapicindex, ipin, &irqptr)) != -1) {
			irqptr->airq_ipl = ipl;
			irqptr->airq_origirq = (uchar_t)origirq;
			irqptr->airq_dip = dip;
			irqptr->airq_major = major;
			sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
			/* This is OK to do really */
			if (sdip == NULL) {
				cmn_err(CE_WARN, "Sharing vectors: %s"
				    " instance %d and SCI",
				    ddi_get_name(dip), ddi_get_instance(dip));
			} else {
				cmn_err(CE_WARN, "Sharing vectors: %s"
				    " instance %d and %s instance %d",
				    ddi_get_name(sdip), ddi_get_instance(sdip),
				    ddi_get_name(dip), ddi_get_instance(dip));
			}
			return (newirq);
		}
		/*
		 * try high priority allocation now that
		 * share has failed
		 */
		if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
			cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
			    ddi_get_name(dip), ddi_get_instance(dip));
			return (-1);
		}
	}

	mutex_enter(&airq_mutex);
	if (apic_irq_table[irqno] == NULL) {
		irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		apic_irq_table[irqno] = irqptr;
	} else {
		irqptr = apic_irq_table[irqno];
		if (irqptr->airq_mps_intr_index != FREE_INDEX) {
			/*
			 * The slot is used by another irqno, so allocate
			 * a free irqno for this interrupt
			 */
			newirq = apic_allocate_irq(apic_first_avail_irq);
			if (newirq == -1) {
				mutex_exit(&airq_mutex);
				return (-1);
			}
			irqno = newirq;
			irqptr = apic_irq_table[irqno];
			if (irqptr == NULL) {
				irqptr = kmem_zalloc(sizeof (apic_irq_t),
				    KM_SLEEP);
				irqptr->airq_temp_cpu = IRQ_UNINIT;
				apic_irq_table[irqno] = irqptr;
			}
			vector = apic_modify_vector(vector, newirq);
		}
	}
	apic_max_device_irq = max(irqno, apic_max_device_irq);
	apic_min_device_irq = min(irqno, apic_min_device_irq);
	mutex_exit(&airq_mutex);
	irqptr->airq_ioapicindex = ioapicindex;
	irqptr->airq_intin_no = ipin;
	irqptr->airq_ipl = ipl;
	irqptr->airq_vector = vector;
	irqptr->airq_origirq = (uchar_t)origirq;
	irqptr->airq_share_id = 0;
	irqptr->airq_mps_intr_index = (short)intr_index;
	irqptr->airq_dip = dip;
	irqptr->airq_major = major;
	irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
	if (intr_flagp)
		irqptr->airq_iflag = *intr_flagp;

	if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
		/* setup I/O APIC entry for non-MSI/X interrupts */
		apic_record_rdt_entry(irqptr, irqno);
	}
	return (irqno);
}

/*
 * return the cpu to which this intr should be bound.
 * Check properties or any other mechanism to see if user wants it
 * bound to a specific CPU. If so, return the cpu id with high bit set.
 * If not, use the policy to choose a cpu and return the id.
 */
uint32_t
apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin)
{
	int	instance, instno, prop_len, bind_cpu, count;
	uint_t	i, rc;
	uint32_t cpu;
	major_t	major;
	char	*name, *drv_name, *prop_val, *cptr;
	char	prop_name[32];
	ulong_t	iflag;


	if (apic_intr_policy == INTR_LOWEST_PRIORITY)
		return (IRQ_UNBOUND);

	if (apic_nproc == 1)
		return (0);

	drv_name = NULL;
	rc = DDI_PROP_NOT_FOUND;
	major = (major_t)-1;
	if (dip != NULL) {
		name = ddi_get_name(dip);
		major = ddi_name_to_major(name);
		drv_name = ddi_major_to_name(major);
		instance = ddi_get_instance(dip);
		if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
			i = apic_min_device_irq;
			for (; i <= apic_max_device_irq; i++) {

				if ((i == irq) || (apic_irq_table[i] == NULL) ||
				    (apic_irq_table[i]->airq_mps_intr_index
				    == FREE_INDEX))
					continue;

				if ((apic_irq_table[i]->airq_major == major) &&
				    (!(apic_irq_table[i]->airq_cpu &
				    IRQ_USER_BOUND))) {

					cpu = apic_irq_table[i]->airq_cpu;

					cmn_err(CE_CONT,
					    "!%s: %s (%s) instance #%d "
					    "irq 0x%x vector 0x%x ioapic 0x%x "
					    "intin 0x%x is bound to cpu %d\n",
					    psm_name,
					    name, drv_name, instance, irq,
					    apic_irq_table[irq]->airq_vector,
					    ioapicid, intin, cpu);
					return (cpu);
				}
			}
		}
		/*
		 * search for "drvname"_intpt_bind_cpus property first, the
		 * syntax of the property should be "a[,b,c,...]" where
		 * instance 0 binds to cpu a, instance 1 binds to cpu b,
		 * instance 2 binds to cpu c...
		 * ddi_getlongprop() will search /option first, then /
		 * if "drvname"_intpt_bind_cpus doesn't exist, then find
		 * intpt_bind_cpus property. The syntax is the same, and
		 * it applies to all the devices if its "drvname" specific
		 * property doesn't exist
		 */
		(void) strcpy(prop_name, drv_name);
		(void) strcat(prop_name, "_intpt_bind_cpus");
		rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name,
		    (caddr_t)&prop_val, &prop_len);
		if (rc != DDI_PROP_SUCCESS) {
			rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0,
			    "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len);
		}
	}
	if (rc == DDI_PROP_SUCCESS) {
		for (i = count = 0; i < (prop_len - 1); i++)
			if (prop_val[i] == ',')
				count++;
		if (prop_val[i-1] != ',')
			count++;
		/*
		 * if somehow the binding instances defined in the
		 * property are not enough for this instno., then
		 * reuse the pattern for the next instance until
		 * it reaches the requested instno
		 */
		instno = instance % count;
		i = 0;
		cptr = prop_val;
		while (i < instno)
			if (*cptr++ == ',')
				i++;
		bind_cpu = stoi(&cptr);
		kmem_free(prop_val, prop_len);
		/* if specific CPU is bogus, then default to next cpu */
		if (!apic_cpu_in_range(bind_cpu)) {
			cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present",
			    psm_name, prop_name, prop_val, bind_cpu);
			rc = DDI_PROP_NOT_FOUND;
		} else {
			/* indicate that we are bound at user request */
			bind_cpu |= IRQ_USER_BOUND;
		}
		/*
		 * no need to check apic_cpus[].aci_status, if specific CPU is
		 * not up, then post_cpu_start will handle it.
1273 */ 1274 } 1275 if (rc != DDI_PROP_SUCCESS) { 1276 iflag = intr_clear(); 1277 lock_set(&apic_ioapic_lock); 1278 bind_cpu = apic_get_next_bind_cpu(); 1279 lock_clear(&apic_ioapic_lock); 1280 intr_restore(iflag); 1281 } 1282 1283 if (drv_name != NULL) 1284 cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x " 1285 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", 1286 psm_name, name, drv_name, instance, irq, 1287 apic_irq_table[irq]->airq_vector, ioapicid, intin, 1288 bind_cpu & ~IRQ_USER_BOUND); 1289 else 1290 cmn_err(CE_CONT, "!%s: irq 0x%x " 1291 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", 1292 psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid, 1293 intin, bind_cpu & ~IRQ_USER_BOUND); 1294 1295 return ((uint32_t)bind_cpu); 1296 } 1297 1298 /* 1299 * Mark vector as being in the process of being deleted. Interrupts 1300 * may still come in on some CPU. The moment an interrupt comes with 1301 * the new vector, we know we can free the old one. Called only from 1302 * addspl and delspl with interrupts disabled. Because an interrupt 1303 * can be shared, but no interrupt from either device may come in, 1304 * we also use a timeout mechanism, which we arbitrarily set to 1305 * apic_revector_timeout microseconds. 1306 */ 1307 static void 1308 apic_mark_vector(uchar_t oldvector, uchar_t newvector) 1309 { 1310 ulong_t iflag; 1311 1312 iflag = intr_clear(); 1313 lock_set(&apic_revector_lock); 1314 if (!apic_oldvec_to_newvec) { 1315 apic_oldvec_to_newvec = 1316 kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2, 1317 KM_NOSLEEP); 1318 1319 if (!apic_oldvec_to_newvec) { 1320 /* 1321 * This failure is not catastrophic. 1322 * But, the oldvec will never be freed. 1323 */ 1324 apic_error |= APIC_ERR_MARK_VECTOR_FAIL; 1325 lock_clear(&apic_revector_lock); 1326 intr_restore(iflag); 1327 return; 1328 } 1329 apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR]; 1330 } 1331 1332 /* See if we already did this for drivers which do double addintrs */ 1333 if (apic_oldvec_to_newvec[oldvector] != newvector) { 1334 apic_oldvec_to_newvec[oldvector] = newvector; 1335 apic_newvec_to_oldvec[newvector] = oldvector; 1336 apic_revector_pending++; 1337 } 1338 lock_clear(&apic_revector_lock); 1339 intr_restore(iflag); 1340 (void) timeout(apic_xlate_vector_free_timeout_handler, 1341 (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout)); 1342 } 1343 1344 /* 1345 * xlate_vector is called from intr_enter if revector_pending is set. 1346 * It will xlate it if needed and mark the old vector as free. 1347 */ 1348 uchar_t 1349 apic_xlate_vector(uchar_t vector) 1350 { 1351 uchar_t newvector, oldvector = 0; 1352 1353 lock_set(&apic_revector_lock); 1354 /* Do we really need to do this ? */ 1355 if (!apic_revector_pending) { 1356 lock_clear(&apic_revector_lock); 1357 return (vector); 1358 } 1359 if ((newvector = apic_oldvec_to_newvec[vector]) != 0) 1360 oldvector = vector; 1361 else { 1362 /* 1363 * The incoming vector is new . See if a stale entry is 1364 * remaining 1365 */ 1366 if ((oldvector = apic_newvec_to_oldvec[vector]) != 0) 1367 newvector = vector; 1368 } 1369 1370 if (oldvector) { 1371 apic_revector_pending--; 1372 apic_oldvec_to_newvec[oldvector] = 0; 1373 apic_newvec_to_oldvec[newvector] = 0; 1374 apic_free_vector(oldvector); 1375 lock_clear(&apic_revector_lock); 1376 /* There could have been more than one reprogramming! 
		 */
		return (apic_xlate_vector(newvector));
	}
	lock_clear(&apic_revector_lock);
	return (vector);
}

void
apic_xlate_vector_free_timeout_handler(void *arg)
{
	ulong_t iflag;
	uchar_t oldvector, newvector;

	oldvector = (uchar_t)(uintptr_t)arg;
	iflag = intr_clear();
	lock_set(&apic_revector_lock);
	if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
		apic_free_vector(oldvector);
		apic_oldvec_to_newvec[oldvector] = 0;
		apic_newvec_to_oldvec[newvector] = 0;
		apic_revector_pending--;
	}

	lock_clear(&apic_revector_lock);
	intr_restore(iflag);
}

/*
 * Bind interrupt corresponding to irq_ptr to bind_cpu.
 * Must be called with interrupts disabled and apic_ioapic_lock held
 */
int
apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
    struct ioapic_reprogram_data *drep)
{
	int	ioapicindex, intin_no;
	uint32_t airq_temp_cpu;
	apic_cpus_info_t *cpu_infop;
	uint32_t rdt_entry;
	int	which_irq;
	ioapic_rdt_t irdt;

	which_irq = apic_vector_to_irq[irq_ptr->airq_vector];

	intin_no = irq_ptr->airq_intin_no;
	ioapicindex = irq_ptr->airq_ioapicindex;
	airq_temp_cpu = irq_ptr->airq_temp_cpu;
	if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
		if (airq_temp_cpu & IRQ_USER_BOUND)
			/* Mask off high bit so it can be used as array index */
			airq_temp_cpu &= ~IRQ_USER_BOUND;

		ASSERT(apic_cpu_in_range(airq_temp_cpu));
	}

	/*
	 * Can't bind to a CPU that's not accepting interrupts:
	 */
	cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
	if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
		return (1);

	/*
	 * If we are about to change the interrupt vector for this interrupt,
	 * and this interrupt is level-triggered, attached to an IOAPIC,
	 * has been delivered to a CPU and that CPU has not handled it
	 * yet, we cannot reprogram the IOAPIC now.
	 */
	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {

		rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex,
		    intin_no);

		if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) &&
		    apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu,
		    bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) {

			return (0);
		}

		/*
		 * NOTE: We do not unmask the RDT here, as an interrupt MAY
		 * still come in before we have a chance to reprogram it below.
		 * The reprogramming below will simultaneously change and
		 * unmask the RDT entry.
		 */

		if ((uint32_t)bind_cpu == IRQ_UNBOUND) {
			irdt.ir_lo = AV_LDEST | AV_LOPRI |
			    irq_ptr->airq_rdt_entry;

			WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
			    AV_TOALL);

			if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu !=
			    IRQ_UNBOUND)
				apic_cpus[airq_temp_cpu].aci_temp_bound--;

			/*
			 * Write the vector, trigger, and polarity portion of
			 * the RDT
			 */
			WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
			    irdt.ir_lo);

			irq_ptr->airq_temp_cpu = IRQ_UNBOUND;
			return (0);
		}
	}

	if (bind_cpu & IRQ_USER_BOUND) {
		cpu_infop->aci_bound++;
	} else {
		cpu_infop->aci_temp_bound++;
	}
	ASSERT(apic_cpu_in_range(bind_cpu));

	if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) {
		apic_cpus[airq_temp_cpu].aci_temp_bound--;
	}
	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {

		irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry;
		irdt.ir_hi = cpu_infop->aci_local_id;

		/* Write the RDT entry -- bind to a specific CPU: */
		WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
		    irdt.ir_hi << APIC_ID_BIT_OFFSET);

		/* Write the vector, trigger, and polarity portion of the RDT */
		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
		    irdt.ir_lo);

	} else {
		int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ?
		    DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX;
		if (type == DDI_INTR_TYPE_MSI) {
			if (irq_ptr->airq_ioapicindex ==
			    irq_ptr->airq_origirq) {
				/* first one */
				DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
				    "apic_pci_msi_enable_vector\n"));
				apic_pci_msi_enable_vector(irq_ptr,
				    type, which_irq, irq_ptr->airq_vector,
				    irq_ptr->airq_intin_no,
				    cpu_infop->aci_local_id);
			}
			if ((irq_ptr->airq_ioapicindex +
			    irq_ptr->airq_intin_no - 1) ==
			    irq_ptr->airq_origirq) { /* last one */
				DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
				    "apic_pci_msi_enable_mode\n"));
				apic_pci_msi_enable_mode(irq_ptr->airq_dip,
				    type, which_irq);
			}
		} else { /* MSI-X */
			apic_pci_msi_enable_vector(irq_ptr, type,
			    irq_ptr->airq_origirq, irq_ptr->airq_vector, 1,
			    cpu_infop->aci_local_id);
			apic_pci_msi_enable_mode(irq_ptr->airq_dip, type,
			    irq_ptr->airq_origirq);
		}
	}
	irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu;
	apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND));
	return (0);
}

static void
apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no)
{
	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no)
	    & AV_REMOTE_IRR) != 0) {
		/*
		 * Trying to clear the bit through normal
		 * channels has failed. So as a last-ditch
		 * effort, try to set the trigger mode to
		 * edge, then to level. This has been
		 * observed to work on many systems.
		 */
		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no,
		    READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) & ~AV_LEVEL);

		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no,
		    READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) | AV_LEVEL);

		/*
		 * If the bit's STILL set, this interrupt may
		 * be hosed.
		 */
		if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) & AV_REMOTE_IRR) != 0) {

			prom_printf("%s: Remote IRR still "
			    "not clear for IOAPIC %d intin %d.\n"
			    "\tInterrupts to this pin may cease "
			    "functioning.\n", psm_name, ioapic_ix,
			    intin_no);
#ifdef DEBUG
			apic_last_ditch_reprogram_failures++;
#endif
		}
	}
}

/*
 * This function is protected by apic_ioapic_lock coupled with the
 * fact that interrupts are disabled.
 */
static void
delete_defer_repro_ent(int which_irq)
{
	ASSERT(which_irq >= 0);
	ASSERT(which_irq <= 255);
	ASSERT(LOCK_HELD(&apic_ioapic_lock));

	if (apic_reprogram_info[which_irq].done)
		return;

	apic_reprogram_info[which_irq].done = B_TRUE;

#ifdef DEBUG
	apic_defer_repro_total_retries +=
	    apic_reprogram_info[which_irq].tries;

	apic_defer_repro_successes++;
#endif

	if (--apic_reprogram_outstanding == 0) {

		setlvlx = psm_intr_exit_fn();
	}
}


/*
 * Interrupts must be disabled during this function to prevent
 * self-deadlock. Interrupts are disabled because this function
 * is called from apic_check_stuck_interrupt(), which is called
 * from apic_rebind(), which requires its caller to disable interrupts.
 */
static void
add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu)
{
	ASSERT(which_irq >= 0);
	ASSERT(which_irq <= 255);
	ASSERT(!interrupts_enabled());

	/*
	 * On the off-chance that there's already a deferred
	 * reprogramming on this irq, check, and if so, just update the
	 * CPU and irq pointer to which the interrupt is targeted, then return.
	 */
	if (!apic_reprogram_info[which_irq].done) {
		apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
		apic_reprogram_info[which_irq].irqp = irq_ptr;
		return;
	}

	apic_reprogram_info[which_irq].irqp = irq_ptr;
	apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
	apic_reprogram_info[which_irq].tries = 0;
	/*
	 * This must be the last thing set; since we're not
	 * grabbing any locks, apic_try_deferred_reprogram() will
	 * make its decision about using this entry iff done
	 * is false.
	 */
	apic_reprogram_info[which_irq].done = B_FALSE;

	/*
	 * If there were previously no deferred reprogrammings, change
	 * setlvlx to call apic_try_deferred_reprogram()
	 */
	if (++apic_reprogram_outstanding == 1) {

		setlvlx = apic_try_deferred_reprogram;
	}
}

static void
apic_try_deferred_reprogram(int prev_ipl, int irq)
{
	int reproirq;
	ulong_t iflag;
	struct ioapic_reprogram_data *drep;

	(*psm_intr_exit_fn())(prev_ipl, irq);

	if (!lock_try(&apic_defer_reprogram_lock)) {
		return;
	}

	/*
	 * Acquire the apic_ioapic_lock so that any other operations that
	 * may affect the apic_reprogram_info state are serialized.
	 * It's still possible for the last deferred reprogramming to clear
	 * between the time we entered this function and the time we get to
	 * the for loop below. In that case, *setlvlx will have been set
	 * back to *_intr_exit and drep will be NULL. (There's no way to
	 * stop that from happening -- we would need to grab a lock before
	 * calling *setlvlx, which is neither realistic nor prudent).
1682 */ 1683 iflag = intr_clear(); 1684 lock_set(&apic_ioapic_lock); 1685 1686 /* 1687 * For each deferred RDT entry, try to reprogram it now. Note that 1688 * there is no lock acquisition to read apic_reprogram_info because 1689 * '.done' is set only after the other fields in the structure are set. 1690 */ 1691 1692 drep = NULL; 1693 for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) { 1694 if (apic_reprogram_info[reproirq].done == B_FALSE) { 1695 drep = &apic_reprogram_info[reproirq]; 1696 break; 1697 } 1698 } 1699 1700 /* 1701 * Either we found a deferred action to perform, or 1702 * we entered this function spuriously, after *setlvlx 1703 * was restored to point to *_intr_exit. Any other 1704 * permutation is invalid. 1705 */ 1706 ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn()); 1707 1708 /* 1709 * Though we can't really do anything about errors 1710 * at this point, keep track of them for reporting. 1711 * Note that it is very possible for apic_setup_io_intr 1712 * to re-register this very timeout if the Remote IRR bit 1713 * has not yet cleared. 1714 */ 1715 1716 #ifdef DEBUG 1717 if (drep != NULL) { 1718 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) { 1719 apic_deferred_setup_failures++; 1720 } 1721 } else { 1722 apic_deferred_spurious_enters++; 1723 } 1724 #else 1725 if (drep != NULL) 1726 (void) apic_setup_io_intr(drep, reproirq, B_TRUE); 1727 #endif 1728 1729 lock_clear(&apic_ioapic_lock); 1730 intr_restore(iflag); 1731 1732 lock_clear(&apic_defer_reprogram_lock); 1733 } 1734 1735 static void 1736 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no) 1737 { 1738 int waited; 1739 1740 /* 1741 * Wait for the delivery pending bit to clear. 1742 */ 1743 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & 1744 (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) { 1745 1746 /* 1747 * If we're still waiting on the delivery of this interrupt, 1748 * continue to wait here until it is delivered (this should be 1749 * a very small amount of time, but include a timeout just in 1750 * case). 1751 */ 1752 for (waited = 0; waited < apic_max_reps_clear_pending; 1753 waited++) { 1754 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1755 intin_no) & AV_PENDING) == 0) { 1756 break; 1757 } 1758 } 1759 } 1760 } 1761 1762 1763 /* 1764 * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR 1765 * bit set. Calls functions that modify the function that setlvlx points to, 1766 * so that the reprogramming can be retried very shortly. 1767 * 1768 * This function will mask the RDT entry if the interrupt is level-triggered. 1769 * (The caller is responsible for unmasking the RDT entry.) 1770 * 1771 * Returns non-zero if the caller should defer IOAPIC reprogramming. 1772 */ 1773 static int 1774 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, 1775 int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq, 1776 struct ioapic_reprogram_data *drep) 1777 { 1778 int32_t rdt_entry; 1779 int waited; 1780 int reps = 0; 1781 1782 /* 1783 * Wait for the delivery pending bit to clear. 
1784 */ 1785 do { 1786 ++reps; 1787 1788 apic_ioapic_wait_pending_clear(ioapic_ix, intin_no); 1789 1790 /* 1791 * Mask the RDT entry, but only if it's a level-triggered 1792 * interrupt 1793 */ 1794 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1795 intin_no); 1796 if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { 1797 1798 /* Mask it */ 1799 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no, 1800 AV_MASK | rdt_entry); 1801 } 1802 1803 if ((rdt_entry & AV_LEVEL) == AV_LEVEL) { 1804 /* 1805 * If there was a race and an interrupt was injected 1806 * just before we masked, check for that case here. 1807 * Then, unmask the RDT entry and try again. If we're 1808 * on our last try, don't unmask (because we want the 1809 * RDT entry to remain masked for the rest of the 1810 * function). 1811 */ 1812 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1813 intin_no); 1814 if ((rdt_entry & AV_PENDING) && 1815 (reps < apic_max_reps_clear_pending)) { 1816 /* Unmask it */ 1817 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1818 intin_no, rdt_entry & ~AV_MASK); 1819 } 1820 } 1821 1822 } while ((rdt_entry & AV_PENDING) && 1823 (reps < apic_max_reps_clear_pending)); 1824 1825 #ifdef DEBUG 1826 if (rdt_entry & AV_PENDING) 1827 apic_intr_deliver_timeouts++; 1828 #endif 1829 1830 /* 1831 * If the remote IRR bit is set, then the interrupt has been sent 1832 * to a CPU for processing. We have no choice but to wait for 1833 * that CPU to process the interrupt, at which point the remote IRR 1834 * bit will be cleared. 1835 */ 1836 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & 1837 (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) { 1838 1839 /* 1840 * If the CPU that this RDT is bound to is NOT the current 1841 * CPU, wait until that CPU handles the interrupt and ACKs 1842 * it. If this interrupt is not bound to any CPU (that is, 1843 * if it's bound to the logical destination of "anyone"), it 1844 * may have been delivered to the current CPU so handle that 1845 * case by deferring the reprogramming (below). 1846 */ 1847 if ((old_bind_cpu != IRQ_UNBOUND) && 1848 (old_bind_cpu != IRQ_UNINIT) && 1849 (old_bind_cpu != psm_get_cpu_id())) { 1850 for (waited = 0; waited < apic_max_reps_clear_pending; 1851 waited++) { 1852 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1853 intin_no) & AV_REMOTE_IRR) == 0) { 1854 1855 delete_defer_repro_ent(which_irq); 1856 1857 /* Remote IRR has cleared! */ 1858 return (0); 1859 } 1860 } 1861 } 1862 1863 /* 1864 * If we waited and the Remote IRR bit is still not cleared, 1865 * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS 1866 * times for this interrupt, try the last-ditch workaround: 1867 */ 1868 if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) { 1869 1870 apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no); 1871 1872 /* Mark this one as reprogrammed: */ 1873 delete_defer_repro_ent(which_irq); 1874 1875 return (0); 1876 } else { 1877 #ifdef DEBUG 1878 apic_intr_deferrals++; 1879 #endif 1880 1881 /* 1882 * If waiting for the Remote IRR bit (above) didn't 1883 * allow it to clear, defer the reprogramming. 1884 * Add a new deferred-programming entry if the 1885 * caller passed a NULL one (and update the existing one 1886 * in case anything changed). 
			add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
			if (drep)
				drep->tries++;

			/* Inform caller to defer IOAPIC programming: */
			return (1);
		}

	}

	/* Remote IRR is clear */
	delete_defer_repro_ent(which_irq);

	return (0);
}

/*
 * Called to migrate all interrupts at an irq to another cpu.
 * Must be called with interrupts disabled and apic_ioapic_lock held.
 */
int
apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
{
	apic_irq_t	*irqptr = irq_ptr;
	int		retval = 0;

	while (irqptr) {
		if (irqptr->airq_temp_cpu != IRQ_UNINIT)
			retval |= apic_rebind(irqptr, bind_cpu, NULL);
		irqptr = irqptr->airq_next;
	}

	return (retval);
}

/*
 * apic_intr_redistribute does all the messy computations for identifying
 * which interrupt to move to which CPU. Currently we do just one interrupt
 * at a time. This reduces the time we spend doing all this within the clock
 * interrupt. When it is done in idle, we could do more than one.
 * First we find the most busy and the most free CPU (time in ISR only),
 * skipping those CPUs that have been identified as ineligible (cpu_skip).
 * Then we look for IRQs which are closest to the difference between the
 * most busy CPU and the average ISR load. We try to find one whose load
 * is less than the difference. If none exists, then we choose one larger
 * than the difference, provided it does not make the most idle CPU worse
 * than the most busy one. In the end, we clear all the busy fields for
 * CPUs. For IRQs, they are cleared as they are scanned.
 */
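/*
 * For illustration (hypothetical numbers, not taken from measurement): if
 * max_busy = 60, min_free = 10 and average_busy = 30, then diff = 30.
 * An IRQ on the busiest CPU with airq_busy = 25 (< diff) is a best-fit
 * candidate; if nothing is below diff, an IRQ with airq_busy = 35 still
 * qualifies because 35 < diff + average_busy - min_free = 50, i.e. the
 * most idle CPU would rise to 45, which is still below the busiest
 * CPU's 60.
 */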
void
apic_intr_redistribute(void)
{
	int busiest_cpu, most_free_cpu;
	int cpu_free, cpu_busy, max_busy, min_busy;
	int min_free, diff;
	int average_busy, cpus_online;
	int i, busy;
	ulong_t iflag;
	apic_cpus_info_t *cpu_infop;
	apic_irq_t *min_busy_irq = NULL;
	apic_irq_t *max_busy_irq = NULL;

	busiest_cpu = most_free_cpu = -1;
	cpu_free = cpu_busy = max_busy = average_busy = 0;
	min_free = apic_sample_factor_redistribution;
	cpus_online = 0;
	/*
	 * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
	 * without ioapic_lock. That is OK as we are just doing statistical
	 * sampling anyway and any inaccuracy now will get corrected next time.
	 * The call to rebind which actually changes things will make sure
	 * we are consistent.
	 */
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i) &&
		    !(apic_redist_cpu_skip & (1 << i)) &&
		    (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {

			cpu_infop = &apic_cpus[i];
			/*
			 * If no unbound interrupts or only 1 total on this
			 * CPU, skip
			 */
			if (!cpu_infop->aci_temp_bound ||
			    (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
			    == 1) {
				apic_redist_cpu_skip |= 1 << i;
				continue;
			}

			busy = cpu_infop->aci_busy;
			average_busy += busy;
			cpus_online++;
			if (max_busy < busy) {
				max_busy = busy;
				busiest_cpu = i;
			}
			if (min_free > busy) {
				min_free = busy;
				most_free_cpu = i;
			}
			if (busy > apic_int_busy_mark) {
				cpu_busy |= 1 << i;
			} else {
				if (busy < apic_int_free_mark)
					cpu_free |= 1 << i;
			}
		}
	}
	if ((cpu_busy && cpu_free) ||
	    (max_busy >= (min_free + apic_diff_for_redistribution))) {

		apic_num_imbalance++;
#ifdef DEBUG
		if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
			prom_printf(
			    "redistribute busy=%x free=%x max=%x min=%x",
			    cpu_busy, cpu_free, max_busy, min_free);
		}
#endif /* DEBUG */


		average_busy /= cpus_online;

		diff = max_busy - average_busy;
		min_busy = max_busy; /* start with the max possible value */
		max_busy = 0;
		min_busy_irq = max_busy_irq = NULL;
		i = apic_min_device_irq;
		for (; i <= apic_max_device_irq; i++) {
			apic_irq_t *irq_ptr;
			/* Change to linked list per CPU ? */
			if ((irq_ptr = apic_irq_table[i]) == NULL)
				continue;
			/* Check for irq_busy & decide which one to move */
			/* Also zero them for next round */
			if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
			    irq_ptr->airq_busy) {
				if (irq_ptr->airq_busy < diff) {
					/*
					 * Check for least busy CPU,
					 * best fit or what?
					 */
					if (max_busy < irq_ptr->airq_busy) {
						/*
						 * Most busy within the
						 * required differential
						 */
						max_busy = irq_ptr->airq_busy;
						max_busy_irq = irq_ptr;
					}
				} else {
					if (min_busy > irq_ptr->airq_busy) {
						/*
						 * least busy, but more than
						 * the reqd diff
						 */
						if (min_busy <
						    (diff + average_busy -
						    min_free)) {
							/*
							 * Making sure new cpu
							 * will not end up
							 * worse
							 */
							min_busy =
							    irq_ptr->airq_busy;

							min_busy_irq = irq_ptr;
						}
					}
				}
			}
			irq_ptr->airq_busy = 0;
		}

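		/*
		 * Rebind the chosen IRQ to the most idle CPU: prefer the
		 * best fit found below the difference, else fall back to
		 * the least busy IRQ above it. The new binding is made
		 * permanent only if apic_rebind_all() succeeds.
		 */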
		if (max_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    max_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */
			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(max_busy_irq,
				    most_free_cpu) == 0) {
					/* Make change permanent */
					max_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else if (min_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    min_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */

			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(min_busy_irq,
				    most_free_cpu) == 0) {
					/* Make change permanent */
					min_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else {
			if (cpu_busy != (1 << busiest_cpu)) {
				apic_redist_cpu_skip |= 1 << busiest_cpu;
				/*
				 * We leave cpu_skip set so that next time we
				 * can choose another cpu
				 */
			}
		}
		apic_num_rebind++;
	} else {
		/*
		 * Found nothing. Could be that we skipped over valid CPUs
		 * or we have balanced everything. If we had a variable
		 * ticks_for_redistribution, it could be increased here.
		 * apic_int_busy, int_free etc. would also need to be
		 * changed.
		 */
		if (apic_redist_cpu_skip)
			apic_redist_cpu_skip = 0;
	}
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}
}

void
apic_cleanup_busy(void)
{
	int i;
	apic_irq_t *irq_ptr;

	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}

	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
		if ((irq_ptr = apic_irq_table[i]) != NULL)
			irq_ptr->airq_busy = 0;
	}
}

int
apic_ioapic_method_probe()
{
	return (PSM_SUCCESS);
}