1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright (c) 2009-2010, Intel Corporation.
  27  * All rights reserved.
  28  */
  29 
  30 #include <sys/x86_archext.h>
  31 #include <sys/machsystm.h>
  32 #include <sys/x_call.h>
  33 #include <sys/stat.h>
  34 #include <acpica/include/acpi.h>
  35 #include <sys/acpica.h>
  36 #include <sys/cpu_acpi.h>
  37 #include <sys/cpu_idle.h>
  38 #include <sys/cpupm.h>
  39 #include <sys/cpu_event.h>
  40 #include <sys/hpet.h>
  41 #include <sys/archsystm.h>
  42 #include <vm/hat_i86.h>
  43 #include <sys/dtrace.h>
  44 #include <sys/sdt.h>
  45 #include <sys/callb.h>
  46 
  47 #define CSTATE_USING_HPET               1
  48 #define CSTATE_USING_LAT                2
  49 
  50 #define CPU_IDLE_STOP_TIMEOUT           1000
  51 
  52 extern void cpu_idle_adaptive(void);
  53 extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
  54     cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);
  55 
  56 static int cpu_idle_init(cpu_t *);
  57 static void cpu_idle_fini(cpu_t *);
  58 static void cpu_idle_stop(cpu_t *);
  59 static boolean_t cpu_deep_idle_callb(void *arg, int code);
  60 static boolean_t cpu_idle_cpr_callb(void *arg, int code);
  61 static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
  62 
  63 static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);
  64 
/*
 * Flags indicating whether the local APIC timer is always running (ARAT)
 * and whether the HPET can proxy for it during deep C-states.
 */
  69 static boolean_t cpu_cstate_arat = B_FALSE;
  70 static boolean_t cpu_cstate_hpet = B_FALSE;
  71 
  72 /*
  73  * Interfaces for modules implementing Intel's deep c-state.
  74  */
  75 cpupm_state_ops_t cpu_idle_ops = {
  76         "Generic ACPI C-state Support",
  77         cpu_idle_init,
  78         cpu_idle_fini,
  79         NULL,
  80         cpu_idle_stop
  81 };
  82 
  83 static kmutex_t         cpu_idle_callb_mutex;
  84 static callb_id_t       cpu_deep_idle_callb_id;
  85 static callb_id_t       cpu_idle_cpr_callb_id;
  86 static uint_t           cpu_idle_cfg_state;
  87 
  88 static kmutex_t cpu_idle_mutex;
  89 
  90 cpu_idle_kstat_t cpu_idle_kstat = {
  91         { "address_space_id",   KSTAT_DATA_STRING },
  92         { "latency",            KSTAT_DATA_UINT32 },
  93         { "power",              KSTAT_DATA_UINT32 },
  94 };
  95 
/*
 * kstat update function for the C-state info
 */
  99 static int
 100 cpu_idle_kstat_update(kstat_t *ksp, int flag)
 101 {
 102         cpu_acpi_cstate_t *cstate = ksp->ks_private;
 103 
 104         if (flag == KSTAT_WRITE) {
 105                 return (EACCES);
 106         }
 107 
 108         if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
 109                 kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
 110                 "FFixedHW");
 111         } else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
 112                 kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
 113                 "SystemIO");
 114         } else {
 115                 kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
 116                 "Unsupported");
 117         }
 118 
 119         cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
 120         cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
 121 
 122         return (0);
 123 }
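
/*
 * For reference, a minimal userland sketch of reading one of the per-C-state
 * kstats published by this module via libkstat. The module is "cstate", the
 * instance is the CPU id and the name is "c<type>" (see cpu_idle_init()
 * below); the instance 0 and name "c2" used here are only an example.
 *
 *    #include <kstat.h>
 *    #include <stdio.h>
 *
 *    kstat_ctl_t *kc = kstat_open();
 *    kstat_t *ksp = kstat_lookup(kc, "cstate", 0, "c2");
 *    if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *            kstat_named_t *kn = kstat_data_lookup(ksp, "latency");
 *            if (kn != NULL)
 *                    (void) printf("C2 latency: %u us\n", kn->value.ui32);
 *    }
 *    (void) kstat_close(kc);
 */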
 124 
/*
 * Used during configuration callbacks to manage implementation-specific
 * details of the hardware timer used during deep C-states.
 */
 129 boolean_t
 130 cstate_timer_callback(int code)
 131 {
 132         if (cpu_cstate_arat) {
 133                 return (B_TRUE);
 134         } else if (cpu_cstate_hpet) {
 135                 return (hpet.callback(code));
 136         }
 137         return (B_FALSE);
 138 }
 139 
 140 /*
 141  * Some Local APIC Timers do not work during Deep C-states.
 142  * The Deep C-state idle function uses this function to ensure it is using a
 143  * hardware timer that works during Deep C-states.  This function also
 * switches the timer back to the LAPIC Timer after Deep C-state.
 145  */
 146 static boolean_t
 147 cstate_use_timer(hrtime_t *lapic_expire, int timer)
 148 {
 149         if (cpu_cstate_arat)
 150                 return (B_TRUE);
 151 
 152         /*
         * Return B_FALSE if there is neither ARAT nor HPET support
 154          */
 155         if (!cpu_cstate_hpet)
 156                 return (B_FALSE);
 157 
 158         switch (timer) {
 159         case CSTATE_USING_HPET:
 160                 return (hpet.use_hpet_timer(lapic_expire));
 161         case CSTATE_USING_LAT:
 162                 hpet.use_lapic_timer(*lapic_expire);
 163                 return (B_TRUE);
 164         default:
 165                 return (B_FALSE);
 166         }
 167 }
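
/*
 * For clarity, a condensed sketch of the call pattern acpi_cpu_cstate()
 * below uses around this helper (illustrative; the haltset bookkeeping
 * and run-queue checks are omitted):
 *
 *    cli();
 *    using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);
 *    ... enter the C-state, or fall back to C1 if using_timer is B_FALSE ...
 *    (void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
 *    sti();
 */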
 168 
 169 /*
 170  * c-state wakeup function.
 171  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
 172  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
 173  */
 174 void
 175 cstate_wakeup(cpu_t *cp, int bound)
 176 {
 177         struct machcpu  *mcpu = &(cp->cpu_m);
 178         volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
 179         cpupart_t       *cpu_part;
 180         uint_t          cpu_found;
 181         processorid_t   cpu_sid;
 182 
 183         cpu_part = cp->cpu_part;
 184         cpu_sid = cp->cpu_seqid;
        /*
         * If the CPU is in the halted set, clear its halted bit and wake it.
         */
 189         if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
 190                 /*
 191                  * Clear the halted bit for that CPU since it will be
 192                  * poked in a moment.
 193                  */
 194                 bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
 195 
 196                 /*
 197                  * We may find the current CPU present in the halted cpuset
 198                  * if we're in the context of an interrupt that occurred
 199                  * before we had a chance to clear our bit in cpu_idle().
                 * Waking ourselves is obviously unnecessary, since if
                 * we're here, we're not halted.
 202                  */
 203                 if (cp != CPU) {
 204                         /*
 205                          * Use correct wakeup mechanism
 206                          */
 207                         if ((mcpu_mwait != NULL) &&
 208                             (*mcpu_mwait == MWAIT_HALTED))
 209                                 MWAIT_WAKEUP(cp);
 210                         else
 211                                 poke_cpu(cp->cpu_id);
 212                 }
 213                 return;
 214         } else {
 215                 /*
 216                  * This cpu isn't halted, but it's idle or undergoing a
 217                  * context switch. No need to awaken anyone else.
 218                  */
 219                 if (cp->cpu_thread == cp->cpu_idle_thread ||
 220                     cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
 221                         return;
 222         }
 223 
 224         /*
 225          * No need to wake up other CPUs if the thread we just enqueued
 226          * is bound.
 227          */
 228         if (bound)
 229                 return;
 230 
 231 
 232         /*
         * See if there are any other halted CPUs. If there are,
         * select one and awaken it.
 235          * It's possible that after we find a CPU, somebody else
 236          * will awaken it before we get the chance.
 237          * In that case, look again.
 238          */
 239         do {
 240                 cpu_found = bitset_find(&cpu_part->cp_haltset);
 241                 if (cpu_found == (uint_t)-1)
 242                         return;
 243 
 244         } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
 245             cpu_found) < 0);
 246 
 247         /*
 248          * Must use correct wakeup mechanism to avoid lost wakeup of
 249          * alternate cpu.
 250          */
 251         if (cpu_found != CPU->cpu_seqid) {
 252                 mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
 253                 if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
 254                         MWAIT_WAKEUP(cpu_seq[cpu_found]);
 255                 else
 256                         poke_cpu(cpu_seq[cpu_found]->cpu_id);
 257         }
 258 }
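
/*
 * A sketch of the wakeup handshake, condensed from cstate_wakeup() above
 * and acpi_cpu_cstate() below:
 *
 *    sleeper: *mcpu_mwait = MWAIT_HALTED (or MWAIT_WAKEUP_IPI);
 *    sleeper: bitset_atomic_add(&cp->cp_haltset, cpu_sid);
 *    sleeper: cli(); recheck the haltset bit and the run queues;
 *    sleeper: i86_monitor(mcpu_mwait, 0, 0); recheck; i86_mwait(...);
 *
 *    waker:   bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
 *    waker:   MWAIT_WAKEUP(cp) or poke_cpu(cp->cpu_id);
 *
 * The waker clears the haltset bit before writing mcpu_mwait (or poking),
 * and the sleeper arms the monitor before its final recheck, so neither
 * side can miss the other.
 */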
 259 
 260 /*
 * Function called by the CPU idle notification framework to check whether the
 * CPU has been awakened. It is called with interrupts disabled.
 * If the CPU has been awakened, call cpu_idle_exit() to notify the CPU idle
 * notification framework.
 265  */
 266 static void
 267 acpi_cpu_mwait_check_wakeup(void *arg)
 268 {
 269         volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
 270 
 271         ASSERT(arg != NULL);
 272         if (*mcpu_mwait != MWAIT_HALTED) {
 273                 /*
 274                  * CPU has been awakened, notify CPU idle notification system.
 275                  */
 276                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 277         } else {
 278                 /*
 279                  * Toggle interrupt flag to detect pending interrupts.
 280                  * If interrupt happened, do_interrupt() will notify CPU idle
 281                  * notification framework so no need to call cpu_idle_exit()
 282                  * here.
 283                  */
 284                 sti();
 285                 SMT_PAUSE();
 286                 cli();
 287         }
 288 }
 289 
 290 static void
 291 acpi_cpu_mwait_ipi_check_wakeup(void *arg)
 292 {
 293         volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
 294 
 295         ASSERT(arg != NULL);
 296         if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
 297                 /*
 298                  * CPU has been awakened, notify CPU idle notification system.
 299                  */
 300                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 301         } else {
 302                 /*
 303                  * Toggle interrupt flag to detect pending interrupts.
 304                  * If interrupt happened, do_interrupt() will notify CPU idle
 305                  * notification framework so no need to call cpu_idle_exit()
 306                  * here.
 307                  */
 308                 sti();
 309                 SMT_PAUSE();
 310                 cli();
 311         }
 312 }
 313 
 314 /*ARGSUSED*/
 315 static void
 316 acpi_cpu_check_wakeup(void *arg)
 317 {
 318         /*
 319          * Toggle interrupt flag to detect pending interrupts.
 320          * If interrupt happened, do_interrupt() will notify CPU idle
 321          * notification framework so no need to call cpu_idle_exit() here.
 322          */
 323         sti();
 324         SMT_PAUSE();
 325         cli();
 326 }
 327 
/*
 * Handler that enters an ACPI deep C-state on the current CPU
 */
 331 static void
 332 acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
 333 {
 334         volatile uint32_t       *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
 335         cpu_t                   *cpup = CPU;
 336         processorid_t           cpu_sid = cpup->cpu_seqid;
 337         cpupart_t               *cp = cpup->cpu_part;
 338         hrtime_t                lapic_expire;
 339         uint8_t                 type = cstate->cs_addrspace_id;
 340         uint32_t                cs_type = cstate->cs_type;
 341         int                     hset_update = 1;
 342         boolean_t               using_timer;
 343         cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;
 344 
 345         /*
 346          * Set our mcpu_mwait here, so we can tell if anyone tries to
 347          * wake us between now and when we call mwait.  No other cpu will
         * attempt to set our mcpu_mwait until we add ourselves to the haltset.
 349          */
 350         if (mcpu_mwait) {
 351                 if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
 352                         *mcpu_mwait = MWAIT_WAKEUP_IPI;
 353                         check_func = &acpi_cpu_mwait_ipi_check_wakeup;
 354                 } else {
 355                         *mcpu_mwait = MWAIT_HALTED;
 356                         check_func = &acpi_cpu_mwait_check_wakeup;
 357                 }
 358         }
 359 
 360         /*
 361          * If this CPU is online, and there are multiple CPUs
 362          * in the system, then we should note our halting
 363          * by adding ourselves to the partition's halted CPU
 364          * bitmap. This allows other CPUs to find/awaken us when
 365          * work becomes available.
 366          */
 367         if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
 368                 hset_update = 0;
 369 
 370         /*
 371          * Add ourselves to the partition's halted CPUs bitmask
 372          * and set our HALTED flag, if necessary.
 373          *
 374          * When a thread becomes runnable, it is placed on the queue
 375          * and then the halted cpuset is checked to determine who
 376          * (if anyone) should be awakened. We therefore need to first
         * add ourselves to the halted cpuset, and then check if there
 378          * is any work available.
 379          *
 380          * Note that memory barriers after updating the HALTED flag
 381          * are not necessary since an atomic operation (updating the bitmap)
 382          * immediately follows. On x86 the atomic operation acts as a
 383          * memory barrier for the update of cpu_disp_flags.
 384          */
 385         if (hset_update) {
 386                 cpup->cpu_disp_flags |= CPU_DISP_HALTED;
 387                 bitset_atomic_add(&cp->cp_haltset, cpu_sid);
 388         }
 389 
 390         /*
 391          * Check to make sure there's really nothing to do.
 392          * Work destined for this CPU may become available after
 393          * this check. We'll be notified through the clearing of our
 394          * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
 395          *
 396          * disp_anywork() checks disp_nrunnable, so we do not have to later.
 397          */
 398         if (disp_anywork()) {
 399                 if (hset_update) {
 400                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 401                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 402                 }
 403                 return;
 404         }
 405 
 406         /*
 407          * We're on our way to being halted.
 408          *
 409          * The local APIC timer can stop in ACPI C2 and deeper c-states.
 410          * Try to program the HPET hardware to substitute for this CPU's
 411          * LAPIC timer.
 412          * cstate_use_timer() could disable the LAPIC Timer.  Make sure
 413          * to start the LAPIC Timer again before leaving this function.
 414          *
 415          * Disable interrupts here so we will awaken immediately after halting
 416          * if someone tries to poke us between now and the time we actually
 417          * halt.
 418          */
 419         cli();
 420         using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);
 421 
        /*
         * We check for the presence of our bit after disabling interrupts.
         * If it's cleared, we'll return. If the bit is cleared after
         * we check, then cstate_wakeup() will pop us out of the halted
         * state.
         *
         * This means that the ordering of the clearing of our cp_haltset
         * bit and the wakeup in cstate_wakeup() is important.
         * cstate_wakeup() must clear our cp_haltset bit, and then wake us
         * via mcpu_mwait or an IPI.
         * acpi_cpu_cstate() must disable interrupts, then check for the bit.
         */
 434         if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
 435                 (void) cstate_use_timer(&lapic_expire,
 436                     CSTATE_USING_LAT);
 437                 sti();
 438                 cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 439                 return;
 440         }
 441 
 442         /*
 443          * The check for anything locally runnable is here for performance
 444          * and isn't needed for correctness. disp_nrunnable ought to be
 445          * in our cache still, so it's inexpensive to check, and if there
 446          * is anything runnable we won't have to wait for the poke.
 447          */
 448         if (cpup->cpu_disp->disp_nrunnable != 0) {
 449                 (void) cstate_use_timer(&lapic_expire,
 450                     CSTATE_USING_LAT);
 451                 sti();
 452                 if (hset_update) {
 453                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 454                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 455                 }
 456                 return;
 457         }
 458 
 459         if (using_timer == B_FALSE) {
 460 
 461                 (void) cstate_use_timer(&lapic_expire,
 462                     CSTATE_USING_LAT);
 463                 sti();
 464 
 465                 /*
 466                  * We are currently unable to program the HPET to act as this
 467                  * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
 468                  * because no timer is set to wake it up while its LAPIC timer
 469                  * stalls in deep C-States.
 470                  * Enter C1 instead.
 471                  *
                 * cstate_wakeup() will wake this CPU with an IPI, which
 473                  * works with MWAIT.
 474                  */
 475                 i86_monitor(mcpu_mwait, 0, 0);
 476                 if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
 477                         if (cpu_idle_enter(IDLE_STATE_C1, 0,
 478                             check_func, (void *)mcpu_mwait) == 0) {
 479                                 if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
 480                                     MWAIT_HALTED) {
 481                                         i86_mwait(0, 0);
 482                                 }
 483                                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 484                         }
 485                 }
 486 
 487                 /*
 488                  * We're no longer halted
 489                  */
 490                 if (hset_update) {
 491                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 492                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 493                 }
 494                 return;
 495         }
 496 
 497         if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
 498                 /*
 499                  * We're on our way to being halted.
 500                  * To avoid a lost wakeup, arm the monitor before checking
 501                  * if another cpu wrote to mcpu_mwait to wake us up.
 502                  */
 503                 i86_monitor(mcpu_mwait, 0, 0);
 504                 if (*mcpu_mwait == MWAIT_HALTED) {
 505                         if (cpu_idle_enter((uint_t)cs_type, 0,
 506                             check_func, (void *)mcpu_mwait) == 0) {
 507                                 if (*mcpu_mwait == MWAIT_HALTED) {
 508                                         i86_mwait(cstate->cs_address, 1);
 509                                 }
 510                                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 511                         }
 512                 }
 513         } else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
 514                 uint32_t value;
 515                 ACPI_TABLE_FADT *gbl_FADT;
 516 
 517                 if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
 518                         if (cpu_idle_enter((uint_t)cs_type, 0,
 519                             check_func, (void *)mcpu_mwait) == 0) {
 520                                 if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
 521                                         (void) cpu_acpi_read_port(
 522                                             cstate->cs_address, &value, 8);
 523                                         acpica_get_global_FADT(&gbl_FADT);
 524                                         (void) cpu_acpi_read_port(
 525                                             gbl_FADT->XPmTimerBlock.Address,
 526                                             &value, 32);
 527                                 }
 528                                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 529                         }
 530                 }
 531         }
 532 
 533         /*
 534          * The LAPIC timer may have stopped in deep c-state.
 535          * Reprogram this CPU's LAPIC here before enabling interrupts.
 536          */
 537         (void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
 538         sti();
 539 
 540         /*
 541          * We're no longer halted
 542          */
 543         if (hset_update) {
 544                 cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 545                 bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 546         }
 547 }
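
/*
 * The MONITOR/MWAIT pattern used above, reduced to its core (a sketch; the
 * real code also brackets this with cpu_idle_enter()/cpu_idle_exit() and
 * the timer proxying shown earlier):
 *
 *    i86_monitor(mcpu_mwait, 0, 0);    arm the address monitor first
 *    if (*mcpu_mwait == MWAIT_HALTED)  then recheck, so a wakeup store that
 *            i86_mwait(...);           raced with us cannot be lost
 *
 * A store to the monitored line (MWAIT_WAKEUP()) or interrupt activity
 * terminates the MWAIT.
 */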
 548 
/*
 * Idle the current CPU when deep C-states are supported
 */
 552 void
 553 cpu_acpi_idle(void)
 554 {
 555         cpu_t *cp = CPU;
 556         cpu_acpi_handle_t handle;
 557         cma_c_state_t *cs_data;
 558         cpu_acpi_cstate_t *cstates;
 559         hrtime_t start, end;
 560         int cpu_max_cstates;
 561         uint32_t cs_indx;
 562         uint16_t cs_type;
 563 
 564         cpupm_mach_state_t *mach_state =
 565             (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
 566         handle = mach_state->ms_acpi_handle;
 567         ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
 568 
 569         cs_data = mach_state->ms_cstate.cma_state.cstate;
 570         cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
 571         ASSERT(cstates != NULL);
 572         cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
 573         if (cpu_max_cstates > CPU_MAX_CSTATES)
 574                 cpu_max_cstates = CPU_MAX_CSTATES;
 575         if (cpu_max_cstates == 1) {     /* no ACPI c-state data */
 576                 (*non_deep_idle_cpu)();
 577                 return;
 578         }
 579 
 580         start = gethrtime_unscaled();
 581 
 582         cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
 583 
 584         cs_type = cstates[cs_indx].cs_type;
 585 
 586         switch (cs_type) {
 587         default:
 588                 /* FALLTHROUGH */
 589         case CPU_ACPI_C1:
 590                 (*non_deep_idle_cpu)();
 591                 break;
 592 
 593         case CPU_ACPI_C2:
 594                 acpi_cpu_cstate(&cstates[cs_indx]);
 595                 break;
 596 
 597         case CPU_ACPI_C3:
 598                 /*
 599                  * All supported Intel processors maintain cache coherency
 600                  * during C3.  Currently when entering C3 processors flush
 601                  * core caches to higher level shared cache. The shared cache
 602                  * maintains state and supports probes during C3.
 603                  * Consequently there is no need to handle cache coherency
 604                  * and Bus Master activity here with the cache flush, BM_RLD
 605                  * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
 606                  * in section 8.1.4 of the ACPI Specification 4.0.
 607                  */
 608                 acpi_cpu_cstate(&cstates[cs_indx]);
 609                 break;
 610         }
 611 
 612         end = gethrtime_unscaled();
 613 
 614         /*
 615          * Update statistics
 616          */
 617         cpupm_wakeup_cstate_data(cs_data, end);
 618 }
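
/*
 * Each pass through cpu_acpi_idle() therefore amounts to (details elided):
 *
 *    start = gethrtime_unscaled();
 *    cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
 *    enter cstates[cs_indx]: C1 via (*non_deep_idle_cpu)(), C2/C3 via
 *        acpi_cpu_cstate();
 *    cpupm_wakeup_cstate_data(cs_data, gethrtime_unscaled());
 */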
 619 
 620 boolean_t
 621 cpu_deep_cstates_supported(void)
 622 {
 623         extern int      idle_cpu_no_deep_c;
 624 
 625         if (idle_cpu_no_deep_c)
 626                 return (B_FALSE);
 627 
 628         if (!cpuid_deep_cstates_supported())
 629                 return (B_FALSE);
 630 
 631         if (cpuid_arat_supported()) {
 632                 cpu_cstate_arat = B_TRUE;
 633                 return (B_TRUE);
 634         }
 635 
 636         if ((hpet.supported == HPET_FULL_SUPPORT) &&
 637             hpet.install_proxy()) {
 638                 cpu_cstate_hpet = B_TRUE;
 639                 return (B_TRUE);
 640         }
 641 
 642         return (B_FALSE);
 643 }
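
/*
 * In short, the checks above reduce to:
 *
 *    idle_cpu_no_deep_c set             -> no deep C-states
 *    no CPUID deep C-state support      -> no deep C-states
 *    ARAT supported                     -> LAPIC timer keeps running,
 *                                          no proxy timer needed
 *    HPET fully supported + proxy OK    -> HPET stands in for the LAPIC timer
 *    otherwise                          -> no deep C-states
 */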
 644 
 645 /*
 * Validate that this processor supports deep C-states and, if so,
 647  * get the c-state data from ACPI and cache it.
 648  */
 649 static int
 650 cpu_idle_init(cpu_t *cp)
 651 {
 652         cpupm_mach_state_t *mach_state =
 653             (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
 654         cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
 655         cpu_acpi_cstate_t *cstate;
 656         char name[KSTAT_STRLEN];
 657         int cpu_max_cstates, i;
 658         int ret;
 659 
 660         /*
 661          * Cache the C-state specific ACPI data.
 662          */
 663         if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
 664                 if (ret < 0)
 665                         cmn_err(CE_NOTE,
 666                             "!Support for CPU deep idle states is being "
 667                             "disabled due to errors parsing ACPI C-state "
 668                             "objects exported by BIOS.");
 669                 cpu_idle_fini(cp);
 670                 return (-1);
 671         }
 672 
 673         cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
 674 
 675         cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
 676 
 677         for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
 678                 (void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
 679                 /*
 680                  * Allocate, initialize and install cstate kstat
 681                  */
 682                 cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
 683                     name, "misc",
 684                     KSTAT_TYPE_NAMED,
 685                     sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
 686                     KSTAT_FLAG_VIRTUAL);
 687 
 688                 if (cstate->cs_ksp == NULL) {
 689                         cmn_err(CE_NOTE, "kstat_create(c_state) fail");
 690                 } else {
 691                         cstate->cs_ksp->ks_data = &cpu_idle_kstat;
 692                         cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
 693                         cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
 694                         cstate->cs_ksp->ks_data_size += MAXNAMELEN;
 695                         cstate->cs_ksp->ks_private = cstate;
 696                         kstat_install(cstate->cs_ksp);
 697                 }
 698                 cstate++;
 699         }
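
        /*
         * For example, a CPU whose _CST exports C1-C3 ends up with named
         * kstats cstate:<cpu_id>:c1, cstate:<cpu_id>:c2 and
         * cstate:<cpu_id>:c3, each carrying the address_space_id, latency
         * and power fields filled in by cpu_idle_kstat_update().
         */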
 700 
 701         cpupm_alloc_domains(cp, CPUPM_C_STATES);
 702         cpupm_alloc_ms_cstate(cp);
 703 
 704         if (cpu_deep_cstates_supported()) {
 705                 uint32_t value;
 706 
 707                 mutex_enter(&cpu_idle_callb_mutex);
 708                 if (cpu_deep_idle_callb_id == (callb_id_t)0)
 709                         cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
 710                             (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
 711                 if (cpu_idle_cpr_callb_id == (callb_id_t)0)
 712                         cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
 713                             (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
 714                 mutex_exit(&cpu_idle_callb_mutex);
 715 
 716 
 717                 /*
 718                  * All supported CPUs (Nehalem and later) will remain in C3
 719                  * during Bus Master activity.
                 * Clear ACPI_BITREG_BUS_MASTER_RLD here, if it is not
                 * already 0, before enabling deeper C-states.
 722                  */
 723                 cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
 724                 if (value & 1)
 725                         cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
 726         }
 727 
 728         return (0);
 729 }
 730 
 731 /*
 732  * Free resources allocated by cpu_idle_init().
 733  */
 734 static void
 735 cpu_idle_fini(cpu_t *cp)
 736 {
 737         cpupm_mach_state_t *mach_state =
 738             (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
 739         cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
 740         cpu_acpi_cstate_t *cstate;
 741         uint_t  cpu_max_cstates, i;
 742 
 743         /*
 744          * idle cpu points back to the generic one
 745          */
 746         idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
 747         disp_enq_thread = non_deep_idle_disp_enq_thread;
 748 
 749         cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
 750         if (cstate) {
 751                 cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
 752 
 753                 for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
 754                         if (cstate->cs_ksp != NULL)
 755                                 kstat_delete(cstate->cs_ksp);
 756                         cstate++;
 757                 }
 758         }
 759 
 760         cpupm_free_ms_cstate(cp);
 761         cpupm_free_domains(&cpupm_cstate_domains);
 762         cpu_acpi_free_cstate_data(handle);
 763 
 764         mutex_enter(&cpu_idle_callb_mutex);
 765         if (cpu_deep_idle_callb_id != (callb_id_t)0) {
 766                 (void) callb_delete(cpu_deep_idle_callb_id);
 767                 cpu_deep_idle_callb_id = (callb_id_t)0;
 768         }
 769         if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
 770                 (void) callb_delete(cpu_idle_cpr_callb_id);
 771                 cpu_idle_cpr_callb_id = (callb_id_t)0;
 772         }
 773         mutex_exit(&cpu_idle_callb_mutex);
 774 }
 775 
/*
 * This function exists to resolve a race between the master and the slave
 * over the C-state data structures. Once the slave calls this idle function
 * and switches itself to the non-deep idle function, the master can go on
 * to reclaim the resources.
 */
 782 static void
 783 cpu_idle_stop_sync(void)
 784 {
 785         /* switch to the non deep idle function */
 786         CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
 787 }
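
/*
 * A sketch of the resulting master/slave handshake (see cpu_idle_stop()
 * below):
 *
 *    master: cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
 *    master: poke_cpu(cp->cpu_id);
 *    slave:  wakes up; on its next idle entry runs cpu_idle_stop_sync(),
 *            which sets mcpu_idle_cpu back to non_deep_idle_cpu
 *    master: busy-waits (drv_usecwait(10)) until it observes that switch,
 *            then safely tears down the C-state data
 */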
 788 
 789 static void
 790 cpu_idle_stop(cpu_t *cp)
 791 {
 792         cpupm_mach_state_t *mach_state =
 793             (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
 794         cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
 795         cpu_acpi_cstate_t *cstate;
 796         uint_t cpu_max_cstates, i = 0;
 797 
 798         mutex_enter(&cpu_idle_callb_mutex);
 799         if (idle_cpu == cpu_idle_adaptive) {
 800                 /*
                 * Direct the slave to the synchronous idle function.
 802                  */
 803                 cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
 804                 poke_cpu(cp->cpu_id);
 805 
 806                 /*
                 * Wait until the slave switches to the non-deep idle function,
                 * so that the master can safely go on to reclaim the resources.
 809                  */
 810                 while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
 811                         drv_usecwait(10);
 812                         if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
                                cmn_err(CE_NOTE, "!cpu_idle_stop: timed out "
                                    "waiting for the slave CPU to stop "
                                    "deep idle");
 815                 }
 816         }
 817         mutex_exit(&cpu_idle_callb_mutex);
 818 
 819         cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
 820         if (cstate) {
 821                 cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
 822 
 823                 for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
 824                         if (cstate->cs_ksp != NULL)
 825                                 kstat_delete(cstate->cs_ksp);
 826                         cstate++;
 827                 }
 828         }
 829         cpupm_free_ms_cstate(cp);
 830         cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
 831         cpu_acpi_free_cstate_data(handle);
 832 }
 833 
 834 /*ARGSUSED*/
 835 static boolean_t
 836 cpu_deep_idle_callb(void *arg, int code)
 837 {
 838         boolean_t rslt = B_TRUE;
 839 
 840         mutex_enter(&cpu_idle_callb_mutex);
 841         switch (code) {
 842         case PM_DEFAULT_CPU_DEEP_IDLE:
 843                 /*
                 * The default policy is the same as enable
 845                  */
 846                 /*FALLTHROUGH*/
 847         case PM_ENABLE_CPU_DEEP_IDLE:
 848                 if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
 849                         break;
 850 
 851                 if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
 852                         disp_enq_thread = cstate_wakeup;
 853                         idle_cpu = cpu_idle_adaptive;
 854                         cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
 855                 } else {
 856                         rslt = B_FALSE;
 857                 }
 858                 break;
 859 
 860         case PM_DISABLE_CPU_DEEP_IDLE:
 861                 if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
 862                         break;
 863 
 864                 idle_cpu = non_deep_idle_cpu;
 865                 if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
 866                         disp_enq_thread = non_deep_idle_disp_enq_thread;
 867                         cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
 868                 }
 869                 break;
 870 
 871         default:
 872                 cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
 873                     code);
 874                 break;
 875         }
 876         mutex_exit(&cpu_idle_callb_mutex);
 877         return (rslt);
 878 }
 879 
 880 /*ARGSUSED*/
 881 static boolean_t
 882 cpu_idle_cpr_callb(void *arg, int code)
 883 {
 884         boolean_t rslt = B_TRUE;
 885 
 886         mutex_enter(&cpu_idle_callb_mutex);
 887         switch (code) {
 888         case CB_CODE_CPR_RESUME:
 889                 if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
 890                         /*
 891                          * Do not enable dispatcher hooks if disabled by user.
 892                          */
 893                         if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
 894                                 break;
 895 
 896                         disp_enq_thread = cstate_wakeup;
 897                         idle_cpu = cpu_idle_adaptive;
 898                 } else {
 899                         rslt = B_FALSE;
 900                 }
 901                 break;
 902 
 903         case CB_CODE_CPR_CHKPT:
 904                 idle_cpu = non_deep_idle_cpu;
 905                 disp_enq_thread = non_deep_idle_disp_enq_thread;
 906                 (void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
 907                 break;
 908 
 909         default:
 910                 cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
 911                 break;
 912         }
 913         mutex_exit(&cpu_idle_callb_mutex);
 914         return (rslt);
 915 }
 916 
 917 /*
 * Handle an ACPI _CST change notification
 919  */
 920 void
 921 cpuidle_cstate_instance(cpu_t *cp)
 922 {
 923 #ifndef __xpv
 924         cpupm_mach_state_t      *mach_state =
 925             (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
 926         cpu_acpi_handle_t       handle;
 927         struct machcpu          *mcpu;
 928         cpuset_t                dom_cpu_set;
 929         kmutex_t                *pm_lock;
 930         int                     result = 0;
 931         processorid_t           cpu_id;
 932 
 933         if (mach_state == NULL) {
 934                 return;
 935         }
 936 
 937         ASSERT(mach_state->ms_cstate.cma_domain != NULL);
 938         dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
 939         pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
 940 
 941         /*
         * Do this for all the CPUs in the domain
 943          */
 944         mutex_enter(pm_lock);
 945         do {
 946                 CPUSET_FIND(dom_cpu_set, cpu_id);
 947                 if (cpu_id == CPUSET_NOTINSET)
 948                         break;
 949 
 950                 ASSERT(cpu_id >= 0 && cpu_id < NCPU);
 951                 cp = cpu[cpu_id];
 952                 mach_state = (cpupm_mach_state_t *)
 953                     cp->cpu_m.mcpu_pm_mach_state;
 954                 if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
 955                         mutex_exit(pm_lock);
 956                         return;
 957                 }
 958                 handle = mach_state->ms_acpi_handle;
 959                 ASSERT(handle != NULL);
 960 
 961                 /*
 962                  * re-evaluate cstate object
 963                  */
 964                 if (cpu_acpi_cache_cstate_data(handle) != 0) {
                        cmn_err(CE_WARN, "Cannot re-evaluate the CPU C-state"
                            " object for instance %d", cpu_id);
 967                 }
 968                 mcpu = &(cp->cpu_m);
 969                 mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
 970                 if (mcpu->max_cstates > CPU_ACPI_C1) {
 971                         (void) cstate_timer_callback(
 972                             CST_EVENT_MULTIPLE_CSTATES);
 973                         disp_enq_thread = cstate_wakeup;
 974                         cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
 975                 } else if (mcpu->max_cstates == CPU_ACPI_C1) {
 976                         disp_enq_thread = non_deep_idle_disp_enq_thread;
 977                         cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
 978                         (void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
 979                 }
 980 
 981                 CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
 982         } while (result < 0);
 983         mutex_exit(pm_lock);
 984 #endif
 985 }
 986 
 987 /*
 * Handle a change in the number or type of available processor power states
 989  */
 990 void
 991 cpuidle_manage_cstates(void *ctx)
 992 {
 993         cpu_t                   *cp = ctx;
 994         cpupm_mach_state_t      *mach_state =
 995             (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
 996         boolean_t               is_ready;
 997 
 998         if (mach_state == NULL) {
 999                 return;
1000         }
1001 
1002         /*
1003          * We currently refuse to power manage if the CPU is not ready to
         * take cross calls (cross calls fail silently if the CPU is not
         * ready for them).
1006          *
1007          * Additionally, for x86 platforms we cannot power manage an instance,
1008          * until it has been initialized.
1009          */
1010         is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
1011         if (!is_ready)
1012                 return;
1013 
1014         cpuidle_cstate_instance(cp);
1015 }