/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */

#include <sys/cpu_pm.h>
#include <sys/x86_archext.h>
#include <sys/sdt.h>
#include <sys/spl.h>
#include <sys/machsystm.h>
#include <sys/archsystm.h>
#include <sys/hpet.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/cpupm.h>
#include <sys/cpu_idle.h>
#include <sys/cpu_acpi.h>
#include <sys/cpupm_throttle.h>
#include <sys/dtrace.h>
#include <sys/note.h>

/*
 * This callback is used to build the PPM CPU domains once
 * a CPU device has been started. The callback is initialized
 * by the PPM driver to point to a routine that will build the
 * domains.
 */
void (*cpupm_ppm_alloc_pstate_domains)(cpu_t *);

/*
 * This callback is used to remove a CPU from the PPM CPU domains
 * when the cpu driver is detached. The callback is initialized
 * by the PPM driver to point to a routine that will remove the CPU
 * from the domains.
 */
void (*cpupm_ppm_free_pstate_domains)(cpu_t *);

/*
 * This callback is used to redefine the topspeed for a CPU device.
 * Since all CPUs in a domain should have identical properties, this
 * callback is initialized by the PPM driver to point to a routine
 * that will redefine the topspeed for all devices in a CPU domain.
 * This callback is exercised whenever an ACPI _PPC change notification
 * is received by the CPU driver.
 */
void (*cpupm_redefine_topspeed)(void *);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to set a new topspeed for a CPU.
 */
void (*cpupm_set_topspeed_callb)(void *, int);

/*
 * This callback is used by the PPM driver to call into the CPU driver
 * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
 */
int (*cpupm_get_topspeed_callb)(void *);

static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
static void cpupm_free_notify_handlers(cpu_t *);
static void cpupm_power_manage_notifications(void *);

/*
 * Until proven otherwise, all power states are manageable.
 */
static uint32_t cpupm_enabled = CPUPM_ALL_STATES;

cpupm_state_domains_t *cpupm_pstate_domains = NULL;
cpupm_state_domains_t *cpupm_tstate_domains = NULL;
cpupm_state_domains_t *cpupm_cstate_domains = NULL;

/*
 * C-state tunables
 *
 * cpupm_cs_sample_interval is the length of time we wait before
 * recalculating C-state statistics.  When a CPU goes idle it checks
 * whether it has been longer than cpupm_cs_sample_interval since it
 * last calculated which C-state to go to.
 *
 * cpupm_cs_idle_cost_tunable is the ratio of time the CPU spends
 * executing plus idle, divided by the time spent in idle-state
 * transitions.  A value of 10 means the CPU will not spend more than
 * 1/10 of its time in idle latency.  Worst-case performance is then
 * 90% of that of a kernel that never enters deep C-states.
 *
 * cpupm_cs_idle_save_tunable is how long we must stay in a deeper
 * C-state before entering it is worthwhile, expressed as a multiple
 * of that C-state's latency.
 */
uint32_t cpupm_cs_sample_interval = 100*1000*1000;	/* 100 milliseconds */
uint32_t cpupm_cs_idle_cost_tunable = 10;	/* work time / latency cost */
uint32_t cpupm_cs_idle_save_tunable = 2;	/* idle power savings */
uint16_t cpupm_C2_idle_pct_tunable = 70;
uint16_t cpupm_C3_idle_pct_tunable = 80;
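
/*
 * Worked example (illustrative numbers, not values taken from any
 * particular platform): with cpupm_cs_idle_cost_tunable = 10, a C-state
 * whose wakeup latency is 100us is chosen by cpupm_next_cstate() only
 * when the average interval between wakeups exceeds 10 * 100us = 1ms;
 * with cpupm_cs_idle_save_tunable = 2, the average idle stretch must
 * also exceed 2 * 100us = 200us before entering that C-state pays off.
 */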

#ifndef __xpv
extern boolean_t cpupm_intel_init(cpu_t *);
extern boolean_t cpupm_amd_init(cpu_t *);

typedef struct cpupm_vendor {
	boolean_t	(*cpuv_init)(cpu_t *);
} cpupm_vendor_t;

/*
 * Table of supported vendors.
 */
static cpupm_vendor_t cpupm_vendors[] = {
	{ cpupm_intel_init },
	{ cpupm_amd_init },
	{ NULL }
};
#endif

/*
 * Initialize the machine.
 * See if a module exists for managing power for this CPU.
 */
/*ARGSUSED*/
void
cpupm_init(cpu_t *cp)
{
#ifndef __xpv
	cpupm_vendor_t *vendors;
	cpupm_mach_state_t *mach_state;
	struct machcpu *mcpu = &(cp->cpu_m);
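	/*
	 * first remains B_TRUE only until the first CPU has been through
	 * this routine; it gates the one-time write of the kernel's CPU
	 * PM capabilities to ACPI via acpica_write_cpupm_capabilities().
	 */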
	static boolean_t first = B_TRUE;
	int *speeds;
	uint_t nspeeds;
	int ret;

	mach_state = cp->cpu_m.mcpu_pm_mach_state =
	    kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
	mach_state->ms_caps = CPUPM_NO_STATES;
	mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);

	mach_state->ms_acpi_handle = cpu_acpi_init(cp);
	if (mach_state->ms_acpi_handle == NULL) {
		cpupm_fini(cp);
		cmn_err(CE_WARN, "!cpupm_init: processor %d: "
		    "unable to get ACPI handle", cp->cpu_id);
		cmn_err(CE_NOTE, "!CPU power management will not function.");
		CPUPM_DISABLE();
		first = B_FALSE;
		return;
	}

	/*
	 * Loop through the CPU management module table and see if
	 * any of the modules implement CPU power management
	 * for this CPU.
	 */
	for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
		if (vendors->cpuv_init(cp))
			break;
	}

	/*
	 * Nope, we can't power manage this CPU.  (When no vendor module
	 * claims the CPU, the loop above leaves vendors pointing at the
	 * table's NULL terminator, so test cpuv_init rather than the
	 * table pointer itself.)
	 */
	if (vendors->cpuv_init == NULL) {
		cpupm_fini(cp);
		CPUPM_DISABLE();
		first = B_FALSE;
		return;
	}

	/*
	 * If P-state support exists for this system, then initialize it.
	 */
	if (mach_state->ms_pstate.cma_ops != NULL) {
		ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
		if (ret != 0) {
			mach_state->ms_pstate.cma_ops = NULL;
			cpupm_disable(CPUPM_P_STATES);
		} else {
			nspeeds = cpupm_get_speeds(cp, &speeds);
			if (nspeeds == 0) {
				cmn_err(CE_NOTE, "!cpupm_init: processor %d:"
				    " no speeds to manage", cp->cpu_id);
			} else {
				cpupm_set_supp_freqs(cp, speeds, nspeeds);
				cpupm_free_speeds(speeds, nspeeds);
				mach_state->ms_caps |= CPUPM_P_STATES;
			}
		}
	} else {
		cpupm_disable(CPUPM_P_STATES);
	}

	/*
	 * If T-state support exists for this system, then initialize it.
	 */
	if (mach_state->ms_tstate.cma_ops != NULL) {
		ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
		if (ret != 0) {
			mach_state->ms_tstate.cma_ops = NULL;
			cpupm_disable(CPUPM_T_STATES);
		} else {
			mach_state->ms_caps |= CPUPM_T_STATES;
		}
	} else {
		cpupm_disable(CPUPM_T_STATES);
	}

	/*
	 * If C-state support exists for this system, then initialize it.
	 */
	if (mach_state->ms_cstate.cma_ops != NULL) {
		ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
		if (ret != 0) {
			mach_state->ms_cstate.cma_ops = NULL;
			mcpu->max_cstates = CPU_ACPI_C1;
			cpupm_disable(CPUPM_C_STATES);
			idle_cpu = non_deep_idle_cpu;
			disp_enq_thread = non_deep_idle_disp_enq_thread;
		} else if (cpu_deep_cstates_supported()) {
			mcpu->max_cstates = cpu_acpi_get_max_cstates(
			    mach_state->ms_acpi_handle);
			if (mcpu->max_cstates > CPU_ACPI_C1) {
				(void) cstate_timer_callback(
				    CST_EVENT_MULTIPLE_CSTATES);
				cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
				mcpu->mcpu_idle_type = CPU_ACPI_C1;
				disp_enq_thread = cstate_wakeup;
			} else {
				(void) cstate_timer_callback(
				    CST_EVENT_ONE_CSTATE);
			}
			mach_state->ms_caps |= CPUPM_C_STATES;
		} else {
			mcpu->max_cstates = CPU_ACPI_C1;
			idle_cpu = non_deep_idle_cpu;
			disp_enq_thread = non_deep_idle_disp_enq_thread;
		}
	} else {
		cpupm_disable(CPUPM_C_STATES);
	}

	if (mach_state->ms_caps == CPUPM_NO_STATES) {
		cpupm_fini(cp);
		CPUPM_DISABLE();
		first = B_FALSE;
		return;
	}

	if ((mach_state->ms_caps & CPUPM_T_STATES) ||
	    (mach_state->ms_caps & CPUPM_P_STATES) ||
	    (mach_state->ms_caps & CPUPM_C_STATES)) {
		if (first) {
			acpica_write_cpupm_capabilities(
			    mach_state->ms_caps & CPUPM_P_STATES,
			    mach_state->ms_caps & CPUPM_C_STATES);
		}
		if (mach_state->ms_caps & CPUPM_T_STATES) {
			cpupm_throttle_manage_notification(cp);
		}
		if (mach_state->ms_caps & CPUPM_C_STATES) {
			cpuidle_manage_cstates(cp);
		}
		if (mach_state->ms_caps & CPUPM_P_STATES) {
			cpupm_power_manage_notifications(cp);
		}
		cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
	}
	first = B_FALSE;
#endif
}

/*
 * Free any resources allocated during cpupm initialization or cpupm
 * start.  If cpupm_stop is B_TRUE the CPU is being stopped, so call the
 * (*cpus_stop)() ops; otherwise the CPU is being detached, so call the
 * (*cpus_fini)() ops.
 */
/*ARGSUSED*/
void
cpupm_free(cpu_t *cp, boolean_t cpupm_stop)
{
#ifndef __xpv
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;

	if (mach_state == NULL)
		return;

	if (mach_state->ms_pstate.cma_ops != NULL) {
		if (cpupm_stop)
			mach_state->ms_pstate.cma_ops->cpus_stop(cp);
		else
			mach_state->ms_pstate.cma_ops->cpus_fini(cp);
		mach_state->ms_pstate.cma_ops = NULL;
	}

	if (mach_state->ms_tstate.cma_ops != NULL) {
		if (cpupm_stop)
			mach_state->ms_tstate.cma_ops->cpus_stop(cp);
		else
			mach_state->ms_tstate.cma_ops->cpus_fini(cp);
		mach_state->ms_tstate.cma_ops = NULL;
	}

	if (mach_state->ms_cstate.cma_ops != NULL) {
		if (cpupm_stop)
			mach_state->ms_cstate.cma_ops->cpus_stop(cp);
		else
			mach_state->ms_cstate.cma_ops->cpus_fini(cp);

		mach_state->ms_cstate.cma_ops = NULL;
	}

	cpupm_free_notify_handlers(cp);

	if (mach_state->ms_acpi_handle != NULL) {
		cpu_acpi_fini(mach_state->ms_acpi_handle);
		mach_state->ms_acpi_handle = NULL;
	}

	mutex_destroy(&mach_state->ms_lock);
	kmem_free(mach_state, sizeof (cpupm_mach_state_t));
	cp->cpu_m.mcpu_pm_mach_state = NULL;
#endif
}

void
cpupm_fini(cpu_t *cp)
{
	/*
	 * Call the (*cpus_fini)() ops to release the cpupm resources
	 * held by the P/C/T-state drivers.
	 */
	cpupm_free(cp, B_FALSE);
}

void
cpupm_start(cpu_t *cp)
{
	cpupm_init(cp);
}

void
cpupm_stop(cpu_t *cp)
{
	/*
	 * Call the (*cpus_stop)() ops to reclaim the cpupm resources
	 * held by the P/C/T-state drivers.
	 */
	cpupm_free(cp, B_TRUE);
}

/*
 * If a CPU has started and at least one power state is manageable,
 * then the CPU is ready for power management.
 */
boolean_t
cpupm_is_ready(cpu_t *cp)
{
#ifndef __xpv
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	uint32_t cpupm_caps;

	if (cpupm_enabled == CPUPM_NO_STATES || mach_state == NULL)
		return (B_FALSE);

	cpupm_caps = mach_state->ms_caps;
	if ((cpupm_caps & CPUPM_T_STATES) ||
	    (cpupm_caps & CPUPM_P_STATES) ||
	    (cpupm_caps & CPUPM_C_STATES))
		return (B_TRUE);
	return (B_FALSE);
#else
	_NOTE(ARGUNUSED(cp));
	return (B_FALSE);
#endif
}

boolean_t
cpupm_is_enabled(uint32_t state)
{
	return ((cpupm_enabled & state) == state);
}

/*
 * By default, all states are enabled.  Disabling a set of states also
 * frees any power domains allocated for them.
 */
void
cpupm_disable(uint32_t state)
{
	if (state & CPUPM_P_STATES) {
		cpupm_free_domains(&cpupm_pstate_domains);
	}
	if (state & CPUPM_T_STATES) {
		cpupm_free_domains(&cpupm_tstate_domains);
	}
	if (state & CPUPM_C_STATES) {
		cpupm_free_domains(&cpupm_cstate_domains);
	}
	cpupm_enabled &= ~state;
}

/*
 * Allocate power domains for C-, P- and T-states.
 */
void
cpupm_alloc_domains(cpu_t *cp, int state)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpupm_state_domains_t **dom_ptr;
	cpupm_state_domains_t *dptr;
	cpupm_state_domains_t **mach_dom_state_ptr;
	uint32_t domain;
	uint32_t type;

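	/*
	 * If ACPI supplied a dependency object (_PSD, _TSD or _CSD) for
	 * this state type, use the domain and coordination type it
	 * describes.  Otherwise fall back to a hardware-derived domain:
	 * the chip for P- and T-states, the core for C-states, with
	 * CPU_ACPI_HW_ALL coordination.
	 */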
	switch (state) {
	case CPUPM_P_STATES:
		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
			domain = CPU_ACPI_PSD(handle).sd_domain;
			type = CPU_ACPI_PSD(handle).sd_type;
		} else {
			if (MUTEX_HELD(&cpu_lock)) {
				domain = cpuid_get_chipid(cp);
			} else {
				mutex_enter(&cpu_lock);
				domain = cpuid_get_chipid(cp);
				mutex_exit(&cpu_lock);
			}
			type = CPU_ACPI_HW_ALL;
		}
		dom_ptr = &cpupm_pstate_domains;
		mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
		break;
	case CPUPM_T_STATES:
		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
			domain = CPU_ACPI_TSD(handle).sd_domain;
			type = CPU_ACPI_TSD(handle).sd_type;
		} else {
			if (MUTEX_HELD(&cpu_lock)) {
				domain = cpuid_get_chipid(cp);
			} else {
				mutex_enter(&cpu_lock);
				domain = cpuid_get_chipid(cp);
				mutex_exit(&cpu_lock);
			}
			type = CPU_ACPI_HW_ALL;
		}
		dom_ptr = &cpupm_tstate_domains;
		mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
		break;
	case CPUPM_C_STATES:
		if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
			domain = CPU_ACPI_CSD(handle).sd_domain;
			type = CPU_ACPI_CSD(handle).sd_type;
		} else {
			if (MUTEX_HELD(&cpu_lock)) {
				domain = cpuid_get_coreid(cp);
			} else {
				mutex_enter(&cpu_lock);
				domain = cpuid_get_coreid(cp);
				mutex_exit(&cpu_lock);
			}
			type = CPU_ACPI_HW_ALL;
		}
		dom_ptr = &cpupm_cstate_domains;
		mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
		break;
	default:
		return;
	}

	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
		if (dptr->pm_domain == domain)
			break;
	}

	/* If no matching domain exists, create one and link it at the head. */
	if (dptr == NULL) {
		dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
		dptr->pm_domain = domain;
		dptr->pm_type = type;
		dptr->pm_next = *dom_ptr;
		mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
		    (void *)ipltospl(DISP_LEVEL));
		CPUSET_ZERO(dptr->pm_cpus);
		*dom_ptr = dptr;
	}
	CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
	*mach_dom_state_ptr = dptr;
}

/*
 * Free C-, P- or T-state power domains.
 */
void
cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
{
	cpupm_state_domains_t *this_domain, *next_domain;

	this_domain = *dom_ptr;
	while (this_domain != NULL) {
		next_domain = this_domain->pm_next;
		mutex_destroy(&this_domain->pm_lock);
		kmem_free((void *)this_domain,
		    sizeof (cpupm_state_domains_t));
		this_domain = next_domain;
	}
	*dom_ptr = NULL;
}

/*
 * Remove a CPU from its C-, P- or T-state power domain.
 */
void
cpupm_remove_domains(cpu_t *cp, int state, cpupm_state_domains_t **dom_ptr)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpupm_state_domains_t *dptr;
	uint32_t pm_domain;

	ASSERT(mach_state);

	switch (state) {
	case CPUPM_P_STATES:
		pm_domain = mach_state->ms_pstate.cma_domain->pm_domain;
		break;
	case CPUPM_T_STATES:
		pm_domain = mach_state->ms_tstate.cma_domain->pm_domain;
		break;
	case CPUPM_C_STATES:
		pm_domain = mach_state->ms_cstate.cma_domain->pm_domain;
		break;
	default:
		return;
	}

	/*
	 * Find the CPU's C-, P- or T-state power domain.
	 */
	for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
		if (dptr->pm_domain == pm_domain)
			break;
	}

	/*
	 * Return if no matching domain was found.
	 */
	if (dptr == NULL)
		return;

	/*
	 * We found a matching power domain; remove the CPU from its
	 * cpuset.  pm_lock (a spin lock) is held here to avoid races
	 * between event-change notification and CPU removal.
	 */
	mutex_enter(&dptr->pm_lock);
	if (CPU_IN_SET(dptr->pm_cpus, cp->cpu_id))
		CPUSET_DEL(dptr->pm_cpus, cp->cpu_id);
	mutex_exit(&dptr->pm_lock);
}

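/*
 * Allocate the per-CPU C-state accounting data used by the idle policy.
 * A newly started CPU initially targets C1.
 */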
void
cpupm_alloc_ms_cstate(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state;
	cpupm_mach_acpi_state_t *ms_cstate;

	mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	ms_cstate = &mach_state->ms_cstate;
	ASSERT(ms_cstate->cma_state.cstate == NULL);
	ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
	    KM_SLEEP);
	ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
}

void
cpupm_free_ms_cstate(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;

	if (ms_cstate->cma_state.cstate != NULL) {
		kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
		ms_cstate->cma_state.cstate = NULL;
	}
}

void
cpupm_state_change(cpu_t *cp, int level, int state)
{
	cpupm_mach_state_t	*mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpupm_state_ops_t	*state_ops;
	cpupm_state_domains_t	*state_domain;
	cpuset_t		set;

	DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);

	if (mach_state == NULL) {
		return;
	}

	switch (state) {
	case CPUPM_P_STATES:
		state_ops = mach_state->ms_pstate.cma_ops;
		state_domain = mach_state->ms_pstate.cma_domain;
		break;
	case CPUPM_T_STATES:
		state_ops = mach_state->ms_tstate.cma_ops;
		state_domain = mach_state->ms_tstate.cma_domain;
		break;
	default:
		/*
		 * Only P- and T-state changes are coordinated here;
		 * return rather than use uninitialized pointers below.
		 */
		return;
	}

	switch (state_domain->pm_type) {
	case CPU_ACPI_SW_ANY:
		/*
		 * A request on any CPU in the domain transitions the domain.
		 */
		CPUSET_ONLY(set, cp->cpu_id);
		state_ops->cpus_change(set, level);
		break;
	case CPU_ACPI_SW_ALL:
		/*
		 * All CPUs in the domain must request the transition.
		 */
	case CPU_ACPI_HW_ALL:
		/*
		 * P/T-state transitions are coordinated by the hardware.
		 * For now, request the transition on all CPUs in the domain,
		 * but looking ahead we can probably be smarter about this.
		 */
		mutex_enter(&state_domain->pm_lock);
		state_ops->cpus_change(state_domain->pm_cpus, level);
		mutex_exit(&state_domain->pm_lock);
		break;
	default:
		cmn_err(CE_NOTE, "Unknown domain coordination type: %d",
		    state_domain->pm_type);
	}
}

/*
 * CPU PM interfaces exposed to the CPU power manager
 */
/*ARGSUSED*/
id_t
cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
{
	cpupm_mach_state_t	*mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);

	if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
	    !cpupm_is_enabled(CPUPM_C_STATES))) {
		return (CPUPM_NO_DOMAIN);
	}
	if (type == CPUPM_DTYPE_ACTIVE) {
		/*
		 * Return the P-state domain for the specified CPU.
		 */
		if (mach_state->ms_pstate.cma_domain) {
			return (mach_state->ms_pstate.cma_domain->pm_domain);
		}
	} else if (type == CPUPM_DTYPE_IDLE) {
		/*
		 * Return the C-state domain for the specified CPU.
		 */
		if (mach_state->ms_cstate.cma_domain) {
			return (mach_state->ms_cstate.cma_domain->pm_domain);
		}
	}
	return (CPUPM_NO_DOMAIN);
}

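/*
 * A typical enumeration sequence (an illustrative sketch; no caller in
 * this file does this) first asks for the state count by passing NULL,
 * then fills a caller-allocated array:
 *
 *	uint_t n = cpupm_plat_state_enumerate(cp, CPUPM_DTYPE_ACTIVE, NULL);
 *	cpupm_state_t *sp = kmem_zalloc(n * sizeof (cpupm_state_t), KM_SLEEP);
 *	(void) cpupm_plat_state_enumerate(cp, CPUPM_DTYPE_ACTIVE, sp);
 */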
/*ARGSUSED*/
uint_t
cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
    cpupm_state_t *states)
{
	int	*speeds;
	uint_t	nspeeds, i;

	/*
	 * Idle domain support is unimplemented.
	 */
	if (type != CPUPM_DTYPE_ACTIVE) {
		return (0);
	}
	nspeeds = cpupm_get_speeds(cp, &speeds);

	/*
	 * If the caller passes NULL for states, just return the
	 * number of states.
	 */
	if (states != NULL) {
		for (i = 0; i < nspeeds; i++) {
			states[i].cps_speed = speeds[i];
			states[i].cps_handle = (cpupm_handle_t)i;
		}
	}
	cpupm_free_speeds(speeds, nspeeds);
	return (nspeeds);
}

/*ARGSUSED*/
int
cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
{
	if (!cpupm_is_ready(cp))
		return (-1);

	cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);

	return (0);
}

/*ARGSUSED*/
/*
 * Note: it is the responsibility of callers of cpupm_get_speeds() to
 * free the memory allocated for speeds using cpupm_free_speeds().
 */
uint_t
cpupm_get_speeds(cpu_t *cp, int **speeds)
{
#ifndef __xpv
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
#else
	return (0);
#endif
}

/*ARGSUSED*/
void
cpupm_free_speeds(int *speeds, uint_t nspeeds)
{
#ifndef __xpv
	cpu_acpi_free_speeds(speeds, nspeeds);
#endif
}

/*
 * P-states are manageable and this CPU instance has been initialized
 * successfully.
 */
boolean_t
cpupm_power_ready(cpu_t *cp)
{
	return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready(cp));
}

/*
 * T-states are manageable and this CPU instance has been initialized
 * successfully.
 */
boolean_t
cpupm_throttle_ready(cpu_t *cp)
{
	return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready(cp));
}

/*
 * C-states are manageable and this CPU instance has been initialized
 * successfully.
 */
boolean_t
cpupm_cstate_ready(cpu_t *cp)
{
	return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready(cp));
}

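/*
 * Dispatch an ACPI notification to every handler registered for this
 * CPU.  ms_lock serializes the list walk against concurrent handler
 * registration and removal.
 */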
void
cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
{
	cpu_t *cp = ctx;
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpupm_notification_t *entry;

	mutex_enter(&mach_state->ms_lock);
	for (entry = mach_state->ms_handlers; entry != NULL;
	    entry = entry->nq_next) {
		entry->nq_handler(obj, val, entry->nq_ctx);
	}
	mutex_exit(&mach_state->ms_lock);
}

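/*
 * Register a notification handler for a CPU.  The ACPICA-level notify
 * handler is installed only when the first entry is added; subsequent
 * handlers are simply chained onto the existing list.
 */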
/*ARGSUSED*/
void
cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
{
#ifndef __xpv
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpupm_notification_t *entry;

	entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
	entry->nq_handler = handler;
	entry->nq_ctx = ctx;
	mutex_enter(&mach_state->ms_lock);
	if (mach_state->ms_handlers == NULL) {
		entry->nq_next = NULL;
		mach_state->ms_handlers = entry;
		cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
		    cpupm_notify_handler, cp);
	} else {
		entry->nq_next = mach_state->ms_handlers;
		mach_state->ms_handlers = entry;
	}
	mutex_exit(&mach_state->ms_lock);
#endif
}

/*ARGSUSED*/
static void
cpupm_free_notify_handlers(cpu_t *cp)
{
#ifndef __xpv
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpupm_notification_t *entry;
	cpupm_notification_t *next;

	mutex_enter(&mach_state->ms_lock);
	if (mach_state->ms_handlers == NULL) {
		mutex_exit(&mach_state->ms_lock);
		return;
	}
	if (mach_state->ms_acpi_handle != NULL) {
		cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
		    cpupm_notify_handler);
	}
	entry = mach_state->ms_handlers;
	while (entry != NULL) {
		next = entry->nq_next;
		kmem_free(entry, sizeof (cpupm_notification_t));
		entry = next;
	}
	mach_state->ms_handlers = NULL;
	mutex_exit(&mach_state->ms_lock);
#endif
}

/*
 * Get the current max speed from the ACPI _PPC object.
 */
/*ARGSUSED*/
int
cpupm_get_top_speed(cpu_t *cp)
{
#ifndef __xpv
	cpupm_mach_state_t	*mach_state;
	cpu_acpi_handle_t	handle;
	int			plat_level;
	uint_t			nspeeds;
	int			max_level;

	mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	handle = mach_state->ms_acpi_handle;

	cpu_acpi_cache_ppc(handle);
	plat_level = CPU_ACPI_PPC(handle);

	nspeeds = CPU_ACPI_PSTATES_COUNT(handle);

	max_level = nspeeds - 1;
	if ((plat_level < 0) || (plat_level > max_level)) {
		cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
		    "_PPC out of range %d", cp->cpu_id, plat_level);
		plat_level = 0;
	}

	return (plat_level);
#else
	return (0);
#endif
}

/*
 * This notification handler is called whenever the ACPI _PPC
 * object changes.  The _PPC acts as a sort of governor on power levels:
 * it sets an upper threshold on which of the _PSS-defined power levels
 * are usable.  The _PPC value is dynamic and may change as properties
 * of the system (e.g., thermal state or AC source) change.
 */

static void
cpupm_power_manage_notifications(void *ctx)
{
	cpu_t			*cp = ctx;
	int			top_speed;

	top_speed = cpupm_get_top_speed(cp);
	cpupm_redefine_max_activepwr_state(cp, top_speed);
}

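/*
 * Common ACPI event handler: route _TPC, _CST and _PPC change
 * notifications to the T-, C- and P-state management code respectively,
 * but only for the state types this CPU is actually managing.
 */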
/* ARGSUSED */
static void
cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
{
#ifndef __xpv

	cpu_t *cp = ctx;
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);

	if (mach_state == NULL)
		return;

	/*
	 * Currently, we handle _TPC, _CST and _PPC change notifications.
	 */
	if (val == CPUPM_TPC_CHANGE_NOTIFICATION &&
	    mach_state->ms_caps & CPUPM_T_STATES) {
		cpupm_throttle_manage_notification(ctx);
	} else if (val == CPUPM_CST_CHANGE_NOTIFICATION &&
	    mach_state->ms_caps & CPUPM_C_STATES) {
		cpuidle_manage_cstates(ctx);
	} else if (val == CPUPM_PPC_CHANGE_NOTIFICATION &&
	    mach_state->ms_caps & CPUPM_P_STATES) {
		cpupm_power_manage_notifications(ctx);
	}
#endif
}

/*
 * Update cpupm C-state data each time a CPU exits idle.
 */
void
cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
{
	cs_data->cs_idle_exit = end;
}

/*
 * Determine the next C-state based on cpupm data.
 * Update cpupm C-state data each time a CPU goes idle.
 * Do as much as possible in this idle-entry bookkeeping function,
 * because its performance impact while idle is minimal compared to
 * the wakeup path, where there is real work to do.
 */
uint32_t
cpupm_next_cstate(cma_c_state_t *cs_data, cpu_acpi_cstate_t *cstates,
    uint32_t cs_count, hrtime_t start)
{
	hrtime_t duration;
	hrtime_t ave_interval;
	hrtime_t ave_idle_time;
	uint32_t i, smpl_cnt;

	duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
	scalehrtime(&duration);
	cs_data->cs_idle += duration;
	cs_data->cs_idle_enter = start;

	smpl_cnt = ++cs_data->cs_cnt;
	cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
	scalehrtime(&cs_data->cs_smpl_len);
	if (cs_data->cs_smpl_len > cpupm_cs_sample_interval) {
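		/*
		 * The sampling window has expired: snapshot the idle time
		 * accumulated over the window, recompute the idle
		 * percentage, and restart the window and sample count.
		 */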
		cs_data->cs_smpl_idle = cs_data->cs_idle;
		cs_data->cs_idle = 0;
		cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
		    cs_data->cs_smpl_len);

		cs_data->cs_smpl_start = start;
		cs_data->cs_cnt = 0;

		/*
		 * Strand-level C-state policy.
		 * The cpu_acpi_cstate_t *cstates array is not required to
		 * have an entry for both CPU_ACPI_C2 and CPU_ACPI_C3.
		 * There are cs_count entries in the cstates array.
		 * cs_data->cs_next_cstate contains the index of the next
		 * C-state this CPU should enter.
		 */
		ASSERT(cstates[0].cs_type == CPU_ACPI_C1);

		/*
		 * Will the CPU be idle long enough to save power?
		 */
		ave_idle_time = (cs_data->cs_smpl_idle / smpl_cnt) / 1000;
		for (i = 1; i < cs_count; ++i) {
			if (ave_idle_time < (cstates[i].cs_latency *
			    cpupm_cs_idle_save_tunable)) {
				cs_count = i;
				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
				    CPU, int, i);
			}
		}

		/*
		 * Does the CPU wake up often (even when non-idle time is
		 * very short)?  Some producer/consumer type loads fall
		 * into this category.
		 */
		ave_interval = (cs_data->cs_smpl_len / smpl_cnt) / 1000;
		for (i = 1; i < cs_count; ++i) {
			if (ave_interval <= (cstates[i].cs_latency *
			    cpupm_cs_idle_cost_tunable)) {
				cs_count = i;
				DTRACE_PROBE2(cpupm__next__cstate, cpu_t *,
				    CPU, int, (CPU_MAX_CSTATES + i));
			}
		}

		/*
		 * Idle percent
		 */
		for (i = 1; i < cs_count; ++i) {
			switch (cstates[i].cs_type) {
			case CPU_ACPI_C2:
				if (cs_data->cs_smpl_idle_pct <
				    cpupm_C2_idle_pct_tunable) {
					cs_count = i;
					DTRACE_PROBE2(cpupm__next__cstate,
					    cpu_t *, CPU, int,
					    ((2 * CPU_MAX_CSTATES) + i));
				}
				break;

			case CPU_ACPI_C3:
				if (cs_data->cs_smpl_idle_pct <
				    cpupm_C3_idle_pct_tunable) {
					cs_count = i;
					DTRACE_PROBE2(cpupm__next__cstate,
					    cpu_t *, CPU, int,
					    ((2 * CPU_MAX_CSTATES) + i));
				}
				break;
			}
		}

		cs_data->cs_next_cstate = cs_count - 1;
	}

	return (cs_data->cs_next_cstate);
}