1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
  26  */
  27 /*
  28  * Copyright (c) 2010, Intel Corporation.
  29  * All rights reserved.
  30  */
  31 /*
  32  * Portions Copyright 2009 Advanced Micro Devices, Inc.
  33  */
  34 /*
  35  * Copyright 2020 Joyent, Inc.
  36  */
  37 
  38 /*
  39  * CPU Identification logic
  40  *
  41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
  42  * with the identification of CPUs, their features, and their topologies. More
  43  * specifically, this file helps drive the following:
  44  *
  45  * 1. Enumeration of features of the processor which are used by the kernel to
  46  *    determine what features to enable or disable. These may be instruction set
  47  *    enhancements or features that we use.
  48  *
  49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
  50  *    will be told about through the auxiliary vector.
  51  *
 * 3. Understanding the physical topology of the CPU such as the number of
 *    caches, how many cores it has, whether or not it supports simultaneous
 *    multi-threading (SMT), etc.
  55  *
  56  * ------------------------
  57  * CPUID History and Basics
  58  * ------------------------
  59  *
  60  * The cpuid instruction was added by Intel roughly around the time that the
 * original Pentium was introduced. The purpose of cpuid was to provide, in a
 * programmatic fashion, information about the CPU that previously had to be
 * guessed at. For example, an important part of cpuid is that we can know what
 * extensions to the ISA exist. Executing an invalid opcode raises a #UD, so
 * this mechanism allows a program (whether a user program or the kernel) to
 * determine what exists without crashing or getting a SIGILL. Of course,
  67  * this was also during the era of the clones and the AMD Am5x86. The vendor
  68  * name shows up first in cpuid for a reason.
  69  *
 * cpuid information is broken down into ranges of values called 'leaves'. Each
 * leaf puts unique values into the registers %eax, %ebx, %ecx, and %edx, and
 * each leaf has its own meaning. The different leaves are broken down into
 * different regions:
  73  *
  74  *      [ 0, 7fffffff ]                 This region is called the 'basic'
  75  *                                      region. This region is generally defined
  76  *                                      by Intel, though some of the original
  77  *                                      portions have different meanings based
  78  *                                      on the manufacturer. These days, Intel
  79  *                                      adds most new features to this region.
  80  *                                      AMD adds non-Intel compatible
  81  *                                      information in the third, extended
  82  *                                      region. Intel uses this for everything
  83  *                                      including ISA extensions, CPU
  84  *                                      features, cache information, topology,
  85  *                                      and more.
  86  *
  87  *                                      There is a hole carved out of this
  88  *                                      region which is reserved for
  89  *                                      hypervisors.
  90  *
  91  *      [ 40000000, 4fffffff ]          This region, which is found in the
  92  *                                      middle of the previous region, is
  93  *                                      explicitly promised to never be used by
  94  *                                      CPUs. Instead, it is used by hypervisors
  95  *                                      to communicate information about
  96  *                                      themselves to the operating system. The
  97  *                                      values and details are unique for each
  98  *                                      hypervisor.
  99  *
 100  *      [ 80000000, ffffffff ]          This region is called the 'extended'
 101  *                                      region. Some of the low leaves mirror
 102  *                                      parts of the basic leaves. This region
 103  *                                      has generally been used by AMD for
 104  *                                      various extensions. For example, AMD-
 105  *                                      specific information about caches,
 106  *                                      features, and topology are found in this
 107  *                                      region.
 108  *
 * To read a leaf, you place the desired leaf number into %eax, zero %ebx, %ecx,
 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
 * the ranges, one of the primary things returned is the maximum valid leaf in
 * that range. This allows for discovery of what range of CPUID is valid.
 113  *
 114  * The CPUs have potentially surprising behavior when using an invalid leaf or
 115  * unimplemented leaf. If the requested leaf is within the valid basic or
 116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
 117  * set to zero. However, if you specify a leaf that is outside of a valid range,
 118  * then instead it will be filled with the last valid _basic_ leaf. For example,
 119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
 120  * an invalid extended leaf will return the information for leaf 3.
 121  *
 122  * Some leaves are broken down into sub-leaves. This means that the value
 123  * depends on both the leaf asked for in %eax and a secondary register. For
 124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
 125  * additional information. Or when getting topology information in leaf 0xb, the
 126  * initial value in %ecx changes which level of the topology that you are
 127  * getting information about.
 128  *
 129  * cpuid values are always kept to 32 bits regardless of whether or not the
 130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
 * 32 bits of the registers are always set to zero so that the values are the
 132  * same regardless of execution mode.
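 *
 * To make the register conventions above concrete, the following is a minimal,
 * hedged sketch of reading a leaf and sub-leaf using compiler inline assembly
 * (uint32_t comes from <sys/types.h> in the kernel). It is purely
 * illustrative: it is not the mechanism this file itself uses, and the name
 * cpuid_subleaf() is made up for the examples in this comment.
 *
 *        static void
 *        cpuid_subleaf(uint32_t leaf, uint32_t subleaf, uint32_t regs[4])
 *        {
 *                // The leaf goes in %eax; the sub-leaf (for leaves that have
 *                // them) goes in %ecx. Results come back in %eax, %ebx,
 *                // %ecx, and %edx.
 *                __asm__ __volatile__("cpuid"
 *                    : "=a" (regs[0]), "=b" (regs[1]),
 *                      "=c" (regs[2]), "=d" (regs[3])
 *                    : "a" (leaf), "c" (subleaf));
 *        }
 *
 *        // Example: leaf 0 returns the maximum valid basic leaf in %eax.
 *        uint32_t regs[4];
 *        cpuid_subleaf(0, 0, regs);
 *        uint32_t max_basic_leaf = regs[0];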
 133  *
 134  * ----------------------
 135  * Identifying Processors
 136  * ----------------------
 137  *
 * We can identify a processor in two steps. The first step looks at cpuid leaf
 * 0. Leaf 0 contains the processor's vendor information, which is returned as a
 * 12 character string spread across %ebx, %edx, and %ecx (in that order). On
 * AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
 142  *
 143  * From there, a processor is identified by a combination of three different
 144  * values:
 145  *
 146  *  1. Family
 147  *  2. Model
 148  *  3. Stepping
 149  *
 150  * Each vendor uses the family and model to uniquely identify a processor. The
 151  * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since the
 153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
 154  * identify the exact processor. Different models are often used for the client
 155  * (consumer) and server parts. Even though each processor often has major
 156  * architectural differences, they still are considered the same family by
 157  * Intel.
 158  *
 159  * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
 * given family, the model number is used to help identify specific processors.
 162  *
 163  * The stepping is used to refer to a revision of a specific microprocessor. The
 164  * term comes from equipment used to produce masks that are used to create
 165  * integrated circuits.
 166  *
 167  * The information is present in leaf 1, %eax. In technical documentation you
 168  * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. When the base family is
 * 0xf, the extended family, which occupies previously reserved bits, is added
 * to the base family to form the full family. Similarly, the extended model
 * supplies the upper four bits of the full model, allowing for a larger number
 * of models (Intel applies it when the base family is 0x6 or 0xf, AMD when it
 * is 0xf).
 173  *
 174  * When we process this information, we store the full family, model, and
 175  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
 176  * cpi_step, respectively. Whenever you are performing comparisons with the
 177  * family, model, and stepping, you should use these members and not the raw
 178  * values from cpuid. If you must use the raw values from cpuid directly, you
 179  * must make sure that you add the extended model and family to the base model
 180  * and family.
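 *
 * To make that arithmetic concrete, here is a hedged sketch of decoding leaf 1
 * %eax. The bit positions come from the vendors' manuals; the variable names
 * are illustrative and are not the ones used by this file:
 *
 *        // eax holds the value returned in %eax by cpuid leaf 1.
 *        uint32_t base_family = (eax >> 8) & 0xf;
 *        uint32_t base_model = (eax >> 4) & 0xf;
 *        uint32_t stepping = eax & 0xf;
 *        uint32_t ext_family = (eax >> 20) & 0xff;
 *        uint32_t ext_model = (eax >> 16) & 0xf;
 *
 *        uint32_t family = base_family;
 *        uint32_t model = base_model;
 *
 *        if (base_family == 0xf)
 *                family += ext_family;
 *        // Intel applies the extended model at base families 0x6 and 0xf;
 *        // AMD only at base family 0xf.
 *        if (base_family == 0x6 || base_family == 0xf)
 *                model |= ext_model << 4;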
 181  *
 182  * In general, we do not use information about the family, model, and stepping
 183  * to determine whether or not a feature is present; that is generally driven by
 184  * specific leaves. However, when something we care about on the processor is
 185  * not considered 'architectural' meaning that it is specific to a set of
 186  * processors and not promised in the architecture model to be consistent from
 187  * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata in
 189  * the processor, are dealing with processor-specific features such as CPU
 190  * performance counters, or we want to provide additional information for things
 191  * such as fault management.
 192  *
 * While processors also have a brand string, which is the name that people
 * are familiar with when buying the processor, it is not meant for
 * programmatic consumption. That is what the family, model, and stepping are
 * for.
 197  *
 198  * ------------
 199  * CPUID Passes
 200  * ------------
 201  *
 202  * As part of performing feature detection, we break this into several different
 203  * passes. The passes are as follows:
 204  *
 205  *      Pass 0          This is a primordial pass done in locore.s to deal with
 206  *                      Cyrix CPUs that don't support cpuid. The reality is that
 207  *                      we likely don't run on them any more, but there is still
 208  *                      logic for handling them.
 209  *
 210  *      Pass 1          This is the primary pass and is responsible for doing a
 211  *                      large number of different things:
 212  *
 *                      1. Determining which vendor manufactured the CPU and
 *                      the family, model, and stepping information.
 *
 *                      2. Gathering a large number of feature flags to
 *                      determine which features the CPU supports and which
 *                      indicate that we need to do other work in the OS to
 *                      enable them. Features detected this way are added to
 *                      the x86_featureset, which can be queried to determine
 *                      what we should do. This includes processing all of the
 *                      basic and extended CPU features that we care about.
 224  *
 225  *                      3. Determining the CPU's topology. This includes
 226  *                      information about how many cores and threads are present
 227  *                      in the package. It also is responsible for figuring out
 228  *                      which logical CPUs are potentially part of the same core
 229  *                      and what other resources they might share. For more
 230  *                      information see the 'Topology' section.
 231  *
 *                      4. Determining the set of CPU security-specific features
 *                      that we need to worry about and determining the
 *                      appropriate set of workarounds.
 235  *
 236  *                      Pass 1 on the boot CPU occurs before KMDB is started.
 237  *
 238  *      Pass 2          The second pass is done after startup(). Here, we check
 239  *                      other miscellaneous features. Most of this is gathering
 240  *                      additional basic and extended features that we'll use in
 241  *                      later passes or for debugging support.
 242  *
 *      Pass 3          The third pass occurs after the kernel memory allocator
 *                      has been fully initialized. This pass gathers
 *                      information for which we need dynamic memory to be
 *                      available. This includes several variable-width leaves
 *                      that contain cache information and the processor's
 *                      brand string.
 248  *
 249  *      Pass 4          The fourth and final normal pass is performed after the
 250  *                      kernel has brought most everything online. This is
 251  *                      invoked from post_startup(). In this pass, we go through
 252  *                      the set of features that we have enabled and turn that
 253  *                      into the hardware auxiliary vector features that
 254  *                      userland receives. This is used by userland, primarily
 255  *                      by the run-time link-editor (RTLD), though userland
 256  *                      software could also refer to it directly.
 257  *
 258  *      Microcode       After a microcode update, we do a selective rescan of
 259  *                      the cpuid leaves to determine what features have
 260  *                      changed. Microcode updates can provide more details
 261  *                      about security related features to deal with issues like
 262  *                      Spectre and L1TF. On occasion, vendors have violated
 263  *                      their contract and removed bits. However, we don't try
 264  *                      to detect that because that puts us in a situation that
 *                      we really can't deal with. As such, the only things we
 *                      rescan today are security-related features. See
 267  *                      cpuid_pass_ucode().
 268  *
 269  * All of the passes (except pass 0) are run on all CPUs. However, for the most
 270  * part we only care about what the boot CPU says about this information and use
 271  * the other CPUs as a rough guide to sanity check that we have the same feature
 272  * set.
 273  *
 * We do not support running multiple logical CPUs with different, let alone
 * disjoint, feature sets.
 276  *
 277  * ------------------
 278  * Processor Topology
 279  * ------------------
 280  *
 281  * One of the important things that we need to do is to understand the topology
 282  * of the underlying processor. When we say topology in this case, we're trying
 283  * to understand the relationship between the logical CPUs that the operating
 284  * system sees and the underlying physical layout. Different logical CPUs may
 285  * share different resources which can have important consequences for the
 286  * performance of the system. For example, they may share caches, execution
 287  * units, and more.
 288  *
 289  * The topology of the processor changes from generation to generation and
 290  * vendor to vendor.  Along with that, different vendors use different
 291  * terminology, and the operating system itself uses occasionally overlapping
 292  * terminology. It's important to understand what this topology looks like so
 293  * one can understand the different things that we try to calculate and
 294  * determine.
 295  *
 296  * To get started, let's talk about a little bit of terminology that we've used
 297  * so far, is used throughout this file, and is fairly generic across multiple
 298  * vendors:
 299  *
 300  * CPU
 301  *      A central processing unit (CPU) refers to a logical and/or virtual
 302  *      entity that the operating system can execute instructions on. The
 303  *      underlying resources for this CPU may be shared between multiple
 304  *      entities; however, to the operating system it is a discrete unit.
 305  *
 306  * PROCESSOR and PACKAGE
 307  *
 308  *      Generally, when we use the term 'processor' on its own, we are referring
 309  *      to the physical entity that one buys and plugs into a board. However,
 310  *      because processor has been overloaded and one might see it used to mean
 311  *      multiple different levels, we will instead use the term 'package' for
 312  *      the rest of this file. The term package comes from the electrical
 313  *      engineering side and refers to the physical entity that encloses the
 314  *      electronics inside. Strictly speaking the package can contain more than
 315  *      just the CPU, for example, on many processors it may also have what's
 *      called an 'integrated graphics processing unit (GPU)'. Because the
 317  *      package can encapsulate multiple units, it is the largest physical unit
 318  *      that we refer to.
 319  *
 320  * SOCKET
 321  *
 *      A socket refers to a unit on a system board (generally the motherboard)
 *      that can receive a package. A single package, or processor, is plugged
 *      into a single socket. A system may have multiple sockets. Oftentimes,
 *      the term socket is used interchangeably with package and refers to the
 *      electrical component that has been plugged in, and not the receptacle
 *      itself.
 327  *
 328  * CORE
 329  *
 330  *      A core refers to the physical instantiation of a CPU, generally, with a
 331  *      full set of hardware resources available to it. A package may contain
 332  *      multiple cores inside of it or it may just have a single one. A
 333  *      processor with more than one core is often referred to as 'multi-core'.
 334  *      In illumos, we will use the feature X86FSET_CMP to refer to a system
 335  *      that has 'multi-core' processors.
 336  *
 337  *      A core may expose a single logical CPU to the operating system, or it
 338  *      may expose multiple CPUs, which we call threads, defined below.
 339  *
 340  *      Some resources may still be shared by cores in the same package. For
 341  *      example, many processors will share the level 3 cache between cores.
 342  *      Some AMD generations share hardware resources between cores. For more
 343  *      information on that see the section 'AMD Topology'.
 344  *
 345  * THREAD and STRAND
 346  *
 *      In this file, generally a thread refers to a hardware resource and not
 348  *      the operating system's logical abstraction. A thread is always exposed
 349  *      as an independent logical CPU to the operating system. A thread belongs
 350  *      to a specific core. A core may have more than one thread. When that is
 351  *      the case, the threads that are part of the same core are often referred
 352  *      to as 'siblings'.
 353  *
 354  *      When multiple threads exist, this is generally referred to as
 355  *      simultaneous multi-threading (SMT). When Intel introduced this in their
 356  *      processors they called it hyper-threading (HT). When multiple threads
 357  *      are active in a core, they split the resources of the core. For example,
 358  *      two threads may share the same set of hardware execution units.
 359  *
 360  *      The operating system often uses the term 'strand' to refer to a thread.
 361  *      This helps disambiguate it from the software concept.
 362  *
 363  * CHIP
 364  *
 *      Unfortunately, the term 'chip' is dramatically overloaded. At its most
 *      basic, it is used to refer to a single integrated circuit, which
 367  *      may or may not be the only thing in the package. In illumos, when you
 368  *      see the term 'chip' it is almost always referring to the same thing as
 369  *      the 'package'. However, many vendors may use chip to refer to one of
 370  *      many integrated circuits that have been placed in the package. As an
 371  *      example, see the subsequent definition.
 372  *
 373  *      To try and keep things consistent, we will only use chip when referring
 374  *      to the entire integrated circuit package, with the exception of the
 375  *      definition of multi-chip module (because it is in the name) and use the
 376  *      term 'die' when we want the more general, potential sub-component
 377  *      definition.
 378  *
 379  * DIE
 380  *
 381  *      A die refers to an integrated circuit. Inside of the package there may
 382  *      be a single die or multiple dies. This is sometimes called a 'chip' in
 383  *      vendor's parlance, but in this file, we use the term die to refer to a
 384  *      subcomponent.
 385  *
 386  * MULTI-CHIP MODULE
 387  *
 388  *      A multi-chip module (MCM) refers to putting multiple distinct chips that
 389  *      are connected together in the same package. When a multi-chip design is
 390  *      used, generally each chip is manufactured independently and then joined
 391  *      together in the package. For example, on AMD's Zen microarchitecture
 392  *      (family 0x17), the package contains several dies (the second meaning of
 393  *      chip from above) that are connected together.
 394  *
 395  * CACHE
 396  *
 397  *      A cache is a part of the processor that maintains copies of recently
 398  *      accessed memory. Caches are split into levels and then into types.
 399  *      Commonly there are one to three levels, called level one, two, and
 400  *      three. The lower the level, the smaller it is, the closer it is to the
 401  *      execution units of the CPU, and the faster it is to access. The layout
 402  *      and design of the cache come in many different flavors, consult other
 403  *      resources for a discussion of those.
 404  *
 405  *      Caches are generally split into two types, the instruction and data
 406  *      cache. The caches contain what their names suggest, the instruction
 407  *      cache has executable program text, while the data cache has all other
 408  *      memory that the processor accesses. As of this writing, data is kept
 409  *      coherent between all of the caches on x86, so if one modifies program
 410  *      text before it is executed, that will be in the data cache, and the
 411  *      instruction cache will be synchronized with that change when the
 412  *      processor actually executes those instructions. This coherency also
 413  *      covers the fact that data could show up in multiple caches.
 414  *
 415  *      Generally, the lowest level caches are specific to a core. However, the
 *      last-level cache is shared between some number of cores. The number of
 *      CPUs sharing this last-level cache is important. This has implications
 418  *      for the choices that the scheduler makes, as accessing memory that might
 419  *      be in a remote cache after thread migration can be quite expensive.
 420  *
 421  *      Sometimes, the word cache is abbreviated with a '$', because in US
 422  *      English the word cache is pronounced the same as cash. So L1D$ refers to
 423  *      the L1 data cache, and L2$ would be the L2 cache. This will not be used
 424  *      in the rest of this theory statement for clarity.
 425  *
 426  * MEMORY CONTROLLER
 427  *
 428  *      The memory controller is a component that provides access to DRAM. Each
 429  *      memory controller can access a set number of DRAM channels. Each channel
 430  *      can have a number of DIMMs (sticks of memory) associated with it. A
 431  *      given package may have more than one memory controller. The association
 432  *      of the memory controller to a group of cores is important as it is
 433  *      cheaper to access memory on the controller that you are associated with.
 434  *
 435  * NUMA
 436  *
 *      NUMA, or non-uniform memory access, describes a way that systems are
 *      built. On x86, any processor core can address all of the memory in the
 *      system. However, when using multiple sockets or possibly within a
 440  *      multi-chip module, some of that memory is physically closer and some of
 441  *      it is further. Memory that is further away is more expensive to access.
 442  *      Consider the following image of multiple sockets with memory:
 443  *
 444  *      +--------+                                                +--------+
 445  *      | DIMM A |         +----------+      +----------+         | DIMM D |
 446  *      +--------+-+       |          |      |          |       +-+------+-+
 447  *        | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
 448  *        +--------+-+     |          |      |          |     +-+------+-+
 449  *          | DIMM C |     +----------+      +----------+     | DIMM F |
 450  *          +--------+                                        +--------+
 451  *
 452  *      In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
 453  *      closer to DIMMs D-F. This means that it is cheaper for socket 0 to
 454  *      access DIMMs A-C and more expensive to access D-F as it has to go
 455  *      through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
 456  *      D-F are cheaper than A-C. While the socket form is the most common, when
 457  *      using multi-chip modules, this can also sometimes occur. For another
 458  *      example of this that's more involved, see the AMD topology section.
 459  *
 460  *
 461  * Intel Topology
 462  * --------------
 463  *
 * Most Intel processors since Nehalem (as of this writing the current
 * generation is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU
 * portion of the package is a single monolithic die. MCMs currently aren't
 * used. Most parts have three levels of caches, with the L3 cache being shared
 * between all of the cores on the package. The L1/L2 caches are generally
 * specific to an individual core. The following image shows at a simplified
 * level what this looks like. The memory controller is commonly part of
 * something called the 'Uncore'; this functionality used to live in separate
 * physical chips that were not a part of the package, but is now part of the
 * same chip.
 473  *
 474  *  +-----------------------------------------------------------------------+
 475  *  | Package                                                               |
 476  *  |  +-------------------+  +-------------------+  +-------------------+  |
 477  *  |  | Core              |  | Core              |  | Core              |  |
 478  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 479  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
 480  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
 481  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
 482  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
 483  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 484  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 485  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
 486  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 487  *  |  +-------------------+  +-------------------+  +-------------------+  |
 488  *  | +-------------------------------------------------------------------+ |
 489  *  | |                         Shared L3 Cache                           | |
 490  *  | +-------------------------------------------------------------------+ |
 491  *  | +-------------------------------------------------------------------+ |
 492  *  | |                        Memory Controller                          | |
 493  *  | +-------------------------------------------------------------------+ |
 494  *  +-----------------------------------------------------------------------+
 495  *
 496  * A side effect of this current architecture is that what we care about from a
 * scheduling and topology perspective is simplified. In general we care about
 498  * understanding which logical CPUs are part of the same core and socket.
 499  *
 500  * To determine the relationship between threads and cores, Intel initially used
 501  * the identifier in the advanced programmable interrupt controller (APIC). They
 502  * also added cpuid leaf 4 to give additional information about the number of
 503  * threads and CPUs in the processor. With the addition of x2apic (which
 504  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
 505  * additional cpuid topology leaf 0xB was added.
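 *
 * As a hedged illustration of how leaf 0xB can be used (reusing the
 * cpuid_subleaf() sketch from earlier; the field layout below comes from the
 * architecture manuals and the variable names are made up):
 *
 *        uint32_t regs[4];
 *
 *        // Sub-leaf 0 describes the SMT level: %eax[4:0] is the number of
 *        // bits of the x2APIC ID (returned in %edx) that identify the thread
 *        // within its core.
 *        cpuid_subleaf(0xb, 0, regs);
 *        uint32_t smt_shift = regs[0] & 0x1f;
 *        uint32_t x2apic_id = regs[3];
 *
 *        // Sub-leaf 1 describes the core level: %eax[4:0] is the number of
 *        // bits that identify the thread within its package.
 *        cpuid_subleaf(0xb, 1, regs);
 *        uint32_t core_shift = regs[0] & 0x1f;
 *
 *        // Threads with equal values here share a core / package
 *        // respectively.
 *        uint32_t core_id = x2apic_id >> smt_shift;
 *        uint32_t pkg_id = x2apic_id >> core_shift;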
 506  *
 507  * AMD Topology
 508  * ------------
 509  *
 510  * When discussing AMD topology, we want to break this into three distinct
 511  * generations of topology. There's the basic topology that has been used in
 512  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
 513  * with family 0x15 (Bulldozer), and there's the topology that was introduced
 514  * with family 0x17 (Zen). AMD also has some additional terminology that's worth
 515  * talking about.
 516  *
 517  * Until the introduction of family 0x17 (Zen), AMD did not implement something
 518  * that they considered SMT. Whether or not the AMD processors have SMT
 519  * influences many things including scheduling and reliability, availability,
 520  * and serviceability (RAS) features.
 521  *
 522  * NODE
 523  *
 524  *      AMD uses the term node to refer to a die that contains a number of cores
 525  *      and I/O resources. Depending on the processor family and model, more
 526  *      than one node can be present in the package. When there is more than one
 527  *      node this indicates a multi-chip module. Usually each node has its own
 528  *      access to memory and I/O devices. This is important and generally
 529  *      different from the corresponding Intel Nehalem-Skylake+ processors. As a
 530  *      result, we track this relationship in the operating system.
 531  *
 532  *      In processors with an L3 cache, the L3 cache is generally shared across
 533  *      the entire node, though the way this is carved up varies from generation
 534  *      to generation.
 535  *
 536  * BULLDOZER
 537  *
 538  *      Starting with the Bulldozer family (0x15) and continuing until the
 539  *      introduction of the Zen microarchitecture, AMD introduced the idea of a
 540  *      compute unit. In a compute unit, two traditional cores share a number of
 541  *      hardware resources. Critically, they share the FPU, L1 instruction
 542  *      cache, and the L2 cache. Several compute units were then combined inside
 543  *      of a single node.  Because the integer execution units, L1 data cache,
 544  *      and some other resources were not shared between the cores, AMD never
 545  *      considered this to be SMT.
 546  *
 547  * ZEN
 548  *
 *      The Zen family (0x17) uses a multi-chip module (MCM) design; each die in
 *      the module is called a Zeppelin. These dies are similar to the idea of
 *      nodes used previously. Each of these nodes has two DRAM channels which
 *      all of the cores in the node can access uniformly. These nodes are
 *      linked together
 553  *      in the package, creating a NUMA environment.
 554  *
 555  *      The Zeppelin die itself contains two different 'core complexes'. Each
 556  *      core complex consists of four cores which each have two threads, for a
 557  *      total of 8 logical CPUs per complex. Unlike other generations,
 558  *      where all the logical CPUs in a given node share the L3 cache, here each
 559  *      core complex has its own shared L3 cache.
 560  *
 561  *      A further thing that we need to consider is that in some configurations,
 562  *      particularly with the Threadripper line of processors, not every die
 563  *      actually has its memory controllers wired up to actual memory channels.
 564  *      This means that some cores have memory attached to them and others
 565  *      don't.
 566  *
 567  *      To put Zen in perspective, consider the following images:
 568  *
 569  *      +--------------------------------------------------------+
 570  *      | Core Complex                                           |
 571  *      | +-------------------+    +-------------------+  +---+  |
 572  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
 573  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
 574  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
 575  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
 576  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
 577  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 578  *      | +-------------------+    +-------------------+  | C |  |
 579  *      | +-------------------+    +-------------------+  | a |  |
 580  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
 581  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
 582  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
 583  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
 584  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
 585  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 586  *      | +-------------------+    +-------------------+  +---+  |
 587  *      |                                                        |
 588  *      +--------------------------------------------------------+
 589  *
 590  *  This first image represents a single Zen core complex that consists of four
 591  *  cores.
 592  *
 593  *
 594  *      +--------------------------------------------------------+
 595  *      | Zeppelin Die                                           |
 596  *      |  +--------------------------------------------------+  |
 597  *      |  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
 598  *      |  +--------------------------------------------------+  |
 599  *      |                           HH                           |
 600  *      |          +-----------+    HH    +-----------+          |
 601  *      |          |           |    HH    |           |          |
 602  *      |          |    Core   |==========|    Core   |          |
 603  *      |          |  Complex  |==========|  Complex  |          |
 604  *      |          |           |    HH    |           |          |
 605  *      |          +-----------+    HH    +-----------+          |
 606  *      |                           HH                           |
 607  *      |  +--------------------------------------------------+  |
 608  *      |  |                Memory Controller                 |  |
 609  *      |  +--------------------------------------------------+  |
 610  *      |                                                        |
 611  *      +--------------------------------------------------------+
 612  *
 *  This image represents a single Zeppelin Die. Note how both core complexes
 *  are connected to the same memory controller and I/O units. While each core
 615  *  complex has its own L3 cache as seen in the first image, they both have
 616  *  uniform access to memory.
 617  *
 618  *
 619  *                      PP                     PP
 620  *                      PP                     PP
 621  *           +----------PP---------------------PP---------+
 622  *           |          PP                     PP         |
 623  *           |    +-----------+          +-----------+    |
 624  *           |    |           |          |           |    |
 625  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 626  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 627  *           |    |           |          |           |    |
 628  *           |    +-----------+ooo    ...+-----------+    |
 629  *           |          HH      ooo  ...       HH         |
 630  *           |          HH        oo..         HH         |
 631  *           |          HH        ..oo         HH         |
 632  *           |          HH      ...  ooo       HH         |
 633  *           |    +-----------+...    ooo+-----------+    |
 634  *           |    |           |          |           |    |
 635  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 636  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 637  *           |    |           |          |           |    |
 638  *           |    +-----------+          +-----------+    |
 639  *           |          PP                     PP         |
 640  *           +----------PP---------------------PP---------+
 641  *                      PP                     PP
 642  *                      PP                     PP
 643  *
 644  *  This image represents a single Zen package. In this example, it has four
 645  *  Zeppelin dies, though some configurations only have a single one. In this
 646  *  example, each die is directly connected to the next. Also, each die is
 647  *  represented as being connected to memory by the 'M' character and connected
 648  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
 649  *  die is made up of two core complexes, we have multiple different NUMA
 650  *  domains that we care about for these systems.
 651  *
 652  * CPUID LEAVES
 653  *
 654  * There are a few different CPUID leaves that we can use to try and understand
 655  * the actual state of the world. As part of the introduction of family 0xf, AMD
 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
 * processors that are in the package. Because families before Zen didn't have
 * SMT, this was always the number of cores that were in the package. However,
 * it should always be thought of as the number of logical threads to be
 * consistent between generations. In addition, we also get the size of the
 * APIC ID that is used to represent the number of logical processors. This is
 * important for deriving topology information.
 663  *
 664  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
 665  * bit between Bulldozer and later families, but it is quite useful in
 666  * determining the topology information. Because this information has changed
 667  * across family generations, it's worth calling out what these mean
 668  * explicitly. The registers have the following meanings:
 669  *
 670  *      %eax    The APIC ID. The entire register is defined to have a 32-bit
 671  *              APIC ID, even though on systems without x2apic support, it will
 672  *              be limited to 8 bits.
 673  *
 674  *      %ebx    On Bulldozer-era systems this contains information about the
 675  *              number of cores that are in a compute unit (cores that share
 676  *              resources). It also contains a per-package compute unit ID that
 677  *              identifies which compute unit the logical CPU is a part of.
 678  *
 679  *              On Zen-era systems this instead contains the number of threads
 680  *              per core and the ID of the core that the logical CPU is a part
 681  *              of. Note, this ID is unique only to the package, it is not
 682  *              globally unique across the entire system.
 683  *
 684  *      %ecx    This contains the number of nodes that exist in the package. It
 685  *              also contains an ID that identifies which node the logical CPU
 686  *              is a part of.
 687  *
 * Finally, we also use cpuid leaf 0x8000001D to gather information about the
 * cache layout and determine which logical CPUs are sharing which caches.
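 *
 * As a hedged example of how the leaf 0x8000001E registers described above
 * decode on a Zen-era (family 0x17) part (bit positions per AMD's
 * documentation and worth double-checking against the relevant PPR; the
 * variable names are illustrative):
 *
 *        uint32_t regs[4];
 *
 *        cpuid_subleaf(0x8000001e, 0, regs);
 *        uint32_t apic_id = regs[0];
 *
 *        // %ebx[7:0] is the core ID within the package; %ebx[15:8] is the
 *        // number of threads per core minus one.
 *        uint32_t core_id = regs[1] & 0xff;
 *        uint32_t threads_per_core = ((regs[1] >> 8) & 0xff) + 1;
 *
 *        // %ecx[7:0] is the node ID; %ecx[10:8] is the number of nodes in
 *        // the package minus one.
 *        uint32_t node_id = regs[2] & 0xff;
 *        uint32_t nodes_per_pkg = ((regs[2] >> 8) & 0x7) + 1;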
 690  *
 691  * illumos Topology
 692  * ----------------
 693  *
 694  * Based on the above we synthesize the information into several different
 695  * variables that we store in the 'struct cpuid_info'. We'll go into the details
 696  * of what each member is supposed to represent and their uniqueness. In
 697  * general, there are two levels of uniqueness that we care about. We care about
 698  * an ID that is globally unique. That means that it will be unique across all
 699  * entities in the system. For example, the default logical CPU ID is globally
 700  * unique. On the other hand, there is some information that we only care about
 701  * being unique within the context of a single package / socket. Here are the
 702  * variables that we keep track of and their meaning.
 703  *
 * Several of the values that serve as identifiers, with the exception of
 * cpi_apicid, are allowed to be synthetic.
 706  *
 707  *
 708  * cpi_apicid
 709  *
 710  *      This is the value of the CPU's APIC id. This should be the full 32-bit
 711  *      ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
 712  *      APIC ID. This value is globally unique between all logical CPUs across
 713  *      all packages. This is usually required by the APIC.
 714  *
 715  * cpi_chipid
 716  *
 717  *      This value indicates the ID of the package that the logical CPU is a
 718  *      part of. This value is allowed to be synthetic. It is usually derived by
 719  *      taking the CPU's APIC ID and determining how many bits are used to
 720  *      represent CPU cores in the package. All logical CPUs that are part of
 721  *      the same package must have the same value.
 722  *
 723  * cpi_coreid
 724  *
 725  *      This represents the ID of a CPU core. Two logical CPUs should only have
 726  *      the same cpi_coreid value if they are part of the same core. These
 727  *      values may be synthetic. On systems that support SMT, this value is
 728  *      usually derived from the APIC ID, otherwise it is often synthetic and
 729  *      just set to the value of the cpu_id in the cpu_t.
 730  *
 731  * cpi_pkgcoreid
 732  *
 733  *      This is similar to the cpi_coreid in that logical CPUs that are part of
 734  *      the same core should have the same ID. The main difference is that these
 735  *      values are only required to be unique to a given socket.
 736  *
 737  * cpi_clogid
 738  *
 *      This represents the logical ID of a logical CPU. This value should be
 *      unique within a given socket for each logical CPU. This is allowed to be
 *      synthetic, though it is usually based off of the CPU's APIC ID. The
 *      broader system expects that logical CPUs that are part of the same core
 *      have contiguous numbers. For example, if there were two threads per
 *      core, then the two IDs divided by two should be equal, with the first ID
 *      even and the second odd. Thus IDs 4 and 5 indicate two logical CPUs that
 *      are part of the same core, while IDs 5 and 6 represent two logical CPUs
 *      that are part of different cores. (A small sketch of this grouping
 *      appears at the end of this section.)
 748  *
 749  *      While it is common for the cpi_coreid and the cpi_clogid to be derived
 750  *      from the same source, strictly speaking, they don't have to be and the
 751  *      two values should be considered logically independent. One should not
 752  *      try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
 753  *      some kind of relationship. While this is tempting, we've seen cases on
 754  *      AMD family 0xf where the system's cpu id is not related to its APIC ID.
 755  *
 756  * cpi_ncpu_per_chip
 757  *
 758  *      This value indicates the total number of logical CPUs that exist in the
 759  *      physical package. Critically, this is not the number of logical CPUs
 760  *      that exist for just the single core.
 761  *
 762  *      This value should be the same for all logical CPUs in the same package.
 763  *
 764  * cpi_ncore_per_chip
 765  *
 766  *      This value indicates the total number of physical CPU cores that exist
 767  *      in the package. The system compares this value with cpi_ncpu_per_chip to
 768  *      determine if simultaneous multi-threading (SMT) is enabled. When
 769  *      cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
 770  *      the X86FSET_HTT feature is not set. If this value is greater than one,
 *      then we consider the processor to have the feature X86FSET_CMP, to
 772  *      indicate that there is support for more than one core.
 773  *
 774  *      This value should be the same for all logical CPUs in the same package.
 775  *
 776  * cpi_procnodes_per_pkg
 777  *
 778  *      This value indicates the number of 'nodes' that exist in the package.
 779  *      When processors are actually a multi-chip module, this represents the
 780  *      number of such modules that exist in the package. Currently, on Intel
 781  *      based systems this member is always set to 1.
 782  *
 783  *      This value should be the same for all logical CPUs in the same package.
 784  *
 785  * cpi_procnodeid
 786  *
 787  *      This value indicates the ID of the node that the logical CPU is a part
 788  *      of. All logical CPUs that are in the same node must have the same value
 789  *      here. This value must be unique across all of the packages in the
 790  *      system.  On Intel based systems, this is currently set to the value in
 791  *      cpi_chipid because there is only one node.
 792  *
 793  * cpi_cores_per_compunit
 794  *
 795  *      This value indicates the number of cores that are part of a compute
 796  *      unit. See the AMD topology section for this. This member only has real
 797  *      meaning currently for AMD Bulldozer family processors. For all other
 798  *      processors, this should currently be set to 1.
 799  *
 800  * cpi_compunitid
 801  *
 802  *      This indicates the compute unit that the logical CPU belongs to. For
 803  *      processors without AMD Bulldozer-style compute units this should be set
 804  *      to the value of cpi_coreid.
 805  *
 806  * cpi_ncpu_shr_last_cache
 807  *
 808  *      This indicates the number of logical CPUs that are sharing the same last
 809  *      level cache. This value should be the same for all CPUs that are sharing
 810  *      that cache. The last cache refers to the cache that is closest to memory
 811  *      and furthest away from the CPU.
 812  *
 813  * cpi_last_lvl_cacheid
 814  *
 815  *      This indicates the ID of the last cache that the logical CPU uses. This
 816  *      cache is often shared between multiple logical CPUs and is the cache
 817  *      that is closest to memory and furthest away from the CPU. This value
 818  *      should be the same for a group of logical CPUs only if they actually
 819  *      share the same last level cache. IDs should not overlap between
 820  *      packages.
 821  *
 822  * cpi_ncore_bits
 823  *
 824  *      This indicates the number of bits that are required to represent all of
 825  *      the cores in the system. As cores are derived based on their APIC IDs,
 826  *      we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
 827  *      this value to be larger than the actual number of IDs that are present
 828  *      in the system. This is used to size tables by the CMI framework. It is
 829  *      only filled in for Intel and AMD CPUs.
 830  *
 831  * cpi_nthread_bits
 832  *
 833  *      This indicates the number of bits required to represent all of the IDs
 834  *      that cover the logical CPUs that exist on a given core. It's OK for this
 835  *      value to be larger than the actual number of IDs that are present in the
 836  *      system.  This is used to size tables by the CMI framework. It is
 837  *      only filled in for Intel and AMD CPUs.
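 *
 * The following is the small sketch referenced in the cpi_clogid description
 * above. It is illustrative only; it simply shows how contiguous logical IDs
 * group siblings of the same core when each core has nthreads_per_core
 * threads:
 *
 *        static boolean_t
 *        same_core(uint32_t clogid_a, uint32_t clogid_b,
 *            uint32_t nthreads_per_core)
 *        {
 *                return ((clogid_a / nthreads_per_core ==
 *                    clogid_b / nthreads_per_core) ? B_TRUE : B_FALSE);
 *        }
 *
 *        // With two threads per core: same_core(4, 5, 2) returns B_TRUE
 *        // while same_core(5, 6, 2) returns B_FALSE.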
 838  *
 839  * -----------
 840  * Hypervisors
 841  * -----------
 842  *
 843  * If trying to manage the differences between vendors wasn't bad enough, it can
 844  * get worse thanks to our friend hardware virtualization. Hypervisors are given
 845  * the ability to interpose on all cpuid instructions and change them to suit
 846  * their purposes. In general, this is necessary as the hypervisor wants to be
 847  * able to present a more uniform set of features or not necessarily give the
 848  * guest operating system kernel knowledge of all features so it can be
 849  * more easily migrated between systems.
 850  *
 851  * When it comes to trying to determine topology information, this can be a
 852  * double edged sword. When a hypervisor doesn't actually implement a cpuid
 * leaf, it'll often return all zeros. Because of that, you'll often see various
 * checks scattered about that verify fields are non-zero before we assume we
 * can use them.
 856  *
 * When it comes to topology information, the hypervisor is often incentivized
 * to lie to you about topology. This is because it doesn't always actually
 * guarantee that topology at all. The topology path we take in the system
 * depends on how the CPU advertises itself. If it advertises itself as an Intel
 * or AMD CPU, then we basically do our normal path. However, when the
 * hypervisor doesn't advertise an actual vendor, we usually end up enumerating
 * multiple one-core CPUs that appear to be on different sockets. The actual
 * behavior depends greatly on what the hypervisor actually exposes to us.
 865  *
 866  * --------------------
 867  * Exposing Information
 868  * --------------------
 869  *
 870  * We expose CPUID information in three different forms in the system.
 871  *
 872  * The first is through the x86_featureset variable. This is used in conjunction
 873  * with the is_x86_feature() function. This is queried by x86-specific functions
 874  * to determine which features are or aren't present in the system and to make
 875  * decisions based upon them. For example, users of this include everything from
 876  * parts of the system dedicated to reliability, availability, and
 877  * serviceability (RAS), to making decisions about how to handle security
 878  * mitigations, to various x86-specific drivers. General purpose or
 879  * architecture independent drivers should never be calling this function.
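 *
 * As a hedged illustration, an x86-specific kernel consumer might gate a code
 * path on a feature bit roughly like this (X86FSET_AVX is one of the feature
 * constants; the surrounding variable is hypothetical):
 *
 *        boolean_t use_avx = B_FALSE;
 *
 *        if (is_x86_feature(x86_featureset, X86FSET_AVX))
 *                use_avx = B_TRUE;    // safe to take an AVX-assisted path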
 880  *
 881  * The second means is through the auxiliary vector. The auxiliary vector is a
 882  * series of tagged data that the kernel passes down to a user program when it
 883  * begins executing. This information is used to indicate to programs what
 884  * instruction set extensions are present. For example, information about the
 885  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
 886  * since user programs cannot make use of it. However, things like the AVX
 887  * instruction sets are. Programs use this information to make run-time
 888  * decisions about what features they should use. As an example, the run-time
 889  * link-editor (rtld) can relocate different functions depending on the hardware
 890  * support available.
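 *
 * From userland, this information is typically consumed through getisax(2),
 * which returns the AV_386_* hardware capability bits derived from the
 * auxiliary vector. A hedged sketch (the surrounding variable is
 * hypothetical):
 *
 *        #include <sys/auxv.h>        // getisax() and, on x86, AV_386_*
 *
 *        uint32_t hwcap[2] = { 0 };
 *        int have_avx = 0;
 *
 *        (void) getisax(hwcap, 2);
 *        if (hwcap[0] & AV_386_AVX)
 *                have_avx = 1;        // hardware and kernel advertise AVX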
 891  *
 892  * The final form is through a series of accessor functions that all have the
 893  * form cpuid_get*. This is used by a number of different subsystems in the
 894  * kernel to determine more detailed information about what we're running on,
 895  * topology information, etc. Some of these subsystems include processor groups
 896  * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
 897  * microcode, and performance monitoring. These functions all ASSERT that the
 898  * CPU they're being called on has reached a certain cpuid pass. If the passes
 899  * are rearranged, then this needs to be adjusted.
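 *
 * For example, a kernel subsystem might use these accessors roughly as follows
 * (a hedged sketch; the specific check is hypothetical):
 *
 *        cpu_t *cp = CPU;
 *
 *        if (cpuid_getvendor(cp) == X86_VENDOR_AMD &&
 *            cpuid_getfamily(cp) == 0x17) {
 *                // Zen-specific handling would go here.
 *        }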
 900  *
 901  * -----------------------------------------------
 902  * Speculative Execution CPU Side Channel Security
 903  * -----------------------------------------------
 904  *
 905  * With the advent of the Spectre and Meltdown attacks which exploit speculative
 906  * execution in the CPU to create side channels there have been a number of
 907  * different attacks and corresponding issues that the operating system needs to
 * mitigate against. The following is a common, but not exhaustive, set of
 * issues that we know about and for which we have done some work, or still
 * need to do more work, in the system to mitigate:
 911  *
 912  *   - Spectre v1
 913  *   - swapgs (Spectre v1 variant)
 914  *   - Spectre v2
 915  *   - Meltdown (Spectre v3)
 916  *   - Rogue Register Read (Spectre v3a)
 917  *   - Speculative Store Bypass (Spectre v4)
 918  *   - ret2spec, SpectreRSB
 919  *   - L1 Terminal Fault (L1TF)
 920  *   - Microarchitectural Data Sampling (MDS)
 921  *
 922  * Each of these requires different sets of mitigations and has different attack
 923  * surfaces. For the most part, this discussion is about protecting the kernel
 924  * from non-kernel executing environments such as user processes and hardware
 925  * virtual machines. Unfortunately, there are a number of user vs. user
 926  * scenarios that exist with these. The rest of this section will describe the
 927  * overall approach that the system has taken to address these as well as their
 928  * shortcomings. Unfortunately, not all of the above have been handled today.
 929  *
 930  * SPECTRE v2, ret2spec, SpectreRSB
 931  *
 932  * The second variant of the spectre attack focuses on performing branch target
 933  * injection. This generally impacts indirect call instructions in the system.
 934  * There are three different ways to mitigate this issue that are commonly
 935  * described today:
 936  *
 937  *  1. Using Indirect Branch Restricted Speculation (IBRS).
 938  *  2. Using Retpolines and RSB Stuffing
 939  *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
 940  *
 941  * IBRS uses a feature added to microcode to restrict speculation, among other
 942  * things. This form of mitigation has not been used as it has been generally
 943  * seen as too expensive and requires reactivation upon various transitions in
 944  * the system.
 945  *
 946  * As a less impactful alternative to IBRS, retpolines were developed by
 947  * Google. These basically require one to replace indirect calls with a specific
 948  * trampoline that will cause speculation to fail and break the attack.
 949  * Retpolines require compiler support. We always build with retpolines in the
 950  * external thunk mode. This means that a traditional indirect call is replaced
 951  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
 952  * of this is that all indirect function calls are performed through a register.
 953  *
 954  * We have to use a common external location of the thunk and not inline it into
 * the callsite so that we have a single place to patch these functions.
 956  * As it turns out, we actually have three different forms of retpolines that
 957  * exist in the system:
 958  *
 959  *  1. A full retpoline
 960  *  2. An AMD-specific optimized retpoline
 961  *  3. A no-op version
 962  *
 963  * The first one is used in the general case. The second one is used if we can
 964  * determine that we're on an AMD system and we can successfully toggle the
 965  * lfence serializing MSR that exists on the platform. Basically with this
 966  * present, an lfence is sufficient and we don't need to do anywhere near as
 967  * complicated a dance to successfully use retpolines.
 968  *
 969  * The third form described above is the most curious. It turns out that the way
 970  * that retpolines are implemented is that they rely on how speculation is
 971  * performed on a 'ret' instruction. Intel has continued to optimize this
 972  * process (which is partly why we need to have return stack buffer stuffing,
 973  * but more on that in a bit) and in processors starting with Cascade Lake
 974  * on the server side, it's dangerous to rely on retpolines. Instead, a new
 975  * mechanism has been introduced called Enhanced IBRS (EIBRS).
 976  *
 977  * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
 978  * physical core. However, if this is the case, we don't want to use retpolines
 979  * any more. Therefore if EIBRS is present, we end up turning each retpoline
 980  * function (called a thunk) into a jmp instruction. This means that we're still
 981  * paying the cost of an extra jump to the external thunk, but it gives us
 982  * flexibility and the ability to have a single kernel image that works across a
 983  * wide variety of systems and hardware features.
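      *
      * As a rough, illustrative sketch (not a verbatim copy of the kernel's
      * thunks), the full retpoline form of a thunk such as
      * __x86_indirect_thunk_rax looks approximately like:
      *
      *              call    2f
      *      1:      pause
      *              lfence
      *              jmp     1b
      *      2:      movq    %rax, (%rsp)
      *              ret
      *
      * The AMD-optimized form reduces to an lfence followed by a jmp *%rax, and
      * the EIBRS form described above is simply patched down to a bare jmp *%rax.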
 984  *
 985  * Unfortunately, this alone is insufficient. First, Skylake systems have
 986  * additional speculation for the Return Stack Buffer (RSB) which is used to
 987  * return from call instructions which retpolines take advantage of. However,
 988  * this problem is not just limited to Skylake and is actually more pernicious.
 989  * The SpectreRSB paper introduces several more problems that can arise with
 990  * dealing with this. The RSB can be poisoned just like the indirect branch
 991  * predictor. This means that one needs to clear the RSB when transitioning
 992  * between two different privilege domains. Some examples include:
 993  *
 994  *  - Switching between two different user processes
 995  *  - Going between user land and the kernel
 996  *  - Returning to the kernel from a hardware virtual machine
 997  *
 998  * Mitigating this involves combining a couple of different things. The first is
 999  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1000  * Bridge. When an RSB entry refers to a user address and we're executing in the
1001  * kernel, speculation through it will be stopped when SMEP is enabled. This
1002  * protects against a number of the different cases that we would normally be
1003  * worried about such as when we enter the kernel from user land.
1004  *
1005  * To protect against additional manipulation of the RSB from other contexts,
1006  * such as a non-root VMX context attacking the kernel, we first look to
1007  * enhanced IBRS. When EIBRS is present and enabled, there is nothing else that
1008  * we need to do to protect the kernel at this time.
1009  *
1010  * On CPUs without EIBRS we need to manually overwrite the contents of the
1011  * return stack buffer. We do this through the x86_rsb_stuff() function.
1012  * Currently this is employed on context switch. The x86_rsb_stuff() function is
1013  * disabled when enhanced IBRS is present because Intel claims on such systems
1014  * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1015  * to user attacks via the RSB.
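      *
      * As an illustrative sketch (the exact depth and layout used by
      * x86_rsb_stuff() may differ), RSB stuffing is a series of calls whose only
      * purpose is to push benign entries onto the RSB, each paired with a
      * speculation trap that is never architecturally executed, followed by a
      * stack pointer fixup to discard the return addresses the calls pushed:
      *
      *              call    1f
      *              pause; lfence; jmp .
      *      1:      call    2f
      *              pause; lfence; jmp .
      *      2:      ...
      *              (repeated to the depth of the RSB)
      *              addq    $8 * <number of calls>, %rsp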
1016  *
1017  * If SMEP is not present, then we would have to stuff the RSB every time we
1018  * transitioned from user mode to the kernel, which isn't very practical right
1019  * now.
1020  *
1021  * To fully protect user to user and vmx to vmx attacks from these classes of
1022  * issues, we would also need to allow them to opt into performing an Indirect
1023  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1024  *
1025  * By default, the system will enable RSB stuffing and the required variant of
1026  * retpolines and store that information in the x86_spectrev2_mitigation value.
1027  * This will be evaluated after a microcode update as well, though it is
1028  * expected that microcode updates will not take away features. This may mean
1029  * that a late loaded microcode may not end up in the optimal configuration
1030  * (though this should be rare).
1031  *
1032  * Currently we do not build kmdb with retpolines or perform any additional side
1033  * channel security mitigations for it. One complication with kmdb is that it
1034  * requires its own retpoline thunks and it would need to adjust itself based on
1035  * what the kernel does. The threat model of kmdb is more limited and therefore
1036  * it may make more sense to investigate using prediction barriers as the whole
1037  * system is only executing a single instruction at a time while in kmdb.
1038  *
1039  * SPECTRE v1, v4
1040  *
1041  * The v1 and v4 variants of spectre are not currently mitigated in the
1042  * system and require other classes of changes to occur in the code.
1043  *
1044  * SPECTRE v1 (SWAPGS VARIANT)
1045  *
1046  * The class of Spectre v1 vulnerabilities isn't just about bounds checks; it
1047  * can generally affect any branch-dependent code. The swapgs issue is one
1048  * variant of this. If we are coming in from userspace, we can have code like
1049  * this:
1050  *
1051  *      cmpw    $KCS_SEL, REGOFF_CS(%rsp)
1052  *      je      1f
1053  *      movq    $0, REGOFF_SAVFP(%rsp)
1054  *      swapgs
1055  *      1:
1056  *      movq    %gs:CPU_THREAD, %rax
1057  *
1058  * If an attacker can cause a mis-speculation of the branch here, we could skip
1059  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1060  * load. If subsequent code can act as the usual Spectre cache gadget, this
1061  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1062  * any use of the %gs override.
1063  *
1064  * The other case is also an issue: if we're coming into a trap from kernel
1065  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1066  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1067  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1068  * case, and the fix is the same in both cases (an lfence at the branch target
1069  * 1: in this example), we'll just do it unconditionally.
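      *
      * As a sketch, the mitigated version of the earlier example simply gains an
      * lfence at the branch target:
      *
      *      cmpw    $KCS_SEL, REGOFF_CS(%rsp)
      *      je      1f
      *      movq    $0, REGOFF_SAVFP(%rsp)
      *      swapgs
      *      1:
      *      lfence
      *      movq    %gs:CPU_THREAD, %rax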
1070  *
1071  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1072  * harder for user-space to actually set a useful %gsbase value: although it's
1073  * not clear, it might still be feasible via lwp_setprivate(), so we mitigate
1074  * anyway.
1075  *
1076  * MELTDOWN
1077  *
1078  * Meltdown, or Spectre v3, allowed a user process to read any data mapped in
1079  * its address space regardless of whether or not the page tables in question
1080  * granted it permission to read that data. The solution to Meltdown
1081  * is kernel page table isolation. In this world, there are two page tables that
1082  * are used for a process, one in user land and one in the kernel. To implement
1083  * this we use per-CPU page tables and switch between the user and kernel
1084  * variants when entering and exiting the kernel.  For more information about
1085  * this process and how the trampolines work, please see the big theory
1086  * statements and additional comments in:
1087  *
1088  *  - uts/i86pc/ml/kpti_trampolines.s
1089  *  - uts/i86pc/vm/hat_i86.c
1090  *
1091  * While Meltdown only impacted Intel systems, and some Intel systems have it
1092  * fixed (Meltdown is also known as Rogue Data Cache Load), we always have
1093  * kernel page table isolation enabled. While this may at first seem weird, an
1094  * important thing to remember is that you can't speculatively read an address
1095  * if it's never in your page table at all. Having user processes without kernel
1096  * pages present provides us with an important layer of defense in the kernel
1097  * against any other side channel attacks that exist and have yet to be
1098  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1099  * default, no matter the x86 system.
1100  *
1101  * L1 TERMINAL FAULT
1102  *
1103  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1104  * execution uses page table entries. Effectively, it is two different problems.
1105  * The first is that it ignores the not present bit in the page table entries
1106  * when performing speculative execution. This means that something can
1107  * speculatively read the listed physical address if it's present in the L1
1108  * cache under certain conditions (see Intel's documentation for the full set of
1109  * conditions). Secondly, this can be used to bypass hardware virtualization
1110  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1111  * instructions.
1112  *
1113  * For the non-hardware virtualized case, this is relatively easy to deal with.
1114  * We must make sure that all non-present page table entries have a physical
1115  * address of zero, so at worst the first 4k of physical memory could be read;
1116  * however, we never use that first page in the operating system and always
1117  * skip putting it in our memory map, even if firmware tells us we can. While
1118  * other systems try to put extra metadata in the address and reserved bits,
1119  * which led to this being problematic in those cases, we do not.
1120  *
1121  * For hardware virtual machines things are more complicated. Because they can
1122  * construct their own page tables, it isn't hard for them to perform this
1123  * attack against any physical address. The one wrinkle is that this physical
1124  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1125  * to flush the L1 data cache. We wrap this up in the function
1126  * spec_uarch_flush(). This function is also used in the mitigation of
1127  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1128  * hypervisors such as KVM or bhyve are responsible for performing this before
1129  * entering the guest.
1130  *
1131  * Because this attack takes place in the L1 cache, there's another wrinkle
1132  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1133  * designs. This means that when a thread enters a hardware virtualized context
1134  * and flushes the L1 data cache, the other thread on the processor may then go
1135  * ahead and put new data in it that can be potentially attacked. While one
1136  * solution is to disable SMT on the system, another option that is available is
1137  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1138  * goes through and makes sure that if an HVM is being scheduled on one thread,
1139  * then whatever runs on the other thread belongs to the same hardware virtual
1140  * machine. If an interrupt comes in or the guest exits to the broader system,
1141  * then the other SMT thread will be kicked out.
1142  *
1143  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1144  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1145  * perform L1TF related mitigations.
1146  *
1147  * MICROARCHITECTURAL DATA SAMPLING
1148  *
1149  * Microarchitectural data sampling (MDS) is a combination of four discrete
1150  * vulnerabilities that are similar issues affecting various parts of the CPU's
1151  * microarchitectural implementation around load, store, and fill buffers.
1152  * Specifically it is made up of the following subcomponents:
1153  *
1154  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1155  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1156  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1157  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1158  *
1159  * To begin addressing these, Intel has introduced another feature in microcode
1160  * called MD_CLEAR. This overloads the verw instruction so that executing it in
1161  * a particular way flushes the state of the affected parts. The L1TF L1D flush
1162  * mechanism is also
1163  * updated when this microcode is present to flush this state.
1164  *
1165  * Primarily we need to flush this state whenever we transition from the kernel
1166  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1167  * little bit different. Here the structures are statically sized when a logical
1168  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1169  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1170  * mwait, or another ACPI method. To perform these flushes, we call
1171  * x86_md_clear() at all of these transition points.
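      *
      * As a minimal sketch of the technique (not necessarily the exact sequence
      * used by x86_md_clear()), the MD_CLEAR-style flush is just a verw executed
      * against a selector for a valid, writable data segment:
      *
      *      subq    $8, %rsp
      *      movw    %ds, (%rsp)
      *      verw    (%rsp)
      *      addq    $8, %rsp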
1172  *
1173  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1174  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1175  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1176  * a no-op.
1177  *
1178  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1179  * particular, everything we've discussed above is only valid for a single
1180  * thread executing on a core. In the case where you have hyper-threading
1181  * present, this attack can be performed between threads. The theoretical fix
1182  * for this is to ensure that both threads are always in the same security
1183  * domain. This means that they are executing in the same ring and mutually
1184  * trust each other. Practically speaking, this would mean that a system call
1185  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1186  * Rather than implement this, we recommend that one disables hyper-threading
1187  * through the use of psradm -aS.
1188  *
1189  * TSX ASYNCHRONOUS ABORT
1190  *
1191  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1192  * behaves like MDS, but leverages Intel's transactional instructions as another
1193  * vector. Effectively, when a transaction hits one of these cases (unmapped
1194  * page, various cache snoop activity, etc.) then the same data can be exposed
1195  * as in the case of MDS. This means that you can attack your twin.
1196  *
1197  * Intel has described that there are two different ways that we can mitigate
1198  * this problem on affected processors:
1199  *
1200  *   1) We can use the same techniques used to deal with MDS. Flushing the
1201  *      microarchitectural buffers and disabling hyperthreading will mitigate
1202  *      this in the same way.
1203  *
1204  *   2) Using microcode to disable TSX.
1205  *
1206  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1207  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1208  * That's OK as we're already doing all such mitigations. On the other hand,
1209  * processors with MDS_NO are all supposed to receive microcode updates that
1210  * enumerate support for disabling TSX. In general, we'd rather use this method
1211  * when available as it doesn't require disabling hyperthreading to be
1212  * effective. Currently we are basically relying on microcode for processors
1213  * that enumerate MDS_NO.
1214  *
1215  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1216  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1217  * different powers. The first allows us to cause all transactions to
1218  * immediately abort. The second gives us a means of disabling TSX completely,
1219  * which includes removing it from cpuid. If we have support for this in
1220  * microcode during the first cpuid pass, then we'll disable TSX completely such
1221  * that user land never has a chance to observe the bit. However, if we are late
1222  * loading the microcode, then we must use the functionality to cause
1223  * transactions to automatically abort. This is necessary for user land's sake.
1224  * Once a program sees a cpuid bit, it must not be taken away.
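      *
      * As a sketch of how the TSX control MSR is used (the constant names here
      * are illustrative, not necessarily those used elsewhere in the system; the
      * bit positions are per Intel's TAA guidance):
      *
      *      uint64_t val = rdmsr(MSR_IA32_TSX_CTRL);
      *      val |= TSX_CTRL_RTM_DISABLE;            (bit 0: abort transactions)
      *      if (we are still in the first cpuid pass)
      *              val |= TSX_CTRL_CPUID_CLEAR;    (bit 1: hide TSX from cpuid)
      *      wrmsr(MSR_IA32_TSX_CTRL, val);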
1225  *
1226  * We track whether or not we should do this based on what cpuid pass we're in.
1227  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1228  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1229  * should happen twice: once in the normal cpuid_pass1() code and then a second
1230  * time after we do the initial microcode update.
1231  *
1232  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1233  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1234  * unfortunate feature in a number of ways, and taking the opportunity to
1235  * finally be able to turn it off is likely to be of benefit in the future.
1236  *
1237  * SUMMARY
1238  *
1239  * The following table attempts to summarize the mitigations for various issues
1240  * and what's done in various places:
1241  *
1242  *  - Spectre v1: Not currently mitigated
1243  *  - swapgs: lfences after swapgs paths
1244  *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1245  *  - Meltdown: Kernel Page Table Isolation
1246  *  - Spectre v3a: Updated CPU microcode
1247  *  - Spectre v4: Not currently mitigated
1248  *  - SpectreRSB: SMEP and RSB Stuffing
1249  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1250  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1251  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1252  *
1253  * The following table indicates the x86 feature set bits that indicate that a
1254  * given problem has been solved or a notable feature is present:
1255  *
1256  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1257  *  - MDS_NO: All forms of MDS
1258  *  - TAA_NO: TAA
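      *
      * These enumerations are derived from the IA32_ARCH_CAPABILITIES MSR; per
      * Intel's documentation, RDCL_NO, MDS_NO, and TAA_NO correspond to bits 0,
      * 5, and 8 of that MSR, respectively.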
1259  */
1260 
1261 #include <sys/types.h>
1262 #include <sys/archsystm.h>
1263 #include <sys/x86_archext.h>
1264 #include <sys/kmem.h>
1265 #include <sys/systm.h>
1266 #include <sys/cmn_err.h>
1267 #include <sys/sunddi.h>
1268 #include <sys/sunndi.h>
1269 #include <sys/cpuvar.h>
1270 #include <sys/processor.h>
1271 #include <sys/sysmacros.h>
1272 #include <sys/pg.h>
1273 #include <sys/fp.h>
1274 #include <sys/controlregs.h>
1275 #include <sys/bitmap.h>
1276 #include <sys/auxv_386.h>
1277 #include <sys/memnode.h>
1278 #include <sys/pci_cfgspace.h>
1279 #include <sys/comm_page.h>
1280 #include <sys/mach_mmu.h>
1281 #include <sys/ucode.h>
1282 #include <sys/tsc.h>
1283 #include <sys/kobj.h>
1284 #include <sys/asm_misc.h>
1285 
1286 #ifdef __xpv
1287 #include <sys/hypervisor.h>
1288 #else
1289 #include <sys/ontrap.h>
1290 #endif
1291 
1292 uint_t x86_vendor = X86_VENDOR_IntelClone;
1293 uint_t x86_type = X86_TYPE_OTHER;
1294 uint_t x86_clflush_size = 0;
1295 
1296 #if defined(__xpv)
1297 int x86_use_pcid = 0;
1298 int x86_use_invpcid = 0;
1299 #else
1300 int x86_use_pcid = -1;
1301 int x86_use_invpcid = -1;
1302 #endif
1303 
1304 typedef enum {
1305         X86_SPECTREV2_RETPOLINE,
1306         X86_SPECTREV2_RETPOLINE_AMD,
1307         X86_SPECTREV2_ENHANCED_IBRS,
1308         X86_SPECTREV2_DISABLED
1309 } x86_spectrev2_mitigation_t;
1310 
1311 uint_t x86_disable_spectrev2 = 0;
1312 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1313     X86_SPECTREV2_RETPOLINE;
1314 
1315 /*
1316  * The mitigation status for TAA:
1317  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1318  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1319  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1320  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1321  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1322  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1323  */
1324 typedef enum {
1325         X86_TAA_NOTHING,
1326         X86_TAA_DISABLED,
1327         X86_TAA_MD_CLEAR,
1328         X86_TAA_TSX_FORCE_ABORT,
1329         X86_TAA_TSX_DISABLE,
1330         X86_TAA_HW_MITIGATED
1331 } x86_taa_mitigation_t;
1332 
1333 uint_t x86_disable_taa = 0;
1334 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1335 
1336 uint_t pentiumpro_bug4046376;
1337 
1338 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1339 
1340 static char *x86_feature_names[NUM_X86_FEATURES] = {
1341         "lgpg",
1342         "tsc",
1343         "msr",
1344         "mtrr",
1345         "pge",
1346         "de",
1347         "cmov",
1348         "mmx",
1349         "mca",
1350         "pae",
1351         "cv8",
1352         "pat",
1353         "sep",
1354         "sse",
1355         "sse2",
1356         "htt",
1357         "asysc",
1358         "nx",
1359         "sse3",
1360         "cx16",
1361         "cmp",
1362         "tscp",
1363         "mwait",
1364         "sse4a",
1365         "cpuid",
1366         "ssse3",
1367         "sse4_1",
1368         "sse4_2",
1369         "1gpg",
1370         "clfsh",
1371         "64",
1372         "aes",
1373         "pclmulqdq",
1374         "xsave",
1375         "avx",
1376         "vmx",
1377         "svm",
1378         "topoext",
1379         "f16c",
1380         "rdrand",
1381         "x2apic",
1382         "avx2",
1383         "bmi1",
1384         "bmi2",
1385         "fma",
1386         "smep",
1387         "smap",
1388         "adx",
1389         "rdseed",
1390         "mpx",
1391         "avx512f",
1392         "avx512dq",
1393         "avx512pf",
1394         "avx512er",
1395         "avx512cd",
1396         "avx512bw",
1397         "avx512vl",
1398         "avx512fma",
1399         "avx512vbmi",
1400         "avx512_vpopcntdq",
1401         "avx512_4vnniw",
1402         "avx512_4fmaps",
1403         "xsaveopt",
1404         "xsavec",
1405         "xsaves",
1406         "sha",
1407         "umip",
1408         "pku",
1409         "ospke",
1410         "pcid",
1411         "invpcid",
1412         "ibrs",
1413         "ibpb",
1414         "stibp",
1415         "ssbd",
1416         "ssbd_virt",
1417         "rdcl_no",
1418         "ibrs_all",
1419         "rsba",
1420         "ssb_no",
1421         "stibp_all",
1422         "flush_cmd",
1423         "l1d_vmentry_no",
1424         "fsgsbase",
1425         "clflushopt",
1426         "clwb",
1427         "monitorx",
1428         "clzero",
1429         "xop",
1430         "fma4",
1431         "tbm",
1432         "avx512_vnni",
1433         "amd_pcec",
1434         "mb_clear",
1435         "mds_no",
1436         "core_thermal",
1437         "pkg_thermal",
1438         "tsx_ctrl",
1439         "taa_no"
1440 };
1441 
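     /*
      * Simple accessors for the x86_featureset bitmaps. These are used
      * throughout the kernel to test, set, and clear X86FSET_* feature bits,
      * e.g. is_x86_feature(x86_featureset, X86FSET_SSE2).
      */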
1442 boolean_t
1443 is_x86_feature(void *featureset, uint_t feature)
1444 {
1445         ASSERT(feature < NUM_X86_FEATURES);
1446         return (BT_TEST((ulong_t *)featureset, feature));
1447 }
1448 
1449 void
1450 add_x86_feature(void *featureset, uint_t feature)
1451 {
1452         ASSERT(feature < NUM_X86_FEATURES);
1453         BT_SET((ulong_t *)featureset, feature);
1454 }
1455 
1456 void
1457 remove_x86_feature(void *featureset, uint_t feature)
1458 {
1459         ASSERT(feature < NUM_X86_FEATURES);
1460         BT_CLEAR((ulong_t *)featureset, feature);
1461 }
1462 
1463 boolean_t
1464 compare_x86_featureset(void *setA, void *setB)
1465 {
1466         /*
1467          * We assume that the unused bits of the bitmap are always zero.
1468          */
1469         if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1470                 return (B_TRUE);
1471         } else {
1472                 return (B_FALSE);
1473         }
1474 }
1475 
1476 void
1477 print_x86_featureset(void *featureset)
1478 {
1479         uint_t i;
1480 
1481         for (i = 0; i < NUM_X86_FEATURES; i++) {
1482                 if (is_x86_feature(featureset, i)) {
1483                         cmn_err(CE_CONT, "?x86_feature: %s\n",
1484                             x86_feature_names[i]);
1485                 }
1486         }
1487 }
1488 
1489 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1490 static size_t xsave_state_size = 0;
1491 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1492 boolean_t xsave_force_disable = B_FALSE;
1493 extern int disable_smap;
1494 
1495 /*
1496  * This is set to platform type we are running on.
1497  */
1498 static int platform_type = -1;
1499 
1500 #if !defined(__xpv)
1501 /*
1502  * Variable to patch if hypervisor platform detection needs to be
1503  * disabled (i.e. platform_type will always be HW_NATIVE if this is 0).
1504  */
1505 int enable_platform_detection = 1;
1506 #endif
1507 
1508 /*
1509  * monitor/mwait info.
1510  *
1511  * size_actual and buf_actual are the real address and size allocated to get
1512  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1513  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1514  * processor cache-line alignment, but this is not guaranteed in the future.
1515  */
1516 struct mwait_info {
1517         size_t          mon_min;        /* min size to avoid missed wakeups */
1518         size_t          mon_max;        /* size to avoid false wakeups */
1519         size_t          size_actual;    /* size actually allocated */
1520         void            *buf_actual;    /* memory actually allocated */
1521         uint32_t        support;        /* processor support of monitor/mwait */
1522 };
1523 
1524 /*
1525  * xsave/xrstor info.
1526  *
1527  * This structure contains HW feature bits and the size of the xsave save area.
1528  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1529  * (xsave_state) to describe the xsave layout. However, at runtime the
1530  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1531  * xsave_state structure simply represents the legacy layout of the beginning
1532  * of the xsave area.
1533  */
1534 struct xsave_info {
1535         uint32_t        xsav_hw_features_low;   /* Supported HW features */
1536         uint32_t        xsav_hw_features_high;  /* Supported HW features */
1537         size_t          xsav_max_size;  /* max size save area for HW features */
1538         size_t          ymm_size;       /* AVX: size of ymm save area */
1539         size_t          ymm_offset;     /* AVX: offset for ymm save area */
1540         size_t          bndregs_size;   /* MPX: size of bndregs save area */
1541         size_t          bndregs_offset; /* MPX: offset for bndregs save area */
1542         size_t          bndcsr_size;    /* MPX: size of bndcsr save area */
1543         size_t          bndcsr_offset;  /* MPX: offset for bndcsr save area */
1544         size_t          opmask_size;    /* AVX512: size of opmask save */
1545         size_t          opmask_offset;  /* AVX512: offset for opmask save */
1546         size_t          zmmlo_size;     /* AVX512: size of zmm 256 save */
1547         size_t          zmmlo_offset;   /* AVX512: offset for zmm 256 save */
1548         size_t          zmmhi_size;     /* AVX512: size of zmm hi reg save */
1549         size_t          zmmhi_offset;   /* AVX512: offset for zmm hi reg save */
1550 };
1551 
1552 
1553 /*
1554  * These constants determine how many of the elements of the
1555  * cpuid we cache in the cpuid_info data structure; the
1556  * remaining elements are accessible via the cpuid instruction.
1557  */
1558 
1559 #define NMAX_CPI_STD    8               /* eax = 0 .. 7 */
1560 #define NMAX_CPI_EXTD   0x1f            /* eax = 0x80000000 .. 0x8000001e */
1561 
1562 /*
1563  * See the big theory statement for a more detailed explanation of what some of
1564  * these members mean.
1565  */
1566 struct cpuid_info {
1567         uint_t cpi_pass;                /* last pass completed */
1568         /*
1569          * standard function information
1570          */
1571         uint_t cpi_maxeax;              /* fn 0: %eax */
1572         char cpi_vendorstr[13];         /* fn 0: %ebx:%ecx:%edx */
1573         uint_t cpi_vendor;              /* enum of cpi_vendorstr */
1574 
1575         uint_t cpi_family;              /* fn 1: extended family */
1576         uint_t cpi_model;               /* fn 1: extended model */
1577         uint_t cpi_step;                /* fn 1: stepping */
1578         chipid_t cpi_chipid;            /* fn 1: %ebx:  Intel: chip # */
1579                                         /*              AMD: package/socket # */
1580         uint_t cpi_brandid;             /* fn 1: %ebx: brand ID */
1581         int cpi_clogid;                 /* fn 1: %ebx: thread # */
1582         uint_t cpi_ncpu_per_chip;       /* fn 1: %ebx: logical cpu count */
1583         uint8_t cpi_cacheinfo[16];      /* fn 2: intel-style cache desc */
1584         uint_t cpi_ncache;              /* fn 2: number of elements */
1585         uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1586         id_t cpi_last_lvl_cacheid;      /* fn 4: %eax: derived cache id */
1587         uint_t cpi_cache_leaf_size;     /* Number of cache elements */
1588                                         /* Intel fn: 4, AMD fn: 8000001d */
1589         struct cpuid_regs **cpi_cache_leaves;   /* Actual leaves from above */
1590         struct cpuid_regs cpi_std[NMAX_CPI_STD];        /* 0 .. 7 */
1591         /*
1592          * extended function information
1593          */
1594         uint_t cpi_xmaxeax;             /* fn 0x80000000: %eax */
1595         char cpi_brandstr[49];          /* fn 0x8000000[234] */
1596         uint8_t cpi_pabits;             /* fn 0x80000008: %eax */
1597         uint8_t cpi_vabits;             /* fn 0x80000008: %eax */
1598         uint8_t cpi_fp_amd_save;        /* AMD: FP error pointer save rqd. */
1599         struct  cpuid_regs cpi_extd[NMAX_CPI_EXTD];     /* 0x800000XX */
1600 
1601         id_t cpi_coreid;                /* same coreid => strands share core */
1602         int cpi_pkgcoreid;              /* core number within single package */
1603         uint_t cpi_ncore_per_chip;      /* AMD: fn 0x80000008: %ecx[7-0] */
1604                                         /* Intel: fn 4: %eax[31-26] */
1605 
1606         /*
1607          * These values represent the number of bits that are required to store
1608          * information about the number of cores and threads.
1609          */
1610         uint_t cpi_ncore_bits;
1611         uint_t cpi_nthread_bits;
1612         /*
1613          * supported feature information
1614          */
1615         uint32_t cpi_support[6];
1616 #define STD_EDX_FEATURES        0
1617 #define AMD_EDX_FEATURES        1
1618 #define TM_EDX_FEATURES         2
1619 #define STD_ECX_FEATURES        3
1620 #define AMD_ECX_FEATURES        4
1621 #define STD_EBX_FEATURES        5
1622         /*
1623          * Synthesized information, where known.
1624          */
1625         uint32_t cpi_chiprev;           /* See X86_CHIPREV_* in x86_archext.h */
1626         const char *cpi_chiprevstr;     /* May be NULL if chiprev unknown */
1627         uint32_t cpi_socket;            /* Chip package/socket type */
1628 
1629         struct mwait_info cpi_mwait;    /* fn 5: monitor/mwait info */
1630         uint32_t cpi_apicid;
1631         uint_t cpi_procnodeid;          /* AMD: nodeID on HT, Intel: chipid */
1632         uint_t cpi_procnodes_per_pkg;   /* AMD: # of nodes in the package */
1633                                         /* Intel: 1 */
1634         uint_t cpi_compunitid;          /* AMD: ComputeUnit ID, Intel: coreid */
1635         uint_t cpi_cores_per_compunit;  /* AMD: # of cores in the ComputeUnit */
1636 
1637         struct xsave_info cpi_xsave;    /* fn D: xsave/xrstor info */
1638 };
1639 
1640 
1641 static struct cpuid_info cpuid_info0;
1642 
1643 /*
1644  * These bit fields are defined by the Intel Application Note AP-485
1645  * "Intel Processor Identification and the CPUID Instruction"
1646  */
1647 #define CPI_FAMILY_XTD(cpi)     BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1648 #define CPI_MODEL_XTD(cpi)      BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1649 #define CPI_TYPE(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1650 #define CPI_FAMILY(cpi)         BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1651 #define CPI_STEP(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1652 #define CPI_MODEL(cpi)          BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1653 
1654 #define CPI_FEATURES_EDX(cpi)           ((cpi)->cpi_std[1].cp_edx)
1655 #define CPI_FEATURES_ECX(cpi)           ((cpi)->cpi_std[1].cp_ecx)
1656 #define CPI_FEATURES_XTD_EDX(cpi)       ((cpi)->cpi_extd[1].cp_edx)
1657 #define CPI_FEATURES_XTD_ECX(cpi)       ((cpi)->cpi_extd[1].cp_ecx)
1658 #define CPI_FEATURES_7_0_EBX(cpi)       ((cpi)->cpi_std[7].cp_ebx)
1659 #define CPI_FEATURES_7_0_ECX(cpi)       ((cpi)->cpi_std[7].cp_ecx)
1660 #define CPI_FEATURES_7_0_EDX(cpi)       ((cpi)->cpi_std[7].cp_edx)
1661 
1662 #define CPI_BRANDID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1663 #define CPI_CHUNKS(cpi)         BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1664 #define CPI_CPU_COUNT(cpi)      BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1665 #define CPI_APIC_ID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1666 
1667 #define CPI_MAXEAX_MAX          0x100           /* sanity control */
1668 #define CPI_XMAXEAX_MAX         0x80000100
1669 #define CPI_FN4_ECX_MAX         0x20            /* sanity: max fn 4 levels */
1670 #define CPI_FNB_ECX_MAX         0x20            /* sanity: max fn B levels */
1671 
1672 /*
1673  * Function 4 (Deterministic Cache Parameters) macros
1674  * Defined by Intel Application Note AP-485
1675  */
1676 #define CPI_NUM_CORES(regs)             BITX((regs)->cp_eax, 31, 26)
1677 #define CPI_NTHR_SHR_CACHE(regs)        BITX((regs)->cp_eax, 25, 14)
1678 #define CPI_FULL_ASSOC_CACHE(regs)      BITX((regs)->cp_eax, 9, 9)
1679 #define CPI_SELF_INIT_CACHE(regs)       BITX((regs)->cp_eax, 8, 8)
1680 #define CPI_CACHE_LVL(regs)             BITX((regs)->cp_eax, 7, 5)
1681 #define CPI_CACHE_TYPE(regs)            BITX((regs)->cp_eax, 4, 0)
1682 #define CPI_CPU_LEVEL_TYPE(regs)        BITX((regs)->cp_ecx, 15, 8)
1683 
1684 #define CPI_CACHE_WAYS(regs)            BITX((regs)->cp_ebx, 31, 22)
1685 #define CPI_CACHE_PARTS(regs)           BITX((regs)->cp_ebx, 21, 12)
1686 #define CPI_CACHE_COH_LN_SZ(regs)       BITX((regs)->cp_ebx, 11, 0)
1687 
1688 #define CPI_CACHE_SETS(regs)            BITX((regs)->cp_ecx, 31, 0)
1689 
1690 #define CPI_PREFCH_STRIDE(regs)         BITX((regs)->cp_edx, 9, 0)
1691 
1692 
1693 /*
1694  * A couple of shorthand macros to identify "later" P6-family chips
1695  * like the Pentium M and Core.  First, the "older" P6-based stuff
1696  * (loosely defined as "pre-Pentium-4"):
1697  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1698  */
1699 #define IS_LEGACY_P6(cpi) (                     \
1700         cpi->cpi_family == 6 &&                      \
1701                 (cpi->cpi_model == 1 ||              \
1702                 cpi->cpi_model == 3 ||               \
1703                 cpi->cpi_model == 5 ||               \
1704                 cpi->cpi_model == 6 ||               \
1705                 cpi->cpi_model == 7 ||               \
1706                 cpi->cpi_model == 8 ||               \
1707                 cpi->cpi_model == 0xA ||     \
1708                 cpi->cpi_model == 0xB)               \
1709 )
1710 
1711 /* A "new F6" is everything with family 6 that's not the above */
1712 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1713 
1714 /* Extended family/model support */
1715 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1716         cpi->cpi_family >= 0xf)
1717 
1718 /*
1719  * Info for monitor/mwait idle loop.
1720  *
1721  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1722  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1723  * 2006.
1724  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1725  * Documentation Updates" #33633, Rev 2.05, December 2006.
1726  */
1727 #define MWAIT_SUPPORT           (0x00000001)    /* mwait supported */
1728 #define MWAIT_EXTENSIONS        (0x00000002)    /* extension supported */
1729 #define MWAIT_ECX_INT_ENABLE    (0x00000004)    /* ecx 1 extension supported */
1730 #define MWAIT_SUPPORTED(cpi)    ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1731 #define MWAIT_INT_ENABLE(cpi)   ((cpi)->cpi_std[5].cp_ecx & 0x2)
1732 #define MWAIT_EXTENSION(cpi)    ((cpi)->cpi_std[5].cp_ecx & 0x1)
1733 #define MWAIT_SIZE_MIN(cpi)     BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1734 #define MWAIT_SIZE_MAX(cpi)     BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1735 /*
1736  * Number of sub-cstates for a given c-state.
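      *
      * Per the CPUID leaf 5 layout, %edx is divided into 4-bit fields, one per
      * C-state: bits [3:0] count C0 sub-states, bits [7:4] count C1 sub-states,
      * and so on; e.g. MWAIT_NUM_SUBC_STATES(cpi, 4) yields the number of C1
      * sub-C-states.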
1737  */
1738 #define MWAIT_NUM_SUBC_STATES(cpi, c_state)                     \
1739         BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1740 
1741 /*
1742  * XSAVE leaf 0xD enumeration
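      *
      * Sub-leaf n (%ecx = n >= 2) of leaf 0xD reports the size of state
      * component n in %eax and its offset from the start of the non-compacted
      * XSAVE area in %ebx; the constants below are those values for component 2
      * (the AVX ymm state).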
1743  */
1744 #define CPUID_LEAFD_2_YMM_OFFSET        576
1745 #define CPUID_LEAFD_2_YMM_SIZE          256
1746 
1747 /*
1748  * Common extended leaf names to cut down on typos.
1749  */
1750 #define CPUID_LEAF_EXT_0                0x80000000
1751 #define CPUID_LEAF_EXT_8                0x80000008
1752 #define CPUID_LEAF_EXT_1d               0x8000001d
1753 #define CPUID_LEAF_EXT_1e               0x8000001e
1754 
1755 /*
1756  * Functions we consume from cpuid_subr.c;  don't publish these in a header
1757  * file to try and keep people using the expected cpuid_* interfaces.
1758  */
1759 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1760 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1761 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1762 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1763 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1764 
1765 /*
1766  * Apply various platform-dependent restrictions where the
1767  * underlying platform restrictions mean the CPU can be marked
1768  * as less capable than its cpuid instruction would imply.
1769  */
1770 #if defined(__xpv)
1771 static void
1772 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1773 {
1774         switch (eax) {
1775         case 1: {
1776                 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1777                     0 : CPUID_INTC_EDX_MCA;
1778                 cp->cp_edx &=
1779                     ~(mcamask |
1780                     CPUID_INTC_EDX_PSE |
1781                     CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1782                     CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1783                     CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1784                     CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1785                     CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1786                 break;
1787         }
1788 
1789         case 0x80000001:
1790                 cp->cp_edx &=
1791                     ~(CPUID_AMD_EDX_PSE |
1792                     CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1793                     CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1794                     CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1795                     CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1796                     CPUID_AMD_EDX_TSCP);
1797                 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1798                 break;
1799         default:
1800                 break;
1801         }
1802 
1803         switch (vendor) {
1804         case X86_VENDOR_Intel:
1805                 switch (eax) {
1806                 case 4:
1807                         /*
1808                          * Zero out the (ncores-per-chip - 1) field
1809                          */
1810                         cp->cp_eax &= 0x03fffffff;
1811                         break;
1812                 default:
1813                         break;
1814                 }
1815                 break;
1816         case X86_VENDOR_AMD:
1817                 switch (eax) {
1818 
1819                 case 0x80000001:
1820                         cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1821                         break;
1822 
1823                 case CPUID_LEAF_EXT_8:
1824                         /*
1825                          * Zero out the (ncores-per-chip - 1) field
1826                          */
1827                         cp->cp_ecx &= 0xffffff00;
1828                         break;
1829                 default:
1830                         break;
1831                 }
1832                 break;
1833         default:
1834                 break;
1835         }
1836 }
1837 #else
1838 #define platform_cpuid_mangle(vendor, eax, cp)  /* nothing */
1839 #endif
1840 
1841 /*
1842  *  Some undocumented ways of patching the results of the cpuid
1843  *  instruction to permit running Solaris 10 on future cpus that
1844  *  we don't currently support.  Could be set to non-zero values
1845  *  via settings in eeprom.
1846  */
1847 
1848 uint32_t cpuid_feature_ecx_include;
1849 uint32_t cpuid_feature_ecx_exclude;
1850 uint32_t cpuid_feature_edx_include;
1851 uint32_t cpuid_feature_edx_exclude;
1852 
1853 /*
1854  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1855  */
1856 void
1857 cpuid_alloc_space(cpu_t *cpu)
1858 {
1859         /*
1860          * By convention, cpu0 is the boot cpu, which is set up
1861          * before memory allocation is available.  All other cpus get
1862          * their cpuid_info struct allocated here.
1863          */
1864         ASSERT(cpu->cpu_id != 0);
1865         ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1866         cpu->cpu_m.mcpu_cpi =
1867             kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1868 }
1869 
1870 void
1871 cpuid_free_space(cpu_t *cpu)
1872 {
1873         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1874         int i;
1875 
1876         ASSERT(cpi != NULL);
1877         ASSERT(cpi != &cpuid_info0);
1878 
1879         /*
1880          * Free up any cache leaf related dynamic storage. The first entry was
1881          * cached from the standard cpuid storage, so we should not free it.
1882          */
1883         for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1884                 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1885         if (cpi->cpi_cache_leaf_size > 0)
1886                 kmem_free(cpi->cpi_cache_leaves,
1887                     cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1888 
1889         kmem_free(cpi, sizeof (*cpi));
1890         cpu->cpu_m.mcpu_cpi = NULL;
1891 }
1892 
1893 #if !defined(__xpv)
1894 /*
1895  * Determine the type of the underlying platform. This is used to customize
1896  * initialization of various subsystems (e.g. TSC). determine_platform() must
1897  * only ever be called once to prevent two processors from seeing different
1898  * values of platform_type. Must be called before cpuid_pass1(), the earliest
1899  * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1900  */
1901 void
1902 determine_platform(void)
1903 {
1904         struct cpuid_regs cp;
1905         uint32_t base;
1906         uint32_t regs[4];
1907         char *hvstr = (char *)regs;
1908 
1909         ASSERT(platform_type == -1);
1910 
1911         platform_type = HW_NATIVE;
1912 
1913         if (!enable_platform_detection)
1914                 return;
1915 
1916         /*
1917          * If Hypervisor CPUID bit is set, try to determine hypervisor
1918          * vendor signature, and set platform type accordingly.
1919          *
1920          * References:
1921          * http://lkml.org/lkml/2008/10/1/246
1922          * http://kb.vmware.com/kb/1009458
1923          */
1924         cp.cp_eax = 0x1;
1925         (void) __cpuid_insn(&cp);
1926         if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1927                 cp.cp_eax = 0x40000000;
1928                 (void) __cpuid_insn(&cp);
1929                 regs[0] = cp.cp_ebx;
1930                 regs[1] = cp.cp_ecx;
1931                 regs[2] = cp.cp_edx;
1932                 regs[3] = 0;
1933                 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1934                         platform_type = HW_XEN_HVM;
1935                         return;
1936                 }
1937                 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1938                         platform_type = HW_VMWARE;
1939                         return;
1940                 }
1941                 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1942                         platform_type = HW_KVM;
1943                         return;
1944                 }
1945                 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1946                         platform_type = HW_BHYVE;
1947                         return;
1948                 }
1949                 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1950                         platform_type = HW_MICROSOFT;
1951         } else {
1952                 /*
1953                  * Check older VMware hardware versions. The VMware hypervisor is
1954                  * detected by performing an IN operation to the VMware hypervisor
1955                  * port and checking that the value returned in %ebx is the VMware
1956                  * hypervisor magic value.
1957                  *
1958                  * References: http://kb.vmware.com/kb/1009458
1959                  */
1960                 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1961                 if (regs[1] == VMWARE_HVMAGIC) {
1962                         platform_type = HW_VMWARE;
1963                         return;
1964                 }
1965         }
1966 
1967         /*
1968          * Check Xen hypervisor. In a fully virtualized domain,
1969          * Xen's pseudo-cpuid function returns a string representing the
1970          * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1971          * supported cpuid function. We need at least a (base + 2) leaf value
1972          * to do what we want to do. Try different base values, since the
1973          * hypervisor might use a different one depending on whether Hyper-V
1974          * emulation is switched on by default or not.
1975          */
1976         for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1977                 cp.cp_eax = base;
1978                 (void) __cpuid_insn(&cp);
1979                 regs[0] = cp.cp_ebx;
1980                 regs[1] = cp.cp_ecx;
1981                 regs[2] = cp.cp_edx;
1982                 regs[3] = 0;
1983                 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1984                     cp.cp_eax >= (base + 2)) {
1985                         platform_type &= ~HW_NATIVE;
1986                         platform_type |= HW_XEN_HVM;
1987                         return;
1988                 }
1989         }
1990 }
1991 
1992 int
1993 get_hwenv(void)
1994 {
1995         ASSERT(platform_type != -1);
1996         return (platform_type);
1997 }
1998 
1999 int
2000 is_controldom(void)
2001 {
2002         return (0);
2003 }
2004 
2005 #else
2006 
2007 int
2008 get_hwenv(void)
2009 {
2010         return (HW_XEN_PV);
2011 }
2012 
2013 int
2014 is_controldom(void)
2015 {
2016         return (DOMAIN_IS_INITDOMAIN(xen_info));
2017 }
2018 
2019 #endif  /* __xpv */
2020 
2021 /*
2022  * Make sure that we have gathered all of the CPUID leaves that we might need to
2023  * determine topology. We assume that the standard leaf 1 has already been done
2024  * and that xmaxeax has already been calculated.
2025  */
2026 static void
2027 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2028 {
2029         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2030 
2031         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2032                 struct cpuid_regs *cp;
2033 
2034                 cp = &cpi->cpi_extd[8];
2035                 cp->cp_eax = CPUID_LEAF_EXT_8;
2036                 (void) __cpuid_insn(cp);
2037                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2038         }
2039 
2040         if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2041             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2042                 struct cpuid_regs *cp;
2043 
2044                 cp = &cpi->cpi_extd[0x1e];
2045                 cp->cp_eax = CPUID_LEAF_EXT_1e;
2046                 (void) __cpuid_insn(cp);
2047         }
2048 }
2049 
2050 /*
2051  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2052  * it to everything else. If not, and we're on an AMD system where 8000001e is
2053  * valid, then we use that. Otherwise, we fall back to the default value for the
2054  * APIC ID in leaf 1.
2055  */
2056 static uint32_t
2057 cpuid_gather_apicid(struct cpuid_info *cpi)
2058 {
2059         /*
2060          * Leaf B changes based on the arguments to it. Because we don't cache
2061          * it, we need to gather it again.
2062          */
2063         if (cpi->cpi_maxeax >= 0xB) {
2064                 struct cpuid_regs regs;
2065                 struct cpuid_regs *cp;
2066 
2067                 cp = &regs;
2068                 cp->cp_eax = 0xB;
2069                 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2070                 (void) __cpuid_insn(cp);
2071 
2072                 if (cp->cp_ebx != 0) {
2073                         return (cp->cp_edx);
2074                 }
2075         }
2076 
2077         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2078             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2079             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2080                 return (cpi->cpi_extd[0x1e].cp_eax);
2081         }
2082 
2083         return (CPI_APIC_ID(cpi));
2084 }
2085 
2086 /*
2087  * For AMD processors, attempt to calculate the number of chips and cores that
2088  * exist. The way that we do this varies based on the generation, because the
2089  * generations themselves have changed dramatically.
2090  *
2091  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2092  * However, with the advent of family 17h (Zen) it actually tells us the number
2093  * of threads, so we need to look at leaf 0x8000001e if available to determine
2094  * its value. Otherwise, for all prior families, the number of enabled cores is
2095  * the same as threads.
2096  *
2097  * If we do not have leaf 0x80000008, then we assume that this processor does
2098  * not have anything. AMD's older CPUID specification says there's no reason to
2099  * fall back to leaf 1.
2100  *
2101  * In some virtualization cases we will not have leaf 8000001e or it will be
2102  * zero. When that happens we assume the number of threads is one.
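      *
      * As a hypothetical worked example: on a family 17h part where leaf
      * 0x80000008 %ecx[7:0] reports 15 (i.e. 16 threads) and leaf 0x8000001e
      * %ebx[15:8] reports 1 (i.e. 2 threads per core), we end up with
      * *ncpus = 16 and *ncores = 16 / 2 = 8.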
2103  */
2104 static void
2105 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2106 {
2107         uint_t nthreads, nthread_per_core;
2108 
2109         nthreads = nthread_per_core = 1;
2110 
2111         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2112                 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2113         } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2114                 nthreads = CPI_CPU_COUNT(cpi);
2115         }
2116 
2117         /*
2118          * For us to have threads, and know about it, we have to be at least at
2119          * family 17h and have the cpuid bit that says we have extended
2120          * topology.
2121          */
2122         if (cpi->cpi_family >= 0x17 &&
2123             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2124             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2125                 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2126         }
2127 
2128         *ncpus = nthreads;
2129         *ncores = nthreads / nthread_per_core;
2130 }
2131 
2132 /*
2133  * Seed the initial values for the cores and threads for an Intel based
2134  * processor. These values will be overwritten if we detect that the processor
2135  * supports CPUID leaf 0xb.
2136  */
2137 static void
2138 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2139 {
2140         /*
2141          * Only seed the number of physical cores from the first level leaf 4
2142          * information. The number of threads there indicates how many share the
2143          * L1 cache, which may or may not have anything to do with the number of
2144          * logical CPUs per core.
2145          */
2146         if (cpi->cpi_maxeax >= 4) {
2147                 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2148         } else {
2149                 *ncores = 1;
2150         }
2151 
2152         if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2153                 *ncpus = CPI_CPU_COUNT(cpi);
2154         } else {
2155                 *ncpus = *ncores;
2156         }
2157 }
2158 
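     /*
      * Use CPUID leaf 0xB (extended topology enumeration) to derive the shifts
      * used to extract the core and package IDs from the x2APIC ID, and from
      * them the various IDs below. As a hypothetical worked example: with a
      * coreid_shift of 1 (2 threads per core) and a chipid_shift of 4 (16
      * logical CPUs per package), an x2APIC ID of 0x23 yields a chipid of 2, a
      * clogid of 3, a coreid of 0x11, and a pkgcoreid of 1.
      */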
2159 static boolean_t
2160 cpuid_leafB_getids(cpu_t *cpu)
2161 {
2162         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2163         struct cpuid_regs regs;
2164         struct cpuid_regs *cp;
2165 
2166         if (cpi->cpi_maxeax < 0xB)
2167                 return (B_FALSE);
2168 
2169         cp = &regs;
2170         cp->cp_eax = 0xB;
2171         cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2172 
2173         (void) __cpuid_insn(cp);
2174 
2175         /*
2176          * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2177          * indicates that the extended topology enumeration leaf is
2178          * available.
2179          */
2180         if (cp->cp_ebx != 0) {
2181                 uint32_t x2apic_id = 0;
2182                 uint_t coreid_shift = 0;
2183                 uint_t ncpu_per_core = 1;
2184                 uint_t chipid_shift = 0;
2185                 uint_t ncpu_per_chip = 1;
2186                 uint_t i;
2187                 uint_t level;
2188 
2189                 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2190                         cp->cp_eax = 0xB;
2191                         cp->cp_ecx = i;
2192 
2193                         (void) __cpuid_insn(cp);
2194                         level = CPI_CPU_LEVEL_TYPE(cp);
2195 
2196                         if (level == 1) {
2197                                 x2apic_id = cp->cp_edx;
2198                                 coreid_shift = BITX(cp->cp_eax, 4, 0);
2199                                 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2200                         } else if (level == 2) {
2201                                 x2apic_id = cp->cp_edx;
2202                                 chipid_shift = BITX(cp->cp_eax, 4, 0);
2203                                 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2204                         }
2205                 }
2206 
2207                 /*
2208                  * cpi_apicid is taken care of in cpuid_gather_apicid.
2209                  */
2210                 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2211                 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2212                     ncpu_per_core;
2213                 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2214                 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2215                 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2216                 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2217                 cpi->cpi_procnodeid = cpi->cpi_chipid;
2218                 cpi->cpi_compunitid = cpi->cpi_coreid;
2219 
2220                 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2221                         cpi->cpi_nthread_bits = coreid_shift;
2222                         cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2223                 }
2224 
2225                 return (B_TRUE);
2226         } else {
2227                 return (B_FALSE);
2228         }
2229 }
2230 
2231 static void
2232 cpuid_intel_getids(cpu_t *cpu, void *feature)
2233 {
2234         uint_t i;
2235         uint_t chipid_shift = 0;
2236         uint_t coreid_shift = 0;
2237         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2238 
2239         /*
2240          * There are no compute units or processor nodes currently on Intel.
2241          * Always set these to one.
2242          */
2243         cpi->cpi_procnodes_per_pkg = 1;
2244         cpi->cpi_cores_per_compunit = 1;
2245 
2246         /*
2247          * If cpuid Leaf B is present, use that to try and get this information.
2248          * It will be the most accurate for Intel CPUs.
2249          */
2250         if (cpuid_leafB_getids(cpu))
2251                 return;
2252 
2253         /*
2254          * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2255          * and ncore_per_chip. These represent the largest power of two values
2256          * that we need to cover all of the IDs in the system. Therefore, we use
2257          * those values to seed the number of bits needed to cover this
2258          * information when leaf B is not available. These values will probably
2259          * be larger than required, but that's OK.
2260          */
2261         cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2262         cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2263 
2264         for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2265                 chipid_shift++;
2266 
2267         cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2268         cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2269 
2270         if (is_x86_feature(feature, X86FSET_CMP)) {
2271                 /*
2272                  * Multi-core (and possibly multi-threaded)
2273                  * processors.
2274                  */
2275                 uint_t ncpu_per_core = 1;
2276                 if (cpi->cpi_ncore_per_chip == 1)
2277                         ncpu_per_core = cpi->cpi_ncpu_per_chip;
2278                 else if (cpi->cpi_ncore_per_chip > 1)
2279                         ncpu_per_core = cpi->cpi_ncpu_per_chip /
2280                             cpi->cpi_ncore_per_chip;
2281                 /*
2282                  * 8-bit APIC IDs on dual-core Pentiums
2283                  * look like this:
2284                  *
2285                  * +-----------------------+------+------+
2286                  * | Physical Package ID   |  MC  |  HT  |
2287                  * +-----------------------+------+------+
2288                  * <------- chipid -------->
2289                  * <------- coreid --------------->
2290                  *                         <--- clogid -->
2291                  *                         <------>
2292                  *                         pkgcoreid
2293                  *
2294                  * Where the number of bits necessary to
2295                  * represent the MC and HT fields together
2296                  * equals the minimum number of bits necessary
2297                  * to store the value of cpi->cpi_ncpu_per_chip.
2298                  * Of those bits, the MC part uses the number
2299                  * of bits necessary to store the value of
2300                  * cpi->cpi_ncore_per_chip.
2301                  */
2302                 for (i = 1; i < ncpu_per_core; i <<= 1)
2303                         coreid_shift++;
2304                 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2305                 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2306         } else if (is_x86_feature(feature, X86FSET_HTT)) {
2307                 /*
2308                  * Single-core multi-threaded processors.
2309                  */
2310                 cpi->cpi_coreid = cpi->cpi_chipid;
2311                 cpi->cpi_pkgcoreid = 0;
2312         } else {
2313                 /*
2314                  * Single-core single-thread processors.
2315                  */
2316                 cpi->cpi_coreid = cpu->cpu_id;
2317                 cpi->cpi_pkgcoreid = 0;
2318         }
2319         cpi->cpi_procnodeid = cpi->cpi_chipid;
2320         cpi->cpi_compunitid = cpi->cpi_coreid;
2321 }
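
     /*
      * A minimal worked example of the legacy (no leaf B) path above, assuming
      * a hypothetical dual-core, two-thread-per-core part that reports
      * cpi_ncpu_per_chip = 4 and cpi_ncore_per_chip = 2, with an APIC ID of
      * 0xd:
      *
      *     chipid_shift  = 2, so cpi_chipid = 0xd >> 2 = 0x3 and
      *                           cpi_clogid = 0xd & 0x3 = 0x1
      *     ncpu_per_core = 4 / 2 = 2, so coreid_shift = 1
      *     cpi_coreid    = 0xd >> 1 = 0x6
      *     cpi_pkgcoreid = 0x1 >> 1 = 0x0
      */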
2322 
2323 /*
2324  * Historically, AMD has had CMP chips with only a single thread per core.
2325  * However, starting in family 17h (Zen), this has changed and they now have
2326  * multiple threads. Our internal core id needs to be a unique value.
2327  *
2328  * To determine the core id of an AMD system, if we're from a family before 17h,
2329  * then we just use the cpu id, as that gives us a good value that will be
2330  * unique for each core. If instead, we're on family 17h or later, then we need
2331  * to do something more complicated. CPUID leaf 0x8000001e can tell us how
2332  * many threads share a core. Based on that, we'll shift the APIC ID.
2333  * We can't use the normal core id in that leaf as it's only unique within the
2334  * socket, which is perfect for cpi_pkgcoreid, but not us.
2335  */
2336 static id_t
2337 cpuid_amd_get_coreid(cpu_t *cpu)
2338 {
2339         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2340 
2341         if (cpi->cpi_family >= 0x17 &&
2342             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2343             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2344                 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2345                 if (nthreads > 1) {
2346                         VERIFY3U(nthreads, ==, 2);
2347                         return (cpi->cpi_apicid >> 1);
2348                 }
2349         }
2350 
2351         return (cpu->cpu_id);
2352 }
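
     /*
      * For example, on a hypothetical family 17h part with SMT enabled (two
      * threads per core, so bits 15:8 of the CPUID_LEAF_EXT_1e %ebx value are
      * 1), a CPU with APIC ID 0x23 gets core id 0x23 >> 1 = 0x11; its sibling
      * thread, with APIC ID 0x22, maps to the same core id.
      */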
2353 
2354 /*
2355  * Working out the IDs on AMD is a more challenging task. This is notable
2356  * because of the following two facts:
2357  *
2358  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2359  *     also no way to get an actual unique core id from the system. As such, we
2360  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2361  *     however, guarantee that sibling cores of a chip will have sequential
2362  *     coreids starting at a multiple of the number of cores per chip - that is
2363  *     usually the case, but if the ACPI MADT table is presented in a different
2364  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2365  *
2366  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2367  *     called compute units. These compute units share the L1I cache, L2 cache,
2368  *     and the FPU. To deal with this, a new topology leaf was added in
2369  *     0x8000001e. However, parts of this leaf have different meanings
2370  *     once we get to family 0x17.
2371  */
2372 
2373 static void
2374 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2375 {
2376         int i, first_half, coreidsz;
2377         uint32_t nb_caps_reg;
2378         uint_t node2_1;
2379         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2380         struct cpuid_regs *cp;
2381 
2382         /*
2383          * Calculate the core id (this comes from hardware in family 0x17 if it
2384          * hasn't been stripped by virtualization). We always set the compute
2385          * unit id to the same value. Also, initialize the default number of
2386          * cores per compute unit and nodes per package. This will be
2387          * overwritten when we know information about a particular family.
2388          */
2389         cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2390         cpi->cpi_compunitid = cpi->cpi_coreid;
2391         cpi->cpi_cores_per_compunit = 1;
2392         cpi->cpi_procnodes_per_pkg = 1;
2393 
2394         /*
2395          * To construct the logical ID, we need to determine how many APIC IDs
2396          * are dedicated to the cores and threads. This is provided for us in
2397          * 0x80000008. However, if that leaf is not present (say, due to
2398          * virtualization), then we assume one bit. The leaf should be present
2399          * on all 64-bit AMD processors; it was added in family 0xf (Hammer).
2400          */
2401         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2402                 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2403 
2404                 /*
2405                  * In AMD parlance, a chip is really a node, while illumos
2406                  * uses chip as equivalent to socket/package.
2407                  */
2408                 if (coreidsz == 0) {
2409                         /* Use legacy method */
2410                         for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2411                                 coreidsz++;
2412                         if (coreidsz == 0)
2413                                 coreidsz = 1;
2414                 }
2415         } else {
2416                 /* Assume single-core part */
2417                 coreidsz = 1;
2418         }
2419         cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2420 
2421         /*
2422          * The package core ID varies depending on the family. While it may be
2423          * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2424          * this value is the core id in the given node. For non-virtualized
2425          * family 17h, we need to take the logical core id and shift off the
2426          * threads like we do when getting the core id.  Otherwise, we can use
2427          * the clogid as is. When family 17h is virtualized, the clogid is also
2428          * sufficient: if we don't have valid data in the leaf, then we won't
2429          * think we have SMT, in which case cpi_clogid is all that we
2430          * need.
2431          */
2432         if (cpi->cpi_family >= 0x17 &&
2433             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2434             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2435             cpi->cpi_extd[0x1e].cp_ebx != 0) {
2436                 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2437                 if (nthreads > 1) {
2438                         VERIFY3U(nthreads, ==, 2);
2439                         cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2440                 } else {
2441                         cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2442                 }
2443         } else {
2444                 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2445         }
2446 
2447         /*
2448          * Obtain the node ID and compute unit IDs. If we're on family 0x15
2449          * (bulldozer) or newer, then we can derive all of this from leaf
2450          * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2451          */
2452         if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2453             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2454                 cp = &cpi->cpi_extd[0x1e];
2455 
2456                 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2457                 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2458 
2459                 /*
2460                  * For Bulldozer-era CPUs, recalculate the compute unit
2461                  * information.
2462                  */
2463                 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2464                         cpi->cpi_cores_per_compunit =
2465                             BITX(cp->cp_ebx, 15, 8) + 1;
2466                         cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2467                             (cpi->cpi_ncore_per_chip /
2468                             cpi->cpi_cores_per_compunit) *
2469                             (cpi->cpi_procnodeid /
2470                             cpi->cpi_procnodes_per_pkg);
2471                 }
2472         } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2473                 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2474         } else if (cpi->cpi_family == 0x10) {
2475                 /*
2476                  * See if we are a multi-node processor.
2477                  * All processors in the system have the same number of nodes.
2478                  */
2479                 nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2480                 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2481                         /* Single-node */
2482                         cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2483                             coreidsz);
2484                 } else {
2485 
2486                         /*
2487                          * Multi-node revision D (2 nodes per package
2488                          * are supported)
2489                          */
2490                         cpi->cpi_procnodes_per_pkg = 2;
2491 
2492                         first_half = (cpi->cpi_pkgcoreid <=
2493                             (cpi->cpi_ncore_per_chip/2 - 1));
2494 
2495                         if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2496                                 /* We are BSP */
2497                                 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2498                         } else {
2499 
2500                                 /* We are AP */
2501                                 /* NodeId[2:1] bits to use for reading F3xe8 */
2502                                 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2503 
2504                                 nb_caps_reg =
2505                                     pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2506 
2507                                 /*
2508                                  * Check IntNodeNum bit (31:30, but bit 31 is
2509                                  * always 0 on dual-node processors)
2510                                  */
2511                                 if (BITX(nb_caps_reg, 30, 30) == 0)
2512                                         cpi->cpi_procnodeid = node2_1 +
2513                                             !first_half;
2514                                 else
2515                                         cpi->cpi_procnodeid = node2_1 +
2516                                             first_half;
2517                         }
2518                 }
2519         } else {
2520                 cpi->cpi_procnodeid = 0;
2521         }
2522 
2523         cpi->cpi_chipid =
2524             cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2525 
2526         cpi->cpi_ncore_bits = coreidsz;
2527         cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2528             cpi->cpi_ncore_per_chip);
2529 }
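
     /*
      * A quick worked example of the family 0x15 compute unit math above,
      * assuming a hypothetical two-node package with cpi_ncore_per_chip = 8,
      * cpi_cores_per_compunit = 2, and cpi_procnodes_per_pkg = 2. For a core on
      * processor node 3 whose CPUID_LEAF_EXT_1e %ebx[7:0] value reads 2:
      *
      *     cpi_compunitid = 2 + (8 / 2) * (3 / 2) = 2 + 4 * 1 = 6
      */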
2530 
2531 static void
2532 spec_uarch_flush_noop(void)
2533 {
2534 }
2535 
2536 /*
2537  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2538  * MDS-related micro-architectural state that would normally be flushed by
2539  * calling x86_md_clear().
2540  */
2541 static void
2542 spec_uarch_flush_msr(void)
2543 {
2544         wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2545 }
2546 
2547 /*
2548  * This function pointer refers to a function that will flush certain
2549  * micro-architectural state on the processor. This flush is used to mitigate
2550  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2551  * function can point to one of three functions:
2552  *
2553  * - A noop, which we use either because we are vulnerable but do not have
2554  *   microcode available to help deal with a fix, or because we aren't
2555  *   vulnerable at all.
2556  *
2557  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2558  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2559  *   however, it only flushes the MDS-related micro-architectural state on the
2560  *   current hyperthread; it does not do anything for its twin.
2561  *
2562  * - x86_md_clear which will flush the MDS related state. This is done when we
2563  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2564  *   (RDCL_NO is set).
2565  */
2566 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
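
     /*
      * A minimal sketch of how this hook is meant to be consumed (the actual
      * call sites live elsewhere in the kernel): any path that must scrub
      * micro-architectural state simply calls through the pointer and picks up
      * whichever mitigation was selected below, for example:
      *
      *     spec_uarch_flush();
      */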
2567 
2568 static void
2569 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2570 {
2571         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2572 
2573         /*
2574          * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2575          * has been fixed in hardware, it doesn't cover everything related to
2576          * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2577          * need to mitigate this.
2578          */
2579         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2580             is_x86_feature(featureset, X86FSET_MDS_NO)) {
2581                 return;
2582         }
2583 
2584         if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2585                 const uint8_t nop = NOP_INSTR;
2586                 uint8_t *md = (uint8_t *)x86_md_clear;
2587 
2588                 *md = nop;
2589         }
2590 
2591         membar_producer();
2592 }
2593 
2594 static void
2595 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2596 {
2597         boolean_t need_l1d, need_mds;
2598         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2599 
2600         /*
2601          * If we're not on Intel or we've mitigated both RDCL and MDS in
2602          * hardware, then there's nothing left for us to do for enabling the
2603          * flush. We can also go ahead and say that SMT exclusion is
2604          * unnecessary.
2605          */
2606         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2607             (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2608             is_x86_feature(featureset, X86FSET_MDS_NO))) {
2609                 extern int smt_exclusion;
2610                 smt_exclusion = 0;
2611                 spec_uarch_flush = spec_uarch_flush_noop;
2612                 membar_producer();
2613                 return;
2614         }
2615 
2616         /*
2617          * The locations where we need to perform an L1D flush are the same
2618          * for mitigating both L1TF and MDS. When verw support is present in
2619          * microcode, the L1D flush will take care of doing that as well.
2620          * However, if we have a system where RDCL_NO is present, but we don't
2621          * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2622          * L1D flush.
2623          */
2624         if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2625             is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2626             !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2627                 need_l1d = B_TRUE;
2628         } else {
2629                 need_l1d = B_FALSE;
2630         }
2631 
2632         if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2633             is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2634                 need_mds = B_TRUE;
2635         } else {
2636                 need_mds = B_FALSE;
2637         }
2638 
2639         if (need_l1d) {
2640                 spec_uarch_flush = spec_uarch_flush_msr;
2641         } else if (need_mds) {
2642                 spec_uarch_flush = x86_md_clear;
2643         } else {
2644                 /*
2645                  * We have no hardware mitigations available to us.
2646                  */
2647                 spec_uarch_flush = spec_uarch_flush_noop;
2648         }
2649         membar_producer();
2650 }
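
     /*
      * To summarize the selection above for an Intel CPU that lacks at least
      * one of RDCL_NO and MDS_NO:
      *
      *     need_l1d    need_mds    spec_uarch_flush
      *     --------    --------    ----------------------
      *     B_TRUE      either      spec_uarch_flush_msr
      *     B_FALSE     B_TRUE      x86_md_clear
      *     B_FALSE     B_FALSE     spec_uarch_flush_noop
      */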
2651 
2652 /*
2653  * We default to enabling RSB mitigations.
2654  */
2655 static void
2656 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2657 {
2658         const uint8_t ret = RET_INSTR;
2659         uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2660 
2661         switch (mit) {
2662         case X86_SPECTREV2_ENHANCED_IBRS:
2663         case X86_SPECTREV2_DISABLED:
2664                 *stuff = ret;
2665                 break;
2666         default:
2667                 break;
2668         }
2669 }
2670 
2671 static void
2672 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2673 {
2674         const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2675             "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2676             "_r14", "_r15" };
2677         const uint_t nthunks = ARRAY_SIZE(thunks);
2678         const char *type;
2679         uint_t i;
2680 
2681         if (mit == x86_spectrev2_mitigation)
2682                 return;
2683 
2684         switch (mit) {
2685         case X86_SPECTREV2_RETPOLINE:
2686                 type = "gen";
2687                 break;
2688         case X86_SPECTREV2_RETPOLINE_AMD:
2689                 type = "amd";
2690                 break;
2691         case X86_SPECTREV2_ENHANCED_IBRS:
2692         case X86_SPECTREV2_DISABLED:
2693                 type = "jmp";
2694                 break;
2695         default:
2696                 panic("asked to update retpoline state with unknown state!");
2697         }
2698 
2699         for (i = 0; i < nthunks; i++) {
2700                 uintptr_t source, dest;
2701                 int ssize, dsize;
2702                 char sourcebuf[64], destbuf[64];
2703                 size_t len;
2704 
2705                 (void) snprintf(destbuf, sizeof (destbuf),
2706                     "__x86_indirect_thunk%s", thunks[i]);
2707                 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2708                     "__x86_indirect_thunk_%s%s", type, thunks[i]);
2709 
2710                 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2711                 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2712                 VERIFY3U(source, !=, 0);
2713                 VERIFY3U(dest, !=, 0);
2714                 VERIFY3S(dsize, >=, ssize);
2715                 bcopy((void *)source, (void *)dest, ssize);
2716         }
2717 }
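
     /*
      * As a concrete example of the patching above: with
      * mit == X86_SPECTREV2_RETPOLINE_AMD and the "_rax" thunk, the body of
      * __x86_indirect_thunk_amd_rax is copied over __x86_indirect_thunk_rax;
      * the remaining fourteen thunks are rewritten the same way.
      */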
2718 
2719 static void
2720 cpuid_enable_enhanced_ibrs(void)
2721 {
2722         uint64_t val;
2723 
2724         val = rdmsr(MSR_IA32_SPEC_CTRL);
2725         val |= IA32_SPEC_CTRL_IBRS;
2726         wrmsr(MSR_IA32_SPEC_CTRL, val);
2727 }
2728 
2729 #ifndef __xpv
2730 /*
2731  * Determine whether or not we can use the AMD optimized retpoline
2732  * functionality. We use this when we know we're on an AMD system and we can
2733  * successfully verify that lfence is dispatch serializing.
2734  */
2735 static boolean_t
2736 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2737 {
2738         uint64_t val;
2739         on_trap_data_t otd;
2740 
2741         if (cpi->cpi_vendor != X86_VENDOR_AMD)
2742                 return (B_FALSE);
2743 
2744         /*
2745          * We need to determine whether or not lfence is serializing. It always
2746          * is on families 0xf and 0x11. On others, it's controlled by
2747          * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2748          * crazy old family, don't try to do anything.
2749          */
2750         if (cpi->cpi_family < 0xf)
2751                 return (B_FALSE);
2752         if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2753                 return (B_TRUE);
2754 
2755         /*
2756          * While it may be tempting to use get_hwenv(), there are no promises
2757          * that a hypervisor will actually declare itself as a hypervisor in a
2758          * friendly way. As such, try to read and set the MSR. If we can then
2759          * read back the value we set (it wasn't just set to zero), then we go
2760          * for it.
2761          */
2762         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2763                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2764                 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2765                 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2766                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2767         } else {
2768                 val = 0;
2769         }
2770         no_trap();
2771 
2772         if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2773                 return (B_TRUE);
2774         return (B_FALSE);
2775 }
2776 #endif  /* !__xpv */
2777 
2778 /*
2779  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2780  * we can disable TSX, we do so.
2781  *
2782  * This determination is done only on the boot CPU, potentially after loading
2783  * updated microcode.
2784  */
2785 static void
2786 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2787 {
2788         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2789 
2790         VERIFY(cpu->cpu_id == 0);
2791 
2792         if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2793                 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2794                 return;
2795         }
2796 
2797         if (x86_disable_taa) {
2798                 x86_taa_mitigation = X86_TAA_DISABLED;
2799                 return;
2800         }
2801 
2802         /*
2803          * If we do not have the ability to disable TSX, then our only
2804          * mitigation options are in hardware (TAA_NO), or by using our existing
2805          * MDS mitigation as described above.  The latter relies upon us having
2806          * configured MDS mitigations correctly! This includes disabling SMT if
2807          * we want cross-CPU-thread protection.
2808          */
2809         if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2810                 /*
2811                  * It's not clear whether any parts will enumerate TAA_NO
2812                  * *without* TSX_CTRL, but let's mark it as such if we see this.
2813                  */
2814                 if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2815                         x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2816                         return;
2817                 }
2818 
2819                 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2820                     !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2821                         x86_taa_mitigation = X86_TAA_MD_CLEAR;
2822                 } else {
2823                         x86_taa_mitigation = X86_TAA_NOTHING;
2824                 }
2825                 return;
2826         }
2827 
2828         /*
2829          * We have TSX_CTRL, but we can only fully disable TSX if we're early
2830          * enough in boot.
2831          *
2832          * Otherwise, we'll fall back to causing transactions to abort as our
2833          * mitigation. TSX-using code will always take the fallback path.
2834          */
2835         if (cpi->cpi_pass < 4) {
2836                 x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2837         } else {
2838                 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2839         }
2840 }
2841 
2842 static void
2843 cpuid_apply_tsx(x86_taa_mitigation_t taa)
2844 {
2845         uint64_t val;
2846 
2847         switch (taa) {
2848         case X86_TAA_TSX_DISABLE:
2849                 val = rdmsr(MSR_IA32_TSX_CTRL);
2850                 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2851                 wrmsr(MSR_IA32_TSX_CTRL, val);
2852                 break;
2853         case X86_TAA_TSX_FORCE_ABORT:
2854                 val = rdmsr(MSR_IA32_TSX_CTRL);
2855                 val |= IA32_TSX_CTRL_RTM_DISABLE;
2856                 wrmsr(MSR_IA32_TSX_CTRL, val);
2857                 break;
2858         case X86_TAA_HW_MITIGATED:
2859         case X86_TAA_MD_CLEAR:
2860         case X86_TAA_DISABLED:
2861         case X86_TAA_NOTHING:
2862                 break;
2863         }
2864 }
2865 
2866 static void
2867 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2868 {
2869         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2870         x86_spectrev2_mitigation_t v2mit;
2871 
2872         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2873             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2874                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2875                         add_x86_feature(featureset, X86FSET_IBPB);
2876                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2877                         add_x86_feature(featureset, X86FSET_IBRS);
2878                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2879                         add_x86_feature(featureset, X86FSET_STIBP);
2880                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2881                         add_x86_feature(featureset, X86FSET_STIBP_ALL);
2882                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2883                         add_x86_feature(featureset, X86FSET_SSBD);
2884                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2885                         add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2886                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2887                         add_x86_feature(featureset, X86FSET_SSB_NO);
2888                 /*
2889                  * Don't enable enhanced IBRS unless we're told that we should
2890                  * prefer it and it has the same semantics as Intel. This is
2891                  * split into two bits rather than a single one.
2892                  */
2893                 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2894                     (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2895                         add_x86_feature(featureset, X86FSET_IBRS_ALL);
2896                 }
2897 
2898         } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2899             cpi->cpi_maxeax >= 7) {
2900                 struct cpuid_regs *ecp;
2901                 ecp = &cpi->cpi_std[7];
2902 
2903                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2904                         add_x86_feature(featureset, X86FSET_MD_CLEAR);
2905                 }
2906 
2907                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2908                         add_x86_feature(featureset, X86FSET_IBRS);
2909                         add_x86_feature(featureset, X86FSET_IBPB);
2910                 }
2911 
2912                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2913                         add_x86_feature(featureset, X86FSET_STIBP);
2914                 }
2915 
2916                 /*
2917                  * Don't read the arch caps MSR on xpv where we lack the
2918                  * on_trap().
2919                  */
2920 #ifndef __xpv
2921                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2922                         on_trap_data_t otd;
2923 
2924                         /*
2925                          * Be paranoid and assume we'll get a #GP.
2926                          */
2927                         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2928                                 uint64_t reg;
2929 
2930                                 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2931                                 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2932                                         add_x86_feature(featureset,
2933                                             X86FSET_RDCL_NO);
2934                                 }
2935                                 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2936                                         add_x86_feature(featureset,
2937                                             X86FSET_IBRS_ALL);
2938                                 }
2939                                 if (reg & IA32_ARCH_CAP_RSBA) {
2940                                         add_x86_feature(featureset,
2941                                             X86FSET_RSBA);
2942                                 }
2943                                 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2944                                         add_x86_feature(featureset,
2945                                             X86FSET_L1D_VM_NO);
2946                                 }
2947                                 if (reg & IA32_ARCH_CAP_SSB_NO) {
2948                                         add_x86_feature(featureset,
2949                                             X86FSET_SSB_NO);
2950                                 }
2951                                 if (reg & IA32_ARCH_CAP_MDS_NO) {
2952                                         add_x86_feature(featureset,
2953                                             X86FSET_MDS_NO);
2954                                 }
2955                                 if (reg & IA32_ARCH_CAP_TSX_CTRL) {
2956                                         add_x86_feature(featureset,
2957                                             X86FSET_TSX_CTRL);
2958                                 }
2959                                 if (reg & IA32_ARCH_CAP_TAA_NO) {
2960                                         add_x86_feature(featureset,
2961                                             X86FSET_TAA_NO);
2962                                 }
2963                         }
2964                         no_trap();
2965                 }
2966 #endif  /* !__xpv */
2967 
2968                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2969                         add_x86_feature(featureset, X86FSET_SSBD);
2970 
2971                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2972                         add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2973         }
2974 
2975         /*
2976          * Take care of certain mitigations on the non-boot CPU. The boot CPU
2977          * will have already run this function and determined what we need to
2978          * do. This gives us a hook for per-HW thread mitigations such as
2979          * enhanced IBRS, or disabling TSX.  For TSX disabling, we need to be
2980          * careful that we've had a chance to load ucode that enables the new
2981          * MSRs.
2982          */
2983         if (cpu->cpu_id != 0) {
2984                 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2985                         cpuid_enable_enhanced_ibrs();
2986                 }
2987 
2988                 if (cpi->cpi_pass >= 1)
2989                         cpuid_apply_tsx(x86_taa_mitigation);
2990                 return;
2991         }
2992 
2993         /*
2994          * Go through and initialize various security mechanisms that we should
2995          * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
2996          * TAA.
2997          */
2998 
2999         /*
3000          * By default we've come in with retpolines enabled. Check whether we
3001          * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3002          * by default, but disabled if we are using enhanced IBRS.
3003          */
3004         if (x86_disable_spectrev2 != 0) {
3005                 v2mit = X86_SPECTREV2_DISABLED;
3006         } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3007                 cpuid_enable_enhanced_ibrs();
3008                 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3009 #ifndef __xpv
3010         } else if (cpuid_use_amd_retpoline(cpi)) {
3011                 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
3012 #endif  /* !__xpv */
3013         } else {
3014                 v2mit = X86_SPECTREV2_RETPOLINE;
3015         }
3016 
3017         cpuid_patch_retpolines(v2mit);
3018         cpuid_patch_rsb(v2mit);
3019         x86_spectrev2_mitigation = v2mit;
3020         membar_producer();
3021 
3022         /*
3023          * We need to determine what changes are required for mitigating L1TF
3024          * and MDS. If the CPU suffers from either of them, then SMT exclusion
3025          * is required.
3026          *
3027          * If any of these are present, then we need to flush u-arch state at
3028          * various points. For MDS, we need to do so whenever we change to a
3029          * lesser privilege level or we are halting the CPU. For L1TF we need to
3030          * flush the L1D cache at VM entry. When we have microcode that handles
3031          * MDS, the L1D flush also clears the other u-arch state that the
3032          * md_clear does.
3033          */
3034 
3035         /*
3036          * Update whether or not we need to be taking explicit action against
3037          * MDS.
3038          */
3039         cpuid_update_md_clear(cpu, featureset);
3040 
3041         /*
3042          * Determine whether SMT exclusion is required and whether or not we
3043          * need to perform an l1d flush.
3044          */
3045         cpuid_update_l1d_flush(cpu, featureset);
3046 
3047         /*
3048          * Determine what our mitigation strategy should be for TAA and then
3049          * also apply TAA mitigations.
3050          */
3051         cpuid_update_tsx(cpu, featureset);
3052         cpuid_apply_tsx(x86_taa_mitigation);
3053 }
3054 
3055 /*
3056  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3057  */
3058 void
3059 setup_xfem(void)
3060 {
3061         uint64_t flags = XFEATURE_LEGACY_FP;
3062 
3063         ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3064 
3065         if (is_x86_feature(x86_featureset, X86FSET_SSE))
3066                 flags |= XFEATURE_SSE;
3067 
3068         if (is_x86_feature(x86_featureset, X86FSET_AVX))
3069                 flags |= XFEATURE_AVX;
3070 
3071         if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3072                 flags |= XFEATURE_AVX512;
3073 
3074         set_xcr(XFEATURE_ENABLED_MASK, flags);
3075 
3076         xsave_bv_all = flags;
3077 }
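
     /*
      * For example, on a CPU advertising SSE, AVX, and AVX-512, and assuming
      * the conventional XCR0 bit assignments (x87 is bit 0, SSE bit 1, AVX
      * bit 2, and the AVX-512 state components bits 5 through 7), the mask
      * written above works out to 0x1 | 0x2 | 0x4 | 0xe0 = 0xe7.
      */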
3078 
3079 static void
3080 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
3081 {
3082         struct cpuid_info *cpi;
3083 
3084         cpi = cpu->cpu_m.mcpu_cpi;
3085 
3086         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3087                 cpuid_gather_amd_topology_leaves(cpu);
3088         }
3089 
3090         cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3091 
3092         /*
3093          * Before we can calculate the IDs that we should assign to this
3094          * processor, we need to understand how many cores and threads it has.
3095          */
3096         switch (cpi->cpi_vendor) {
3097         case X86_VENDOR_Intel:
3098                 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3099                     &cpi->cpi_ncore_per_chip);
3100                 break;
3101         case X86_VENDOR_AMD:
3102                 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3103                     &cpi->cpi_ncore_per_chip);
3104                 break;
3105         default:
3106                 /*
3107                  * If we have some other x86-compatible chip, it's not clear how it
3108                  * would behave. The most common case is virtualization
3109                  * today, though there are also 64-bit VIA chips. Assume that
3110                  * all we can get is the basic Leaf 1 HTT information.
3111                  */
3112                 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3113                         cpi->cpi_ncore_per_chip = 1;
3114                         cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3115                 }
3116                 break;
3117         }
3118 
3119         /*
3120          * Based on the calculated number of threads and cores, potentially
3121          * assign the HTT and CMT features.
3122          */
3123         if (cpi->cpi_ncore_per_chip > 1) {
3124                 add_x86_feature(featureset, X86FSET_CMP);
3125         }
3126 
3127         if (cpi->cpi_ncpu_per_chip > 1 &&
3128             cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3129                 add_x86_feature(featureset, X86FSET_HTT);
3130         }
3131 
3132         /*
3133          * Now that this has been set up, we need to go through and calculate all
3134          * the rest of the parameters that exist. If we think the CPU doesn't
3135          * have either SMT (HTT) or CMP, then we basically go through and fake
3136          * up information in some way. The most likely case for this is
3137          * virtualization where we have a lot of partial topology information.
3138          */
3139         if (!is_x86_feature(featureset, X86FSET_HTT) &&
3140             !is_x86_feature(featureset, X86FSET_CMP)) {
3141                 /*
3142                  * This is a single core, single-threaded processor.
3143                  */
3144                 cpi->cpi_procnodes_per_pkg = 1;
3145                 cpi->cpi_cores_per_compunit = 1;
3146                 cpi->cpi_compunitid = 0;
3147                 cpi->cpi_chipid = -1;
3148                 cpi->cpi_clogid = 0;
3149                 cpi->cpi_coreid = cpu->cpu_id;
3150                 cpi->cpi_pkgcoreid = 0;
3151                 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3152                         cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3153                 } else {
3154                         cpi->cpi_procnodeid = cpi->cpi_chipid;
3155                 }
3156         } else {
3157                 switch (cpi->cpi_vendor) {
3158                 case X86_VENDOR_Intel:
3159                         cpuid_intel_getids(cpu, featureset);
3160                         break;
3161                 case X86_VENDOR_AMD:
3162                         cpuid_amd_getids(cpu, featureset);
3163                         break;
3164                 default:
3165                         /*
3166                          * In this case, it's hard to say what we should do.
3167                          * We're going to model them to the OS as single core
3168                          * threads. We don't have a good identifier for them, so
3169                          * we're just going to use the cpu id all on a single
3170                          * chip.
3171                          *
3172                          * This case has historically been different from the
3173                          * case above where we don't have HTT or CMP. While they
3174                          * could be combined, we've opted to keep it separate to
3175                          * minimize the risk of topology changes in weird cases.
3176                          */
3177                         cpi->cpi_procnodes_per_pkg = 1;
3178                         cpi->cpi_cores_per_compunit = 1;
3179                         cpi->cpi_chipid = 0;
3180                         cpi->cpi_coreid = cpu->cpu_id;
3181                         cpi->cpi_clogid = cpu->cpu_id;
3182                         cpi->cpi_pkgcoreid = cpu->cpu_id;
3183                         cpi->cpi_procnodeid = cpi->cpi_chipid;
3184                         cpi->cpi_compunitid = cpi->cpi_coreid;
3185                         break;
3186                 }
3187         }
3188 }
3189 
3190 /*
3191  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3192  * always gather leaf 6 if it's supported; however, we only look for features on
3193  * Intel systems as AMD does not currently define any of the features we look
3194  * for below.
3195  */
3196 static void
3197 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3198 {
3199         struct cpuid_regs *cp;
3200         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3201 
3202         if (cpi->cpi_maxeax < 6) {
3203                 return;
3204         }
3205 
3206         cp = &cpi->cpi_std[6];
3207         cp->cp_eax = 6;
3208         cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3209         (void) __cpuid_insn(cp);
3210         platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3211 
3212         if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3213                 return;
3214         }
3215 
3216         if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3217                 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3218         }
3219 
3220         if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3221                 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3222         }
3223 }
3224 
3225 void
3226 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3227 {
3228         uint32_t mask_ecx, mask_edx;
3229         struct cpuid_info *cpi;
3230         struct cpuid_regs *cp;
3231         int xcpuid;
3232 #if !defined(__xpv)
3233         extern int idle_cpu_prefer_mwait;
3234 #endif
3235 
3236         /*
3237          * Space statically allocated for BSP, ensure pointer is set
3238          */
3239         if (cpu->cpu_id == 0) {
3240                 if (cpu->cpu_m.mcpu_cpi == NULL)
3241                         cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3242         }
3243 
3244         add_x86_feature(featureset, X86FSET_CPUID);
3245 
3246         cpi = cpu->cpu_m.mcpu_cpi;
3247         ASSERT(cpi != NULL);
3248         cp = &cpi->cpi_std[0];
3249         cp->cp_eax = 0;
3250         cpi->cpi_maxeax = __cpuid_insn(cp);
3251         {
3252                 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3253                 *iptr++ = cp->cp_ebx;
3254                 *iptr++ = cp->cp_edx;
3255                 *iptr++ = cp->cp_ecx;
3256                 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3257         }
3258 
3259         cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3260         x86_vendor = cpi->cpi_vendor; /* for compatibility */
3261 
3262         /*
3263          * Limit the range in case of weird hardware
3264          */
3265         if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3266                 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3267         if (cpi->cpi_maxeax < 1)
3268                 goto pass1_done;
3269 
3270         cp = &cpi->cpi_std[1];
3271         cp->cp_eax = 1;
3272         (void) __cpuid_insn(cp);
3273 
3274         /*
3275          * Extract identifying constants for easy access.
3276          */
3277         cpi->cpi_model = CPI_MODEL(cpi);
3278         cpi->cpi_family = CPI_FAMILY(cpi);
3279 
3280         if (cpi->cpi_family == 0xf)
3281                 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3282 
3283         /*
3284          * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3285          * Intel, and presumably everyone else, uses model == 0xf, as
3286          * one would expect (max value means possible overflow).  Sigh.
3287          */
3288 
3289         switch (cpi->cpi_vendor) {
3290         case X86_VENDOR_Intel:
3291                 if (IS_EXTENDED_MODEL_INTEL(cpi))
3292                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3293                 break;
3294         case X86_VENDOR_AMD:
3295                 if (CPI_FAMILY(cpi) == 0xf)
3296                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3297                 break;
3298         default:
3299                 if (cpi->cpi_model == 0xf)
3300                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3301                 break;
3302         }
3303 
3304         cpi->cpi_step = CPI_STEP(cpi);
3305         cpi->cpi_brandid = CPI_BRANDID(cpi);
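
             /*
              * As a worked example of the extended family/model handling
              * above, take a hypothetical AMD part whose leaf 1 %eax encodes a
              * base family of 0xf, an extended family of 0x8, a base model of
              * 0x1, and an extended model of 0x3: cpi_family becomes
              * 0xf + 0x8 = 0x17 and cpi_model becomes 0x1 + (0x3 << 4) = 0x31.
              */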
3306 
3307         /*
3308          * *default* assumptions:
3309          * - believe %edx feature word
3310          * - ignore %ecx feature word
3311          * - 32-bit virtual and physical addressing
3312          */
3313         mask_edx = 0xffffffff;
3314         mask_ecx = 0;
3315 
3316         cpi->cpi_pabits = cpi->cpi_vabits = 32;
3317 
3318         switch (cpi->cpi_vendor) {
3319         case X86_VENDOR_Intel:
3320                 if (cpi->cpi_family == 5)
3321                         x86_type = X86_TYPE_P5;
3322                 else if (IS_LEGACY_P6(cpi)) {
3323                         x86_type = X86_TYPE_P6;
3324                         pentiumpro_bug4046376 = 1;
3325                         /*
3326                          * Clear the SEP bit when it was set erroneously
3327                          */
3328                         if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3329                                 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3330                 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3331                         x86_type = X86_TYPE_P4;
3332                         /*
3333                          * We don't currently depend on any of the %ecx
3334                          * features until Prescott, so we'll only check
3335                          * this from P4 onwards.  We might want to revisit
3336                          * that idea later.
3337                          */
3338                         mask_ecx = 0xffffffff;
3339                 } else if (cpi->cpi_family > 0xf)
3340                         mask_ecx = 0xffffffff;
3341                 /*
3342                  * We don't support MONITOR/MWAIT if leaf 5 is not available
3343                  * to obtain the monitor linesize.
3344                  */
3345                 if (cpi->cpi_maxeax < 5)
3346                         mask_ecx &= ~CPUID_INTC_ECX_MON;
3347                 break;
3348         case X86_VENDOR_IntelClone:
3349         default:
3350                 break;
3351         case X86_VENDOR_AMD:
3352 #if defined(OPTERON_ERRATUM_108)
3353                 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3354                         cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3355                         cpi->cpi_model = 0xc;
3356                 } else
3357 #endif
3358                 if (cpi->cpi_family == 5) {
3359                         /*
3360                          * AMD K5 and K6
3361                          *
3362                          * These CPUs have an incomplete implementation
3363                          * of MCA/MCE which we mask away.
3364                          */
3365                         mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3366 
3367                         /*
3368                          * Model 0 uses the wrong (APIC) bit
3369                          * to indicate PGE.  Fix it here.
3370                          */
3371                         if (cpi->cpi_model == 0) {
3372                                 if (cp->cp_edx & 0x200) {
3373                                         cp->cp_edx &= ~0x200;
3374                                         cp->cp_edx |= CPUID_INTC_EDX_PGE;
3375                                 }
3376                         }
3377 
3378                         /*
3379                          * Early models had problems w/ MMX; disable.
3380                          */
3381                         if (cpi->cpi_model < 6)
3382                                 mask_edx &= ~CPUID_INTC_EDX_MMX;
3383                 }
3384 
3385                 /*
3386                  * For newer families, SSE3 and CX16, at least, are valid;
3387                  * enable all
3388                  */
3389                 if (cpi->cpi_family >= 0xf)
3390                         mask_ecx = 0xffffffff;
3391                 /*
3392                  * We don't support MONITOR/MWAIT if leaf 5 is not available
3393                  * to obtain the monitor linesize.
3394                  */
3395                 if (cpi->cpi_maxeax < 5)
3396                         mask_ecx &= ~CPUID_INTC_ECX_MON;
3397 
3398 #if !defined(__xpv)
3399                 /*
3400                  * AMD has not historically used MWAIT in the CPU's idle loop.
3401                  * Pre-family-10h Opterons do not have the MWAIT instruction. We
3402                  * know for certain that in at least family 17h, per AMD, mwait
3403                  * is preferred. Families in-between are less certain.
3404                  */
3405                 if (cpi->cpi_family < 0x17) {
3406                         idle_cpu_prefer_mwait = 0;
3407                 }
3408 #endif
3409 
3410                 break;
3411         case X86_VENDOR_TM:
3412                 /*
3413                  * workaround the NT workaround in CMS 4.1
3414                  */
3415                 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3416                     (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3417                         cp->cp_edx |= CPUID_INTC_EDX_CX8;
3418                 break;
3419         case X86_VENDOR_Centaur:
3420                 /*
3421                  * workaround the NT workarounds again
3422                  */
3423                 if (cpi->cpi_family == 6)
3424                         cp->cp_edx |= CPUID_INTC_EDX_CX8;
3425                 break;
3426         case X86_VENDOR_Cyrix:
3427                 /*
3428                  * We rely heavily on the probing in locore
3429                  * to actually figure out what parts, if any,
3430                  * of the Cyrix cpuid instruction to believe.
3431                  */
3432                 switch (x86_type) {
3433                 case X86_TYPE_CYRIX_486:
3434                         mask_edx = 0;
3435                         break;
3436                 case X86_TYPE_CYRIX_6x86:
3437                         mask_edx = 0;
3438                         break;
3439                 case X86_TYPE_CYRIX_6x86L:
3440                         mask_edx =
3441                             CPUID_INTC_EDX_DE |
3442                             CPUID_INTC_EDX_CX8;
3443                         break;
3444                 case X86_TYPE_CYRIX_6x86MX:
3445                         mask_edx =
3446                             CPUID_INTC_EDX_DE |
3447                             CPUID_INTC_EDX_MSR |
3448                             CPUID_INTC_EDX_CX8 |
3449                             CPUID_INTC_EDX_PGE |
3450                             CPUID_INTC_EDX_CMOV |
3451                             CPUID_INTC_EDX_MMX;
3452                         break;
3453                 case X86_TYPE_CYRIX_GXm:
3454                         mask_edx =
3455                             CPUID_INTC_EDX_MSR |
3456                             CPUID_INTC_EDX_CX8 |
3457                             CPUID_INTC_EDX_CMOV |
3458                             CPUID_INTC_EDX_MMX;
3459                         break;
3460                 case X86_TYPE_CYRIX_MediaGX:
3461                         break;
3462                 case X86_TYPE_CYRIX_MII:
3463                 case X86_TYPE_VIA_CYRIX_III:
3464                         mask_edx =
3465                             CPUID_INTC_EDX_DE |
3466                             CPUID_INTC_EDX_TSC |
3467                             CPUID_INTC_EDX_MSR |
3468                             CPUID_INTC_EDX_CX8 |
3469                             CPUID_INTC_EDX_PGE |
3470                             CPUID_INTC_EDX_CMOV |
3471                             CPUID_INTC_EDX_MMX;
3472                         break;
3473                 default:
3474                         break;
3475                 }
3476                 break;
3477         }
3478 
3479 #if defined(__xpv)
3480         /*
3481          * Do not support MONITOR/MWAIT under a hypervisor
3482          */
3483         mask_ecx &= ~CPUID_INTC_ECX_MON;
3484         /*
3485          * Do not support XSAVE under a hypervisor for now
3486          */
3487         xsave_force_disable = B_TRUE;
3488 
3489 #endif  /* __xpv */
3490 
3491         if (xsave_force_disable) {
3492                 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3493                 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3494                 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3495                 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3496         }
3497 
3498         /*
3499          * Now that we've figured out the masks that determine
3500          * which bits we choose to believe, apply the masks
3501          * to the feature words, then map the kernel's view
3502          * of these feature words into its feature word.
3503          */
3504         cp->cp_edx &= mask_edx;
3505         cp->cp_ecx &= mask_ecx;
3506 
3507         /*
3508          * apply any platform restrictions (we don't call this
3509          * immediately after __cpuid_insn here, because we need the
3510          * workarounds applied above first)
3511          */
3512         platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3513 
3514         /*
3515          * In addition to leaf 1's %ecx and %edx, Intel and AMD are storing a
3516          * bunch of instruction set extensions in leaf 7's %ebx, %ecx, and %edx.
3517          */
3518         if (cpi->cpi_maxeax >= 7) {
3519                 struct cpuid_regs *ecp;
3520                 ecp = &cpi->cpi_std[7];
3521                 ecp->cp_eax = 7;
3522                 ecp->cp_ecx = 0;
3523                 (void) __cpuid_insn(ecp);
3524 
3525                 /*
3526                  * If XSAVE has been disabled, just ignore all of the
3527                  * extended-save-area dependent flags here.
3528                  */
3529                 if (xsave_force_disable) {
3530                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3531                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3532                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3533                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3534                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3535                         ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3536                         ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3537                 }
3538 
3539                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3540                         add_x86_feature(featureset, X86FSET_SMEP);
3541 
3542                 /*
3543                  * We check disable_smap here in addition to in startup_smap()
3544                  * to ensure CPUs that aren't the boot CPU don't accidentally
3545                  * include it in the feature set and thus generate a mismatched
3546                  * x86 feature set across CPUs.
3547                  */
3548                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3549                     disable_smap == 0)
3550                         add_x86_feature(featureset, X86FSET_SMAP);
3551 
3552                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3553                         add_x86_feature(featureset, X86FSET_RDSEED);
3554 
3555                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3556                         add_x86_feature(featureset, X86FSET_ADX);
3557 
3558                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3559                         add_x86_feature(featureset, X86FSET_FSGSBASE);
3560 
3561                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3562                         add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3563 
3564                 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3565                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3566                                 add_x86_feature(featureset, X86FSET_INVPCID);
3567 
3568                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3569                                 add_x86_feature(featureset, X86FSET_MPX);
3570 
3571                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3572                                 add_x86_feature(featureset, X86FSET_CLWB);
3573                 }
3574         }
3575 
3576         /*
3577          * fold in overrides from the "eeprom" mechanism
3578          */
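             /*
              * The cpuid_feature_{edx,ecx}_{include,exclude} words below are
              * global tunables; the intent of the "eeprom" mechanism is that
              * individual feature bits can be forced on (include) or off
              * (exclude) from the boot environment without rebuilding the
              * kernel.
              */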
3579         cp->cp_edx |= cpuid_feature_edx_include;
3580         cp->cp_edx &= ~cpuid_feature_edx_exclude;
3581 
3582         cp->cp_ecx |= cpuid_feature_ecx_include;
3583         cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3584 
3585         if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3586                 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3587         }
3588         if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3589                 add_x86_feature(featureset, X86FSET_TSC);
3590         }
3591         if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3592                 add_x86_feature(featureset, X86FSET_MSR);
3593         }
3594         if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3595                 add_x86_feature(featureset, X86FSET_MTRR);
3596         }
3597         if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3598                 add_x86_feature(featureset, X86FSET_PGE);
3599         }
3600         if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3601                 add_x86_feature(featureset, X86FSET_CMOV);
3602         }
3603         if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3604                 add_x86_feature(featureset, X86FSET_MMX);
3605         }
3606         if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3607             (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3608                 add_x86_feature(featureset, X86FSET_MCA);
3609         }
3610         if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3611                 add_x86_feature(featureset, X86FSET_PAE);
3612         }
3613         if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3614                 add_x86_feature(featureset, X86FSET_CX8);
3615         }
3616         if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3617                 add_x86_feature(featureset, X86FSET_CX16);
3618         }
3619         if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3620                 add_x86_feature(featureset, X86FSET_PAT);
3621         }
3622         if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3623                 add_x86_feature(featureset, X86FSET_SEP);
3624         }
3625         if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3626                 /*
3627                  * In our implementation, fxsave/fxrstor
3628                  * are prerequisites before we'll even
3629                  * try to do SSE things.
3630                  */
3631                 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3632                         add_x86_feature(featureset, X86FSET_SSE);
3633                 }
3634                 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3635                         add_x86_feature(featureset, X86FSET_SSE2);
3636                 }
3637                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3638                         add_x86_feature(featureset, X86FSET_SSE3);
3639                 }
3640                 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3641                         add_x86_feature(featureset, X86FSET_SSSE3);
3642                 }
3643                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3644                         add_x86_feature(featureset, X86FSET_SSE4_1);
3645                 }
3646                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3647                         add_x86_feature(featureset, X86FSET_SSE4_2);
3648                 }
3649                 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3650                         add_x86_feature(featureset, X86FSET_AES);
3651                 }
3652                 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3653                         add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3654                 }
3655 
3656                 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3657                         add_x86_feature(featureset, X86FSET_SHA);
3658 
3659                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3660                         add_x86_feature(featureset, X86FSET_UMIP);
3661                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3662                         add_x86_feature(featureset, X86FSET_PKU);
3663                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3664                         add_x86_feature(featureset, X86FSET_OSPKE);
3665 
3666                 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3667                         add_x86_feature(featureset, X86FSET_XSAVE);
3668 
3669                         /* We only test AVX & AVX512 when there is XSAVE */
3670 
3671                         if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3672                                 add_x86_feature(featureset,
3673                                     X86FSET_AVX);
3674 
3675                                 /*
3676                                  * Intel says we can't check these without also
3677                                  * checking AVX.
3678                                  */
3679                                 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3680                                         add_x86_feature(featureset,
3681                                             X86FSET_F16C);
3682 
3683                                 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3684                                         add_x86_feature(featureset,
3685                                             X86FSET_FMA);
3686 
3687                                 if (cpi->cpi_std[7].cp_ebx &
3688                                     CPUID_INTC_EBX_7_0_BMI1)
3689                                         add_x86_feature(featureset,
3690                                             X86FSET_BMI1);
3691 
3692                                 if (cpi->cpi_std[7].cp_ebx &
3693                                     CPUID_INTC_EBX_7_0_BMI2)
3694                                         add_x86_feature(featureset,
3695                                             X86FSET_BMI2);
3696 
3697                                 if (cpi->cpi_std[7].cp_ebx &
3698                                     CPUID_INTC_EBX_7_0_AVX2)
3699                                         add_x86_feature(featureset,
3700                                             X86FSET_AVX2);
3701                         }
3702 
3703                         if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3704                             (cpi->cpi_std[7].cp_ebx &
3705                             CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3706                                 add_x86_feature(featureset, X86FSET_AVX512F);
3707 
3708                                 if (cpi->cpi_std[7].cp_ebx &
3709                                     CPUID_INTC_EBX_7_0_AVX512DQ)
3710                                         add_x86_feature(featureset,
3711                                             X86FSET_AVX512DQ);
3712                                 if (cpi->cpi_std[7].cp_ebx &
3713                                     CPUID_INTC_EBX_7_0_AVX512IFMA)
3714                                         add_x86_feature(featureset,
3715                                             X86FSET_AVX512FMA);
3716                                 if (cpi->cpi_std[7].cp_ebx &
3717                                     CPUID_INTC_EBX_7_0_AVX512PF)
3718                                         add_x86_feature(featureset,
3719                                             X86FSET_AVX512PF);
3720                                 if (cpi->cpi_std[7].cp_ebx &
3721                                     CPUID_INTC_EBX_7_0_AVX512ER)
3722                                         add_x86_feature(featureset,
3723                                             X86FSET_AVX512ER);
3724                                 if (cpi->cpi_std[7].cp_ebx &
3725                                     CPUID_INTC_EBX_7_0_AVX512CD)
3726                                         add_x86_feature(featureset,
3727                                             X86FSET_AVX512CD);
3728                                 if (cpi->cpi_std[7].cp_ebx &
3729                                     CPUID_INTC_EBX_7_0_AVX512BW)
3730                                         add_x86_feature(featureset,
3731                                             X86FSET_AVX512BW);
3732                                 if (cpi->cpi_std[7].cp_ebx &
3733                                     CPUID_INTC_EBX_7_0_AVX512VL)
3734                                         add_x86_feature(featureset,
3735                                             X86FSET_AVX512VL);
3736 
3737                                 if (cpi->cpi_std[7].cp_ecx &
3738                                     CPUID_INTC_ECX_7_0_AVX512VBMI)
3739                                         add_x86_feature(featureset,
3740                                             X86FSET_AVX512VBMI);
3741                                 if (cpi->cpi_std[7].cp_ecx &
3742                                     CPUID_INTC_ECX_7_0_AVX512VNNI)
3743                                         add_x86_feature(featureset,
3744                                             X86FSET_AVX512VNNI);
3745                                 if (cpi->cpi_std[7].cp_ecx &
3746                                     CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3747                                         add_x86_feature(featureset,
3748                                             X86FSET_AVX512VPOPCDQ);
3749 
3750                                 if (cpi->cpi_std[7].cp_edx &
3751                                     CPUID_INTC_EDX_7_0_AVX5124NNIW)
3752                                         add_x86_feature(featureset,
3753                                             X86FSET_AVX512NNIW);
3754                                 if (cpi->cpi_std[7].cp_edx &
3755                                     CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3756                                         add_x86_feature(featureset,
3757                                             X86FSET_AVX512FMAPS);
3758                         }
3759                 }
3760         }
3761 
3762         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3763                 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3764                         add_x86_feature(featureset, X86FSET_PCID);
3765                 }
3766         }
3767 
3768         if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3769                 add_x86_feature(featureset, X86FSET_X2APIC);
3770         }
3771         if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3772                 add_x86_feature(featureset, X86FSET_DE);
3773         }
3774 #if !defined(__xpv)
3775         if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3776 
3777                 /*
3778                  * We require the CLFLUSH instruction for the erratum
3779                  * workaround in order to use MONITOR/MWAIT.
3780                  */
3781                 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3782                         cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3783                         add_x86_feature(featureset, X86FSET_MWAIT);
3784                 } else {
3785                         extern int idle_cpu_assert_cflush_monitor;
3786 
3787                         /*
3788                          * All processors we are aware of which have
3789                          * MONITOR/MWAIT also have CLFLUSH.
3790                          */
3791                         if (idle_cpu_assert_cflush_monitor) {
3792                                 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3793                                     (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3794                         }
3795                 }
3796         }
3797 #endif  /* __xpv */
3798 
3799         if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3800                 add_x86_feature(featureset, X86FSET_VMX);
3801         }
3802 
3803         if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3804                 add_x86_feature(featureset, X86FSET_RDRAND);
3805 
3806         /*
3807          * Only needed the first time; the rest of the CPUs follow suit.
3808          * We only capture this for the boot CPU.
3809          */
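             /*
              * For reference, leaf 1 %ebx bits 15:8 report the CLFLUSH line
              * size in units of 8 bytes, so a typical reported value of 8
              * yields an x86_clflush_size of 64 bytes.
              */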
3810         if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3811                 add_x86_feature(featureset, X86FSET_CLFSH);
3812                 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3813         }
3814         if (is_x86_feature(featureset, X86FSET_PAE))
3815                 cpi->cpi_pabits = 36;
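             /*
              * 36 bits is only a default guess for PAE-capable processors;
              * it is overridden below by extended leaf 0x80000008 when that
              * leaf is available.
              */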
3816 
3817         if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3818                 struct cpuid_regs r, *ecp;
3819 
3820                 ecp = &r;
3821                 ecp->cp_eax = 0xD;
3822                 ecp->cp_ecx = 1;
3823                 ecp->cp_edx = ecp->cp_ebx = 0;
3824                 (void) __cpuid_insn(ecp);
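                     /*
                      * Leaf 0xD, subleaf 1, %eax enumerates the XSAVE
                      * extensions: bit 0 is XSAVEOPT, bit 1 is XSAVEC
                      * (compacted form), bit 2 is XGETBV with %ecx == 1 and
                      * bit 3 is XSAVES/XRSTORS.
                      */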
3825 
3826                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3827                         add_x86_feature(featureset, X86FSET_XSAVEOPT);
3828                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3829                         add_x86_feature(featureset, X86FSET_XSAVEC);
3830                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3831                         add_x86_feature(featureset, X86FSET_XSAVES);
3832         }
3833 
3834         /*
3835          * Work on the "extended" feature information, doing
3836          * some basic initialization for cpuid_pass2()
3837          */
3838         xcpuid = 0;
3839         switch (cpi->cpi_vendor) {
3840         case X86_VENDOR_Intel:
3841                 /*
3842                  * On KVM we know we will have proper support for extended
3843                  * cpuid.
3844                  */
3845                 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3846                     (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3847                     (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3848                         xcpuid++;
3849                 break;
3850         case X86_VENDOR_AMD:
3851                 if (cpi->cpi_family > 5 ||
3852                     (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3853                         xcpuid++;
3854                 break;
3855         case X86_VENDOR_Cyrix:
3856                 /*
3857                  * Only these Cyrix CPUs are -known- to support
3858                  * extended cpuid operations.
3859                  */
3860                 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3861                     x86_type == X86_TYPE_CYRIX_GXm)
3862                         xcpuid++;
3863                 break;
3864         case X86_VENDOR_Centaur:
3865         case X86_VENDOR_TM:
3866         default:
3867                 xcpuid++;
3868                 break;
3869         }
3870 
3871         if (xcpuid) {
3872                 cp = &cpi->cpi_extd[0];
3873                 cp->cp_eax = CPUID_LEAF_EXT_0;
3874                 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3875         }
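             /*
              * Leaf 0x80000000 returns the highest supported extended leaf in
              * %eax; if bit 31 of that value is clear, the processor has no
              * extended cpuid space at all, which is what the check below
              * catches.
              */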
3876 
3877         if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3878 
3879                 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3880                         cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3881 
3882                 switch (cpi->cpi_vendor) {
3883                 case X86_VENDOR_Intel:
3884                 case X86_VENDOR_AMD:
3885                         if (cpi->cpi_xmaxeax < 0x80000001)
3886                                 break;
3887                         cp = &cpi->cpi_extd[1];
3888                         cp->cp_eax = 0x80000001;
3889                         (void) __cpuid_insn(cp);
3890 
3891                         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3892                             cpi->cpi_family == 5 &&
3893                             cpi->cpi_model == 6 &&
3894                             cpi->cpi_step == 6) {
3895                                 /*
3896                                  * K6 model 6 uses bit 10 to indicate SYSC;
3897                                  * later models use bit 11. Fix it here.
3898                                  */
3899                                 if (cp->cp_edx & 0x400) {
3900                                         cp->cp_edx &= ~0x400;
3901                                         cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3902                                 }
3903                         }
3904 
3905                         platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3906 
3907                         /*
3908                          * Compute the additions to the kernel's feature word.
3909                          */
3910                         if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3911                                 add_x86_feature(featureset, X86FSET_NX);
3912                         }
3913 
3914                         /*
3915                          * Regardless of whether or not we boot 64-bit,
3916                          * we should have a way to identify whether
3917                          * the CPU is capable of running 64-bit.
3918                          */
3919                         if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3920                                 add_x86_feature(featureset, X86FSET_64);
3921                         }
3922 
3923                         /* 1 GB large page - enable only for 64 bit kernel */
3924                         if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3925                                 add_x86_feature(featureset, X86FSET_1GPG);
3926                         }
3927 
3928                         if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3929                             (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3930                             (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3931                                 add_x86_feature(featureset, X86FSET_SSE4A);
3932                         }
3933 
3934                         /*
3935                          * It's really tricky to support syscall/sysret in
3936                          * the i386 kernel; we rely on sysenter/sysexit
3937                          * instead.  In the amd64 kernel, things are -way-
3938                          * better.
3939                          */
3940                         if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3941                                 add_x86_feature(featureset, X86FSET_ASYSC);
3942                         }
3943 
3944                         /*
3945                          * While we're thinking about system calls, note
3946                          * that AMD processors don't support sysenter
3947                          * in long mode at all, so don't try to program them.
3948                          */
3949                         if (x86_vendor == X86_VENDOR_AMD) {
3950                                 remove_x86_feature(featureset, X86FSET_SEP);
3951                         }
3952 
3953                         if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3954                                 add_x86_feature(featureset, X86FSET_TSCP);
3955                         }
3956 
3957                         if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3958                                 add_x86_feature(featureset, X86FSET_SVM);
3959                         }
3960 
3961                         if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3962                                 add_x86_feature(featureset, X86FSET_TOPOEXT);
3963                         }
3964 
3965                         if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3966                                 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3967                         }
3968 
3969                         if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3970                                 add_x86_feature(featureset, X86FSET_XOP);
3971                         }
3972 
3973                         if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3974                                 add_x86_feature(featureset, X86FSET_FMA4);
3975                         }
3976 
3977                         if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3978                                 add_x86_feature(featureset, X86FSET_TBM);
3979                         }
3980 
3981                         if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3982                                 add_x86_feature(featureset, X86FSET_MONITORX);
3983                         }
3984                         break;
3985                 default:
3986                         break;
3987                 }
3988 
3989                 /*
3990                  * Get CPUID data about processor cores and hyperthreads.
3991                  */
3992                 switch (cpi->cpi_vendor) {
3993                 case X86_VENDOR_Intel:
3994                         if (cpi->cpi_maxeax >= 4) {
3995                                 cp = &cpi->cpi_std[4];
3996                                 cp->cp_eax = 4;
3997                                 cp->cp_ecx = 0;
3998                                 (void) __cpuid_insn(cp);
3999                                 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4000                         }
4001                         /*FALLTHROUGH*/
4002                 case X86_VENDOR_AMD:
4003                         if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4004                                 break;
4005                         cp = &cpi->cpi_extd[8];
4006                         cp->cp_eax = CPUID_LEAF_EXT_8;
4007                         (void) __cpuid_insn(cp);
4008                         platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4009                             cp);
4010 
4011                         /*
4012                          * AMD uses ebx for some extended functions.
4013                          */
4014                         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4015                                 /*
4016                                  * While we're here, check for the AMD "Error
4017                                  * Pointer Zero/Restore" feature. This can be
4018                                  * used to set up the FP save handlers
4019                                  * appropriately.
4020                                  */
4021                                 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4022                                         cpi->cpi_fp_amd_save = 0;
4023                                 } else {
4024                                         cpi->cpi_fp_amd_save = 1;
4025                                 }
4026 
4027                                 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4028                                         add_x86_feature(featureset,
4029                                             X86FSET_CLZERO);
4030                                 }
4031                         }
4032 
4033                         /*
4034                          * Virtual and physical address limits from
4035                          * cpuid override previously guessed values.
4036                          */
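                             /*
                              * Leaf 0x80000008 %eax packs the address widths:
                              * bits 7:0 give the physical address size and
                              * bits 15:8 the virtual (linear) address size.
                              * For example, a value of 0x3030 means 48-bit
                              * virtual and 48-bit physical addressing.
                              */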
4037                         cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4038                         cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4039                         break;
4040                 default:
4041                         break;
4042                 }
4043 
4044                 /*
4045                  * Get CPUID data about TSC Invariance in Deep C-State.
4046                  */
4047                 switch (cpi->cpi_vendor) {
4048                 case X86_VENDOR_Intel:
4049                 case X86_VENDOR_AMD:
4050                         if (cpi->cpi_maxeax >= 7) {
4051                                 cp = &cpi->cpi_extd[7];
4052                                 cp->cp_eax = 0x80000007;
4053                                 cp->cp_ecx = 0;
4054                                 (void) __cpuid_insn(cp);
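                                     /*
                                      * The datum of interest here is %edx
                                      * bit 8 (invariant TSC), indicating that
                                      * the TSC rate stays constant across P-,
                                      * C- and T-state transitions.
                                      */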
4055                         }
4056                         break;
4057                 default:
4058                         break;
4059                 }
4060         }
4061 
4062         cpuid_pass1_topology(cpu, featureset);
4063         cpuid_pass1_thermal(cpu, featureset);
4064 
4065         /*
4066          * Synthesize chip "revision" and socket type
4067          */
4068         cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4069             cpi->cpi_model, cpi->cpi_step);
4070         cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4071             cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4072         cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4073             cpi->cpi_model, cpi->cpi_step);
4074 
4075         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4076                 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4077                     cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4078                         /* Special handling for AMD FP not necessary. */
4079                         cpi->cpi_fp_amd_save = 0;
4080                 } else {
4081                         cpi->cpi_fp_amd_save = 1;
4082                 }
4083         }
4084 
4085         /*
4086          * Check the processor leaves that are used for security features.
4087          */
4088         cpuid_scan_security(cpu, featureset);
4089 
4090 pass1_done:
4091         cpi->cpi_pass = 1;
4092 }
4093 
4094 /*
4095  * Make copies of the cpuid table entries we depend on, in
4096  * part for ease of parsing now, in part so that we have only
4097  * one place to correct any of it, in part for ease of
4098  * later export to userland, and in part so we can look at
4099  * this stuff in a crash dump.
4100  */
4101 
4102 /*ARGSUSED*/
4103 void
4104 cpuid_pass2(cpu_t *cpu)
4105 {
4106         uint_t n, nmax;
4107         int i;
4108         struct cpuid_regs *cp;
4109         uint8_t *dp;
4110         uint32_t *iptr;
4111         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4112 
4113         ASSERT(cpi->cpi_pass == 1);
4114 
4115         if (cpi->cpi_maxeax < 1)
4116                 goto pass2_done;
4117 
4118         if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4119                 nmax = NMAX_CPI_STD;
4120         /*
4121          * (We already handled n == 0 and n == 1 in pass 1)
4122          */
4123         for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4124                 /*
4125                  * leaves 6 and 7 were handled in pass 1
4126                  */
4127                 if (n == 6 || n == 7)
4128                         continue;
4129 
4130                 cp->cp_eax = n;
4131 
4132                 /*
4133                  * CPUID function 4 expects %ecx to be initialized
4134                  * with an index which indicates which cache to return
4135                  * information about. The OS is expected to call function 4
4136                  * with %ecx set to 0, 1, 2, ... until it returns with
4137                  * EAX[4:0] set to 0, which indicates there are no more
4138                  * caches.
4139                  *
4140                  * Here, populate cpi_std[4] with the information returned by
4141                  * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
4142                  * when dynamic memory allocation becomes available.
4143                  *
4144                  * Note: we need to explicitly initialize %ecx here, since
4145                  * function 4 may have been previously invoked.
4146                  */
4147                 if (n == 4)
4148                         cp->cp_ecx = 0;
4149 
4150                 (void) __cpuid_insn(cp);
4151                 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4152                 switch (n) {
4153                 case 2:
4154                         /*
4155                          * "the lower 8 bits of the %eax register
4156                          * contain a value that identifies the number
4157                          * of times the cpuid [instruction] has to be
4158                          * executed to obtain a complete image of the
4159                          * processor's caching systems."
4160                          *
4161                          * How *do* they make this stuff up?
4162                          */
4163                         cpi->cpi_ncache = sizeof (*cp) *
4164                             BITX(cp->cp_eax, 7, 0);
4165                         if (cpi->cpi_ncache == 0)
4166                                 break;
4167                         cpi->cpi_ncache--;   /* skip count byte */
4168 
4169                         /*
4170                          * Well, for now, rather than attempt to implement
4171                          * this slightly dubious algorithm, we just look
4172                          * at the first 15 descriptor bytes.
4173                          */
4174                         if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4175                                 cpi->cpi_ncache = sizeof (*cp) - 1;
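                             /*
                              * Each execution of leaf 2 yields sizeof (*cp)
                              * == 16 descriptor bytes (hence the
                              * multiplication above), and the four 32-bit
                              * registers can hold at most 15 descriptors
                              * besides the count byte in %al, hence the clamp.
                              * A register with bit 31 set carries no valid
                              * descriptors, and the %eax scan below starts at
                              * byte 1 to skip the count byte.
                              */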
4176 
4177                         dp = cpi->cpi_cacheinfo;
4178                         if (BITX(cp->cp_eax, 31, 31) == 0) {
4179                                 uint8_t *p = (void *)&cp->cp_eax;
4180                                 for (i = 1; i < 4; i++)
4181                                         if (p[i] != 0)
4182                                                 *dp++ = p[i];
4183                         }
4184                         if (BITX(cp->cp_ebx, 31, 31) == 0) {
4185                                 uint8_t *p = (void *)&cp->cp_ebx;
4186                                 for (i = 0; i < 4; i++)
4187                                         if (p[i] != 0)
4188                                                 *dp++ = p[i];
4189                         }
4190                         if (BITX(cp->cp_ecx, 31, 31) == 0) {
4191                                 uint8_t *p = (void *)&cp->cp_ecx;
4192                                 for (i = 0; i < 4; i++)
4193                                         if (p[i] != 0)
4194                                                 *dp++ = p[i];
4195                         }
4196                         if (BITX(cp->cp_edx, 31, 31) == 0) {
4197                                 uint8_t *p = (void *)&cp->cp_edx;
4198                                 for (i = 0; i < 4; i++)
4199                                         if (p[i] != 0)
4200                                                 *dp++ = p[i];
4201                         }
4202                         break;
4203 
4204                 case 3: /* Processor serial number, if PSN supported */
4205                         break;
4206 
4207                 case 4: /* Deterministic cache parameters */
4208                         break;
4209 
4210                 case 5: /* Monitor/Mwait parameters */
4211                 {
4212                         size_t mwait_size;
4213 
4214                         /*
4215                          * Check cpi_mwait.support, which was set in cpuid_pass1.
4216                          */
4217                         if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4218                                 break;
4219 
4220                         /*
4221                          * Protect ourselves from an insane mwait line size.
4222                          * Workaround for incomplete hardware emulator(s).
4223                          */
4224                         mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4225                         if (mwait_size < sizeof (uint32_t) ||
4226                             !ISP2(mwait_size)) {
4227 #if DEBUG
4228                                 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4229                                     "size %ld", cpu->cpu_id, (long)mwait_size);
4230 #endif
4231                                 break;
4232                         }
4233 
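                             /*
                              * Leaf 5 reports the smallest monitor line size
                              * in %eax[15:0] and the largest in %ebx[15:0];
                              * %ecx bit 0 enumerates the MWAIT extensions and
                              * bit 1 the ability to treat interrupts as break
                              * events even when masked, which is presumably
                              * what MWAIT_EXTENSION() and MWAIT_INT_ENABLE()
                              * test below.
                              */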
4234                         cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4235                         cpi->cpi_mwait.mon_max = mwait_size;
4236                         if (MWAIT_EXTENSION(cpi)) {
4237                                 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4238                                 if (MWAIT_INT_ENABLE(cpi))
4239                                         cpi->cpi_mwait.support |=
4240                                             MWAIT_ECX_INT_ENABLE;
4241                         }
4242                         break;
4243                 }
4244                 default:
4245                         break;
4246                 }
4247         }
4248 
4249         /*
4250          * XSAVE enumeration
4251          */
4252         if (cpi->cpi_maxeax >= 0xD) {
4253                 struct cpuid_regs regs;
4254                 boolean_t cpuid_d_valid = B_TRUE;
4255 
4256                 cp = &regs;
4257                 cp->cp_eax = 0xD;
4258                 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4259 
4260                 (void) __cpuid_insn(cp);
4261 
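                     /*
                      * In subleaf 0 of leaf 0xD, %eax and %edx together form
                      * the 64-bit mask of XCR0 bits the hardware supports
                      * (low and high halves) and %ecx is the save area size
                      * required if every supported feature were enabled;
                      * %ebx, unused here, is the size for the features
                      * currently enabled in XCR0.
                      */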
4262                 /*
4263                  * Sanity checks for debug
4264                  */
4265                 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4266                     (cp->cp_eax & XFEATURE_SSE) == 0) {
4267                         cpuid_d_valid = B_FALSE;
4268                 }
4269 
4270                 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4271                 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4272                 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4273 
4274                 /*
4275                  * If the hw supports AVX, get the size and offset in the save
4276                  * area for the ymm state.
4277                  */
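                     /*
                      * State component 2 is the upper 128 bits of
                      * %ymm0-%ymm15.  With the standard XSAVE layout that is
                      * 256 bytes at offset 576 (the 512-byte legacy region
                      * plus the 64-byte XSAVE header), which is presumably
                      * what the CPUID_LEAFD_2_YMM_{SIZE,OFFSET} constants
                      * checked below encode.
                      */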
4278                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4279                         cp->cp_eax = 0xD;
4280                         cp->cp_ecx = 2;
4281                         cp->cp_edx = cp->cp_ebx = 0;
4282 
4283                         (void) __cpuid_insn(cp);
4284 
4285                         if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4286                             cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4287                                 cpuid_d_valid = B_FALSE;
4288                         }
4289 
4290                         cpi->cpi_xsave.ymm_size = cp->cp_eax;
4291                         cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4292                 }
4293 
4294                 /*
4295                  * If the hw supports MPX, get the size and offset in the
4296                  * save area for BNDREGS and BNDCSR.
4297                  */
4298                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4299                         cp->cp_eax = 0xD;
4300                         cp->cp_ecx = 3;
4301                         cp->cp_edx = cp->cp_ebx = 0;
4302 
4303                         (void) __cpuid_insn(cp);
4304 
4305                         cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4306                         cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4307 
4308                         cp->cp_eax = 0xD;
4309                         cp->cp_ecx = 4;
4310                         cp->cp_edx = cp->cp_ebx = 0;
4311 
4312                         (void) __cpuid_insn(cp);
4313 
4314                         cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4315                         cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4316                 }
4317 
4318                 /*
4319                  * If the hw supports AVX512, get the size and offset in the
4320                  * save area for the opmask registers and zmm state.
4321                  */
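                     /*
                      * Components 5, 6 and 7 are the AVX-512 opmask registers
                      * (8 x 8 = 64 bytes), the upper 256 bits of %zmm0-%zmm15
                      * (16 x 32 = 512 bytes) and %zmm16-%zmm31 (16 x 64 =
                      * 1024 bytes) respectively.
                      */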
4322                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4323                         cp->cp_eax = 0xD;
4324                         cp->cp_ecx = 5;
4325                         cp->cp_edx = cp->cp_ebx = 0;
4326 
4327                         (void) __cpuid_insn(cp);
4328 
4329                         cpi->cpi_xsave.opmask_size = cp->cp_eax;
4330                         cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4331 
4332                         cp->cp_eax = 0xD;
4333                         cp->cp_ecx = 6;
4334                         cp->cp_edx = cp->cp_ebx = 0;
4335 
4336                         (void) __cpuid_insn(cp);
4337 
4338                         cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4339                         cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4340 
4341                         cp->cp_eax = 0xD;
4342                         cp->cp_ecx = 7;
4343                         cp->cp_edx = cp->cp_ebx = 0;
4344 
4345                         (void) __cpuid_insn(cp);
4346 
4347                         cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4348                         cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4349                 }
4350 
4351                 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4352                         xsave_state_size = 0;
4353                 } else if (cpuid_d_valid) {
4354                         xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4355                 } else {
4356                         /* Broken CPUID 0xD, probably in HVM */
4357                         cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4358                             "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4359                             ", ymm_size = %d, ymm_offset = %d\n",
4360                             cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4361                             cpi->cpi_xsave.xsav_hw_features_high,
4362                             (int)cpi->cpi_xsave.xsav_max_size,
4363                             (int)cpi->cpi_xsave.ymm_size,
4364                             (int)cpi->cpi_xsave.ymm_offset);
4365 
4366                         if (xsave_state_size != 0) {
4367                                 /*
4368                                  * This must be a non-boot CPU. We cannot
4369                                  * continue, because the boot CPU has
4370                                  * already enabled XSAVE.
4371                                  */
4372                                 ASSERT(cpu->cpu_id != 0);
4373                                 cmn_err(CE_PANIC, "cpu%d: we have already "
4374                                     "enabled XSAVE on boot cpu, cannot "
4375                                     "continue.", cpu->cpu_id);
4376                         } else {
4377                                 /*
4378                                  * If we reached here on the boot CPU, it's also
4379                                  * almost certain that we'll reach here on the
4380                                  * non-boot CPUs. When we're here on a boot CPU
4381                                  * we should disable the feature; on a non-boot
4382                                  * CPU we need to confirm that we have.
4383                                  */
4384                                 if (cpu->cpu_id == 0) {
4385                                         remove_x86_feature(x86_featureset,
4386                                             X86FSET_XSAVE);
4387                                         remove_x86_feature(x86_featureset,
4388                                             X86FSET_AVX);
4389                                         remove_x86_feature(x86_featureset,
4390                                             X86FSET_F16C);
4391                                         remove_x86_feature(x86_featureset,
4392                                             X86FSET_BMI1);
4393                                         remove_x86_feature(x86_featureset,
4394                                             X86FSET_BMI2);
4395                                         remove_x86_feature(x86_featureset,
4396                                             X86FSET_FMA);
4397                                         remove_x86_feature(x86_featureset,
4398                                             X86FSET_AVX2);
4399                                         remove_x86_feature(x86_featureset,
4400                                             X86FSET_MPX);
4401                                         remove_x86_feature(x86_featureset,
4402                                             X86FSET_AVX512F);
4403                                         remove_x86_feature(x86_featureset,
4404                                             X86FSET_AVX512DQ);
4405                                         remove_x86_feature(x86_featureset,
4406                                             X86FSET_AVX512PF);
4407                                         remove_x86_feature(x86_featureset,
4408                                             X86FSET_AVX512ER);
4409                                         remove_x86_feature(x86_featureset,
4410                                             X86FSET_AVX512CD);
4411                                         remove_x86_feature(x86_featureset,
4412                                             X86FSET_AVX512BW);
4413                                         remove_x86_feature(x86_featureset,
4414                                             X86FSET_AVX512VL);
4415                                         remove_x86_feature(x86_featureset,
4416                                             X86FSET_AVX512FMA);
4417                                         remove_x86_feature(x86_featureset,
4418                                             X86FSET_AVX512VBMI);
4419                                         remove_x86_feature(x86_featureset,
4420                                             X86FSET_AVX512VNNI);
4421                                         remove_x86_feature(x86_featureset,
4422                                             X86FSET_AVX512VPOPCDQ);
4423                                         remove_x86_feature(x86_featureset,
4424                                             X86FSET_AVX512NNIW);
4425                                         remove_x86_feature(x86_featureset,
4426                                             X86FSET_AVX512FMAPS);
4427 
4428                                         CPI_FEATURES_ECX(cpi) &=
4429                                             ~CPUID_INTC_ECX_XSAVE;
4430                                         CPI_FEATURES_ECX(cpi) &=
4431                                             ~CPUID_INTC_ECX_AVX;
4432                                         CPI_FEATURES_ECX(cpi) &=
4433                                             ~CPUID_INTC_ECX_F16C;
4434                                         CPI_FEATURES_ECX(cpi) &=
4435                                             ~CPUID_INTC_ECX_FMA;
4436                                         CPI_FEATURES_7_0_EBX(cpi) &=
4437                                             ~CPUID_INTC_EBX_7_0_BMI1;
4438                                         CPI_FEATURES_7_0_EBX(cpi) &=
4439                                             ~CPUID_INTC_EBX_7_0_BMI2;
4440                                         CPI_FEATURES_7_0_EBX(cpi) &=
4441                                             ~CPUID_INTC_EBX_7_0_AVX2;
4442                                         CPI_FEATURES_7_0_EBX(cpi) &=
4443                                             ~CPUID_INTC_EBX_7_0_MPX;
4444                                         CPI_FEATURES_7_0_EBX(cpi) &=
4445                                             ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4446 
4447                                         CPI_FEATURES_7_0_ECX(cpi) &=
4448                                             ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4449 
4450                                         CPI_FEATURES_7_0_EDX(cpi) &=
4451                                             ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4452 
4453                                         xsave_force_disable = B_TRUE;
4454                                 } else {
4455                                         VERIFY(is_x86_feature(x86_featureset,
4456                                             X86FSET_XSAVE) == B_FALSE);
4457                                 }
4458                         }
4459                 }
4460         }
4461 
4462 
4463         if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4464                 goto pass2_done;
4465 
4466         if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4467                 nmax = NMAX_CPI_EXTD;
4468         /*
4469          * Copy the extended properties, fixing them as we go.
4470          * (We already handled n == 0 and n == 1 in pass 1)
4471          */
4472         iptr = (void *)cpi->cpi_brandstr;
4473         for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4474                 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4475                 (void) __cpuid_insn(cp);
4476                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4477                     cp);
4478                 switch (n) {
4479                 case 2:
4480                 case 3:
4481                 case 4:
4482                         /*
4483                          * Extract the brand string
4484                          */
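                             /*
                              * Leaves 0x80000002 through 0x80000004 each
                              * return 16 bytes of the ASCII brand string in
                              * %eax, %ebx, %ecx and %edx, 48 bytes in all,
                              * which is why iptr walks cpi_brandstr one
                              * 32-bit word at a time.
                              */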
4485                         *iptr++ = cp->cp_eax;
4486                         *iptr++ = cp->cp_ebx;
4487                         *iptr++ = cp->cp_ecx;
4488                         *iptr++ = cp->cp_edx;
4489                         break;
4490                 case 5:
4491                         switch (cpi->cpi_vendor) {
4492                         case X86_VENDOR_AMD:
4493                                 /*
4494                                  * The Athlon and Duron were the first
4495                                  * parts to report the sizes of the
4496                                  * TLB for large pages. For earlier
4497                                  * parts, we don't trust the data.
4498                                  */
4499                                 if (cpi->cpi_family < 6 ||
4500                                     (cpi->cpi_family == 6 &&
4501                                     cpi->cpi_model < 1))
4502                                         cp->cp_eax = 0;
4503                                 break;
4504                         default:
4505                                 break;
4506                         }
4507                         break;
4508                 case 6:
4509                         switch (cpi->cpi_vendor) {
4510                         case X86_VENDOR_AMD:
4511                                 /*
4512                                  * The Athlon and Duron were the first
4513                                  * AMD parts with L2 TLBs. For earlier
4514                                  * parts, don't trust the data.
4515                                  */
4516                                 if (cpi->cpi_family < 6 ||
4517                                     (cpi->cpi_family == 6 &&
4518                                     cpi->cpi_model < 1))
4519                                         cp->cp_eax = cp->cp_ebx = 0;
4520                                 /*
4521                                  * AMD Duron rev A0 reports its L2
4522                                  * cache size incorrectly as 1K
4523                                  * when it is really 64K.
4524                                  */
4525                                 if (cpi->cpi_family == 6 &&
4526                                     cpi->cpi_model == 3 &&
4527                                     cpi->cpi_step == 0) {
4528                                         cp->cp_ecx &= 0xffff;
4529                                         cp->cp_ecx |= 0x400000;
4530                                 }
4531                                 break;
4532                         case X86_VENDOR_Cyrix:  /* VIA C3 */
4533                                 /*
4534                                  * VIA C3 processors are a bit messed
4535                                  * up w.r.t. encoding cache sizes in %ecx
4536                                  */
4537                                 if (cpi->cpi_family != 6)
4538                                         break;
4539                                 /*
4540                                  * models 7 and 8 were incorrectly encoded
4541                                  *
4542                                  * xxx is model 8 really broken?
4543                                  */
4544                                 if (cpi->cpi_model == 7 ||
4545                                     cpi->cpi_model == 8)
4546                                         cp->cp_ecx =
4547                                             BITX(cp->cp_ecx, 31, 24) << 16 |
4548                                             BITX(cp->cp_ecx, 23, 16) << 12 |
4549                                             BITX(cp->cp_ecx, 15, 8) << 8 |
4550                                             BITX(cp->cp_ecx, 7, 0);
4551                                 /*
4552                                  * model 9 stepping 1 has wrong associativity
4553                                  */
4554                                 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4555                                         cp->cp_ecx |= 8 << 12;
4556                                 break;
4557                         case X86_VENDOR_Intel:
4558                                 /*
4559                                  * Extended L2 Cache features function.
4560                                  * First appeared on Prescott.
4561                                  */
4562                         default:
4563                                 break;
4564                         }
4565                         break;
4566                 default:
4567                         break;
4568                 }
4569         }
4570 
4571 pass2_done:
4572         cpi->cpi_pass = 2;
4573 }
4574 
4575 static const char *
4576 intel_cpubrand(const struct cpuid_info *cpi)
4577 {
4578         int i;
4579 
4580         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4581             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4582                 return ("i486");
4583 
4584         switch (cpi->cpi_family) {
4585         case 5:
4586                 return ("Intel Pentium(r)");
4587         case 6:
4588                 switch (cpi->cpi_model) {
4589                         uint_t celeron, xeon;
4590                         const struct cpuid_regs *cp;
4591                 case 0:
4592                 case 1:
4593                 case 2:
4594                         return ("Intel Pentium(r) Pro");
4595                 case 3:
4596                 case 4:
4597                         return ("Intel Pentium(r) II");
4598                 case 6:
4599                         return ("Intel Celeron(r)");
4600                 case 5:
4601                 case 7:
4602                         celeron = xeon = 0;
4603                         cp = &cpi->cpi_std[2];   /* cache info */
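                             /*
                              * The loops below scan the leaf 2 cache
                              * descriptors: 0x40 means the part has no L2
                              * cache, identifying a Celeron, while 0x44 (1MB
                              * L2) and 0x45 (2MB L2) identify a Xeon.
                              */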
4604 
4605                         for (i = 1; i < 4; i++) {
4606                                 uint_t tmp;
4607 
4608                                 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4609                                 if (tmp == 0x40)
4610                                         celeron++;
4611                                 if (tmp >= 0x44 && tmp <= 0x45)
4612                                         xeon++;
4613                         }
4614 
4615                         for (i = 0; i < 2; i++) {
4616                                 uint_t tmp;
4617 
4618                                 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4619                                 if (tmp == 0x40)
4620                                         celeron++;
4621                                 else if (tmp >= 0x44 && tmp <= 0x45)
4622                                         xeon++;
4623                         }
4624 
4625                         for (i = 0; i < 4; i++) {
4626                                 uint_t tmp;
4627 
4628                                 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4629                                 if (tmp == 0x40)
4630                                         celeron++;
4631                                 else if (tmp >= 0x44 && tmp <= 0x45)
4632                                         xeon++;
4633                         }
4634 
4635                         for (i = 0; i < 4; i++) {
4636                                 uint_t tmp;
4637 
4638                                 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4639                                 if (tmp == 0x40)
4640                                         celeron++;
4641                                 else if (tmp >= 0x44 && tmp <= 0x45)
4642                                         xeon++;
4643                         }
4644 
4645                         if (celeron)
4646                                 return ("Intel Celeron(r)");
4647                         if (xeon)
4648                                 return (cpi->cpi_model == 5 ?
4649                                     "Intel Pentium(r) II Xeon(tm)" :
4650                                     "Intel Pentium(r) III Xeon(tm)");
4651                         return (cpi->cpi_model == 5 ?
4652                             "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4653                             "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4654                 default:
4655                         break;
4656                 }
4657         default:
4658                 break;
4659         }
4660 
4661         /* BrandID is present if the field is nonzero */
4662         if (cpi->cpi_brandid != 0) {
4663                 static const struct {
4664                         uint_t bt_bid;
4665                         const char *bt_str;
4666                 } brand_tbl[] = {
4667                         { 0x1,  "Intel(r) Celeron(r)" },
4668                         { 0x2,  "Intel(r) Pentium(r) III" },
4669                         { 0x3,  "Intel(r) Pentium(r) III Xeon(tm)" },
4670                         { 0x4,  "Intel(r) Pentium(r) III" },
4671                         { 0x6,  "Mobile Intel(r) Pentium(r) III" },
4672                         { 0x7,  "Mobile Intel(r) Celeron(r)" },
4673                         { 0x8,  "Intel(r) Pentium(r) 4" },
4674                         { 0x9,  "Intel(r) Pentium(r) 4" },
4675                         { 0xa,  "Intel(r) Celeron(r)" },
4676                         { 0xb,  "Intel(r) Xeon(tm)" },
4677                         { 0xc,  "Intel(r) Xeon(tm) MP" },
4678                         { 0xe,  "Mobile Intel(r) Pentium(r) 4" },
4679                         { 0xf,  "Mobile Intel(r) Celeron(r)" },
4680                         { 0x11, "Mobile Genuine Intel(r)" },
4681                         { 0x12, "Intel(r) Celeron(r) M" },
4682                         { 0x13, "Mobile Intel(r) Celeron(r)" },
4683                         { 0x14, "Intel(r) Celeron(r)" },
4684                         { 0x15, "Mobile Genuine Intel(r)" },
4685                         { 0x16, "Intel(r) Pentium(r) M" },
4686                         { 0x17, "Mobile Intel(r) Celeron(r)" }
4687                 };
4688                 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4689                 uint_t sgn;
4690 
4691                 sgn = (cpi->cpi_family << 8) |
4692                     (cpi->cpi_model << 4) | cpi->cpi_step;
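                     /*
                      * The signature packs family into bits 11:8, model into
                      * bits 7:4 and stepping into bits 3:0, so e.g. 0x6b1
                      * below is family 6, model 0xb, stepping 1.
                      */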
4693 
4694                 for (i = 0; i < btblmax; i++)
4695                         if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4696                                 break;
4697                 if (i < btblmax) {
4698                         if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4699                                 return ("Intel(r) Celeron(r)");
4700                         if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4701                                 return ("Intel(r) Xeon(tm) MP");
4702                         if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4703                                 return ("Intel(r) Xeon(tm)");
4704                         return (brand_tbl[i].bt_str);
4705                 }
4706         }
4707 
4708         return (NULL);
4709 }
4710 
4711 static const char *
4712 amd_cpubrand(const struct cpuid_info *cpi)
4713 {
4714         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4715             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4716                 return ("i486 compatible");
4717 
4718         switch (cpi->cpi_family) {
4719         case 5:
4720                 switch (cpi->cpi_model) {
4721                 case 0:
4722                 case 1:
4723                 case 2:
4724                 case 3:
4725                 case 4:
4726                 case 5:
4727                         return ("AMD-K5(r)");
4728                 case 6:
4729                 case 7:
4730                         return ("AMD-K6(r)");
4731                 case 8:
4732                         return ("AMD-K6(r)-2");
4733                 case 9:
4734                         return ("AMD-K6(r)-III");
4735                 default:
4736                         return ("AMD (family 5)");
4737                 }
4738         case 6:
4739                 switch (cpi->cpi_model) {
4740                 case 1:
4741                         return ("AMD-K7(tm)");
4742                 case 0:
4743                 case 2:
4744                 case 4:
4745                         return ("AMD Athlon(tm)");
4746                 case 3:
4747                 case 7:
4748                         return ("AMD Duron(tm)");
4749                 case 6:
4750                 case 8:
4751                 case 10:
4752                         /*
4753                          * Use the L2 cache size to distinguish
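                              * (leaf 0x80000006 %ecx gives the L2 size in KB
                              * in bits 31:16; the Duron shipped with a
                              * smaller L2 than the corresponding Athlon).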
4754                          */
4755                         return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4756                             "AMD Athlon(tm)" : "AMD Duron(tm)");
4757                 default:
4758                         return ("AMD (family 6)");
4759                 }
4760         default:
4761                 break;
4762         }
4763 
4764         if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4765             cpi->cpi_brandid != 0) {
4766                 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4767                 case 3:
4768                         return ("AMD Opteron(tm) UP 1xx");
4769                 case 4:
4770                         return ("AMD Opteron(tm) DP 2xx");
4771                 case 5:
4772                         return ("AMD Opteron(tm) MP 8xx");
4773                 default:
4774                         return ("AMD Opteron(tm)");
4775                 }
4776         }
4777 
4778         return (NULL);
4779 }
4780 
4781 static const char *
4782 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4783 {
4784         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4785             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4786             type == X86_TYPE_CYRIX_486)
4787                 return ("i486 compatible");
4788 
4789         switch (type) {
4790         case X86_TYPE_CYRIX_6x86:
4791                 return ("Cyrix 6x86");
4792         case X86_TYPE_CYRIX_6x86L:
4793                 return ("Cyrix 6x86L");
4794         case X86_TYPE_CYRIX_6x86MX:
4795                 return ("Cyrix 6x86MX");
4796         case X86_TYPE_CYRIX_GXm:
4797                 return ("Cyrix GXm");
4798         case X86_TYPE_CYRIX_MediaGX:
4799                 return ("Cyrix MediaGX");
4800         case X86_TYPE_CYRIX_MII:
4801                 return ("Cyrix M2");
4802         case X86_TYPE_VIA_CYRIX_III:
4803                 return ("VIA Cyrix M3");
4804         default:
4805                 /*
4806                  * Have another wild guess ...
4807                  */
4808                 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4809                         return ("Cyrix 5x86");
4810                 else if (cpi->cpi_family == 5) {
4811                         switch (cpi->cpi_model) {
4812                         case 2:
4813                                 return ("Cyrix 6x86");  /* Cyrix M1 */
4814                         case 4:
4815                                 return ("Cyrix MediaGX");
4816                         default:
4817                                 break;
4818                         }
4819                 } else if (cpi->cpi_family == 6) {
4820                         switch (cpi->cpi_model) {
4821                         case 0:
4822                                 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4823                         case 5:
4824                         case 6:
4825                         case 7:
4826                         case 8:
4827                         case 9:
4828                                 return ("VIA C3");
4829                         default:
4830                                 break;
4831                         }
4832                 }
4833                 break;
4834         }
4835         return (NULL);
4836 }
4837 
4838 /*
4839  * This is only called when the CPU's extended brand string leaves
4840  * (0x80000002, 0x80000003 and 0x80000004) aren't available, or
4841  * contain null bytes for some reason.
4842  */
4843 static void
4844 fabricate_brandstr(struct cpuid_info *cpi)
4845 {
4846         const char *brand = NULL;
4847 
4848         switch (cpi->cpi_vendor) {
4849         case X86_VENDOR_Intel:
4850                 brand = intel_cpubrand(cpi);
4851                 break;
4852         case X86_VENDOR_AMD:
4853                 brand = amd_cpubrand(cpi);
4854                 break;
4855         case X86_VENDOR_Cyrix:
4856                 brand = cyrix_cpubrand(cpi, x86_type);
4857                 break;
4858         case X86_VENDOR_NexGen:
4859                 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4860                         brand = "NexGen Nx586";
4861                 break;
4862         case X86_VENDOR_Centaur:
4863                 if (cpi->cpi_family == 5)
4864                         switch (cpi->cpi_model) {
4865                         case 4:
4866                                 brand = "Centaur C6";
4867                                 break;
4868                         case 8:
4869                                 brand = "Centaur C2";
4870                                 break;
4871                         case 9:
4872                                 brand = "Centaur C3";
4873                                 break;
4874                         default:
4875                                 break;
4876                         }
4877                 break;
4878         case X86_VENDOR_Rise:
4879                 if (cpi->cpi_family == 5 &&
4880                     (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4881                         brand = "Rise mP6";
4882                 break;
4883         case X86_VENDOR_SiS:
4884                 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4885                         brand = "SiS 55x";
4886                 break;
4887         case X86_VENDOR_TM:
4888                 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4889                         brand = "Transmeta Crusoe TM3x00 or TM5x00";
4890                 break;
4891         case X86_VENDOR_NSC:
4892         case X86_VENDOR_UMC:
4893         default:
4894                 break;
4895         }
4896         if (brand) {
4897                 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4898                 return;
4899         }
4900 
4901         /*
4902          * If all else fails ...
4903          */
4904         (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4905             "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4906             cpi->cpi_model, cpi->cpi_step);
4907 }
4908 
4909 /*
4910  * This routine is called just after kernel memory allocation
4911  * becomes available on cpu0, and as part of mp_startup() on
4912  * the other cpus.
4913  *
4914  * Fixup the brand string, and collect any information from cpuid
4915  * that requires dynamically allocated storage to represent.
4916  */
4917 /*ARGSUSED*/
4918 void
4919 cpuid_pass3(cpu_t *cpu)
4920 {
4921         int     i, max, shft, level, size;
4922         struct cpuid_regs regs;
4923         struct cpuid_regs *cp;
4924         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4925 
4926         ASSERT(cpi->cpi_pass == 2);
4927 
4928         /*
4929          * Deterministic cache parameters
4930          *
4931          * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4932          * values that are present are currently defined to be the same. This
4933          * means we can use the same logic to parse it as long as we use the
4934          * appropriate leaf to get the data. If you're updating this, make sure
4935          * you're careful about which vendor supports which aspect.
4936          *
4937          * Take this opportunity to detect the number of threads sharing the
4938          * last level cache, and construct a corresponding cache id. The
4939          * respective cpuid_info members are initialized to the default case of
4940          * "no last level cache sharing".
4941          */
4942         cpi->cpi_ncpu_shr_last_cache = 1;
4943         cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4944 
4945         if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4946             (cpi->cpi_vendor == X86_VENDOR_AMD &&
4947             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4948             is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4949                 uint32_t leaf;
4950 
4951                 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4952                         leaf = 4;
4953                 } else {
4954                         leaf = CPUID_LEAF_EXT_1d;
4955                 }
4956 
4957                 /*
4958                  * Find the # of elements (size) returned by the leaf and along
4959                  * the way detect last level cache sharing details.
4960                  */
4961                 bzero(&regs, sizeof (regs));
4962                 cp = &regs;
4963                 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4964                         cp->cp_eax = leaf;
4965                         cp->cp_ecx = i;
4966 
4967                         (void) __cpuid_insn(cp);
4968 
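                             /*
                              * A cache type of 0 (%eax bits 4:0) means there
                              * are no more cache levels to enumerate.
                              */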
4969                         if (CPI_CACHE_TYPE(cp) == 0)
4970                                 break;
4971                         level = CPI_CACHE_LVL(cp);
4972                         if (level > max) {
4973                                 max = level;
4974                                 cpi->cpi_ncpu_shr_last_cache =
4975                                     CPI_NTHR_SHR_CACHE(cp) + 1;
4976                         }
4977                 }
4978                 cpi->cpi_cache_leaf_size = size = i;
4979 
4980                 /*
4981                  * Allocate the cpi_cache_leaves array. The first element
4982                  * references the regs for the corresponding leaf with %ecx set
4983                  * to 0. This was gathered in cpuid_pass2().
4984                  */
4985                 if (size > 0) {
4986                         cpi->cpi_cache_leaves =
4987                             kmem_alloc(size * sizeof (cp), KM_SLEEP);
4988                         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4989                                 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4990                         } else {
4991                                 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4992                         }
4993 
4994                         /*
4995                          * Allocate storage to hold the additional regs
4996                          * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4997                          *
4998                          * The regs for the leaf with %ecx == 0 have
4999                          * already been allocated, as indicated above.
5000                          */
5001                         for (i = 1; i < size; i++) {
5002                                 cp = cpi->cpi_cache_leaves[i] =
5003                                     kmem_zalloc(sizeof (regs), KM_SLEEP);
5004                                 cp->cp_eax = leaf;
5005                                 cp->cp_ecx = i;
5006 
5007                                 (void) __cpuid_insn(cp);
5008                         }
5009                 }
5010                 /*
5011                  * Determine the number of bits needed to represent
5012                  * the number of CPUs sharing the last level cache.
5013                  *
5014                  * Shift off that number of bits from the APIC id to
5015                  * derive the cache id.
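                      *
                      * For example, with 8 CPUs sharing the last level
                      * cache the loop below leaves shft at 3, so APIC ids
                      * 0x18 through 0x1f all map to cache id 0x3.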
5016                  */
5017                 shft = 0;
5018                 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5019                         shft++;
5020                 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5021         }
5022 
5023         /*
5024          * Now fixup the brand string
5025          */
5026         if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5027                 fabricate_brandstr(cpi);
5028         } else {
5029 
5030                 /*
5031                  * If we successfully extracted a brand string from the cpuid
5032                  * instruction, clean it up by removing leading spaces and
5033                  * similar junk.
5034                  */
5035                 if (cpi->cpi_brandstr[0]) {
5036                         size_t maxlen = sizeof (cpi->cpi_brandstr);
5037                         char *src, *dst;
5038 
5039                         dst = src = (char *)cpi->cpi_brandstr;
5040                         src[maxlen - 1] = '\0';
5041                         /*
5042                          * strip leading spaces
5043                          */
5044                         while (*src == ' ')
5045                                 src++;
5046                         /*
5047                          * Remove any "Genuine" or "Authentic" prefixes
5048                          */
5049                         if (strncmp(src, "Genuine ", 8) == 0)
5050                                 src += 8;
5051                         if (strncmp(src, "Authentic ", 10) == 0)
5052                                 src += 10;
5053 
5054                         /*
5055                          * Now do an in-place copy.
5056                          * Map (R) to (r) and (TM) to (tm).
5057                          * The era of teletypes is long gone, and there's
5058                          * -really- no need to shout.
5059                          */
5060                         while (*src != '\0') {
5061                                 if (src[0] == '(') {
5062                                         if (strncmp(src + 1, "R)", 2) == 0) {
5063                                                 (void) strncpy(dst, "(r)", 3);
5064                                                 src += 3;
5065                                                 dst += 3;
5066                                                 continue;
5067                                         }
5068                                         if (strncmp(src + 1, "TM)", 3) == 0) {
5069                                                 (void) strncpy(dst, "(tm)", 4);
5070                                                 src += 4;
5071                                                 dst += 4;
5072                                                 continue;
5073                                         }
5074                                 }
5075                                 *dst++ = *src++;
5076                         }
5077                         *dst = '\0';
5078 
5079                         /*
5080                          * Finally, remove any trailing spaces
5081                          */
5082                         while (--dst > cpi->cpi_brandstr)
5083                                 if (*dst == ' ')
5084                                         *dst = '\0';
5085                                 else
5086                                         break;
5087                 } else
5088                         fabricate_brandstr(cpi);
5089         }
5090         cpi->cpi_pass = 3;
5091 }
5092 
5093 /*
5094  * This routine is called out of bind_hwcap() much later in the life
5095  * of the kernel (post_startup()).  The job of this routine is to resolve
5096  * the hardware feature support and kernel support for those features into
5097  * what we're actually going to tell applications via the aux vector.
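      *
      * The resulting bits surface in userland through the aux vector
      * (AT_SUN_HWCAP and AT_SUN_HWCAP2); a minimal, illustrative consumer
      * using getisax(3C) might look like this, where use_sse42_path() is a
      * hypothetical helper:
      *
      *        uint32_t av[2] = { 0 };
      *        (void) getisax(av, 2);
      *        if (av[0] & AV_386_SSE4_2)
      *                use_sse42_path();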
5098  */
5099 void
5100 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
5101 {
5102         struct cpuid_info *cpi;
5103         uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
5104 
5105         if (cpu == NULL)
5106                 cpu = CPU;
5107         cpi = cpu->cpu_m.mcpu_cpi;
5108 
5109         ASSERT(cpi->cpi_pass == 3);
5110 
5111         if (cpi->cpi_maxeax >= 1) {
5112                 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5113                 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5114                 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
5115 
5116                 *edx = CPI_FEATURES_EDX(cpi);
5117                 *ecx = CPI_FEATURES_ECX(cpi);
5118                 *ebx = CPI_FEATURES_7_0_EBX(cpi);
5119 
5120                 /*
5121                  * [these require explicit kernel support]
5122                  */
5123                 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
5124                         *edx &= ~CPUID_INTC_EDX_SEP;
5125 
5126                 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
5127                         *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
5128                 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
5129                         *edx &= ~CPUID_INTC_EDX_SSE2;
5130 
5131                 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
5132                         *edx &= ~CPUID_INTC_EDX_HTT;
5133 
5134                 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
5135                         *ecx &= ~CPUID_INTC_ECX_SSE3;
5136 
5137                 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
5138                         *ecx &= ~CPUID_INTC_ECX_SSSE3;
5139                 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
5140                         *ecx &= ~CPUID_INTC_ECX_SSE4_1;
5141                 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
5142                         *ecx &= ~CPUID_INTC_ECX_SSE4_2;
5143                 if (!is_x86_feature(x86_featureset, X86FSET_AES))
5144                         *ecx &= ~CPUID_INTC_ECX_AES;
5145                 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
5146                         *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
5147                 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
5148                         *ecx &= ~(CPUID_INTC_ECX_XSAVE |
5149                             CPUID_INTC_ECX_OSXSAVE);
5150                 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
5151                         *ecx &= ~CPUID_INTC_ECX_AVX;
5152                 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
5153                         *ecx &= ~CPUID_INTC_ECX_F16C;
5154                 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
5155                         *ecx &= ~CPUID_INTC_ECX_FMA;
5156                 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
5157                         *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
5158                 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
5159                         *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
5160                 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
5161                         *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
5162                 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
5163                         *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
5164                 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
5165                         *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
5166 
5167                 /*
5168                  * [no explicit support required beyond x87 fp context]
5169                  */
5170                 if (!fpu_exists)
5171                         *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5172 
5173                 /*
5174                  * Now map the supported feature vector to things that we
5175                  * think userland will care about.
5176                  */
5177                 if (*edx & CPUID_INTC_EDX_SEP)
5178                         hwcap_flags |= AV_386_SEP;
5179                 if (*edx & CPUID_INTC_EDX_SSE)
5180                         hwcap_flags |= AV_386_FXSR | AV_386_SSE;
5181                 if (*edx & CPUID_INTC_EDX_SSE2)
5182                         hwcap_flags |= AV_386_SSE2;
5183                 if (*ecx & CPUID_INTC_ECX_SSE3)
5184                         hwcap_flags |= AV_386_SSE3;
5185                 if (*ecx & CPUID_INTC_ECX_SSSE3)
5186                         hwcap_flags |= AV_386_SSSE3;
5187                 if (*ecx & CPUID_INTC_ECX_SSE4_1)
5188                         hwcap_flags |= AV_386_SSE4_1;
5189                 if (*ecx & CPUID_INTC_ECX_SSE4_2)
5190                         hwcap_flags |= AV_386_SSE4_2;
5191                 if (*ecx & CPUID_INTC_ECX_MOVBE)
5192                         hwcap_flags |= AV_386_MOVBE;
5193                 if (*ecx & CPUID_INTC_ECX_AES)
5194                         hwcap_flags |= AV_386_AES;
5195                 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5196                         hwcap_flags |= AV_386_PCLMULQDQ;
5197                 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5198                     (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5199                         hwcap_flags |= AV_386_XSAVE;
5200 
5201                         if (*ecx & CPUID_INTC_ECX_AVX) {
5202                                 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5203                                 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5204 
5205                                 hwcap_flags |= AV_386_AVX;
5206                                 if (*ecx & CPUID_INTC_ECX_F16C)
5207                                         hwcap_flags_2 |= AV_386_2_F16C;
5208                                 if (*ecx & CPUID_INTC_ECX_FMA)
5209                                         hwcap_flags_2 |= AV_386_2_FMA;
5210 
5211                                 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5212                                         hwcap_flags_2 |= AV_386_2_BMI1;
5213                                 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5214                                         hwcap_flags_2 |= AV_386_2_BMI2;
5215                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5216                                         hwcap_flags_2 |= AV_386_2_AVX2;
5217                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5218                                         hwcap_flags_2 |= AV_386_2_AVX512F;
5219                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5220                                         hwcap_flags_2 |= AV_386_2_AVX512DQ;
5221                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5222                                         hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5223                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5224                                         hwcap_flags_2 |= AV_386_2_AVX512PF;
5225                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5226                                         hwcap_flags_2 |= AV_386_2_AVX512ER;
5227                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5228                                         hwcap_flags_2 |= AV_386_2_AVX512CD;
5229                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5230                                         hwcap_flags_2 |= AV_386_2_AVX512BW;
5231                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5232                                         hwcap_flags_2 |= AV_386_2_AVX512VL;
5233 
5234                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5235                                         hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5236                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5237                                         hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5238                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5239                                         hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5240 
5241                                 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5242                                         hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5243                                 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5244                                         hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5245                         }
5246                 }
5247                 if (*ecx & CPUID_INTC_ECX_VMX)
5248                         hwcap_flags |= AV_386_VMX;
5249                 if (*ecx & CPUID_INTC_ECX_POPCNT)
5250                         hwcap_flags |= AV_386_POPCNT;
5251                 if (*edx & CPUID_INTC_EDX_FPU)
5252                         hwcap_flags |= AV_386_FPU;
5253                 if (*edx & CPUID_INTC_EDX_MMX)
5254                         hwcap_flags |= AV_386_MMX;
5255 
5256                 if (*edx & CPUID_INTC_EDX_TSC)
5257                         hwcap_flags |= AV_386_TSC;
5258                 if (*edx & CPUID_INTC_EDX_CX8)
5259                         hwcap_flags |= AV_386_CX8;
5260                 if (*edx & CPUID_INTC_EDX_CMOV)
5261                         hwcap_flags |= AV_386_CMOV;
5262                 if (*ecx & CPUID_INTC_ECX_CX16)
5263                         hwcap_flags |= AV_386_CX16;
5264 
5265                 if (*ecx & CPUID_INTC_ECX_RDRAND)
5266                         hwcap_flags_2 |= AV_386_2_RDRAND;
5267                 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5268                         hwcap_flags_2 |= AV_386_2_ADX;
5269                 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5270                         hwcap_flags_2 |= AV_386_2_RDSEED;
5271                 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5272                         hwcap_flags_2 |= AV_386_2_SHA;
5273                 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5274                         hwcap_flags_2 |= AV_386_2_FSGSBASE;
5275                 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5276                         hwcap_flags_2 |= AV_386_2_CLWB;
5277                 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5278                         hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5279 
5280         }
5281         /*
5282          * Check a few miscellaneous features.
5283          */
5284         if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5285                 hwcap_flags_2 |= AV_386_2_CLZERO;
5286 
5287         if (cpi->cpi_xmaxeax < 0x80000001)
5288                 goto pass4_done;
5289 
5290         switch (cpi->cpi_vendor) {
5291                 struct cpuid_regs cp;
5292                 uint32_t *edx, *ecx;
5293 
5294         case X86_VENDOR_Intel:
5295                 /*
5296                  * It seems Intel duplicated just what was necessary
5297                  * here to make the initial crop of 64-bit OSes work.
5298                  * Hopefully, those are the only "extended" bits
5299                  * they'll add.
5300                  */
5301                 /*FALLTHROUGH*/
5302 
5303         case X86_VENDOR_AMD:
5304                 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5305                 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5306 
5307                 *edx = CPI_FEATURES_XTD_EDX(cpi);
5308                 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5309 
5310                 /*
5311                  * [these features require explicit kernel support]
5312                  */
5313                 switch (cpi->cpi_vendor) {
5314                 case X86_VENDOR_Intel:
5315                         if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5316                                 *edx &= ~CPUID_AMD_EDX_TSCP;
5317                         break;
5318 
5319                 case X86_VENDOR_AMD:
5320                         if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5321                                 *edx &= ~CPUID_AMD_EDX_TSCP;
5322                         if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5323                                 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5324                         break;
5325 
5326                 default:
5327                         break;
5328                 }
5329 
5330                 /*
5331                  * [no explicit support required beyond
5332                  * x87 fp context and exception handlers]
5333                  */
5334                 if (!fpu_exists)
5335                         *edx &= ~(CPUID_AMD_EDX_MMXamd |
5336                             CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5337 
5338                 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5339                         *edx &= ~CPUID_AMD_EDX_NX;
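                     /* A 32-bit kernel never advertises long mode. */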
5340 #if !defined(__amd64)
5341                 *edx &= ~CPUID_AMD_EDX_LM;
5342 #endif
5343                 /*
5344                  * Now map the supported feature vector to
5345                  * things that we think userland will care about.
5346                  */
5347 #if defined(__amd64)
5348                 if (*edx & CPUID_AMD_EDX_SYSC)
5349                         hwcap_flags |= AV_386_AMD_SYSC;
5350 #endif
5351                 if (*edx & CPUID_AMD_EDX_MMXamd)
5352                         hwcap_flags |= AV_386_AMD_MMX;
5353                 if (*edx & CPUID_AMD_EDX_3DNow)
5354                         hwcap_flags |= AV_386_AMD_3DNow;
5355                 if (*edx & CPUID_AMD_EDX_3DNowx)
5356                         hwcap_flags |= AV_386_AMD_3DNowx;
5357                 if (*ecx & CPUID_AMD_ECX_SVM)
5358                         hwcap_flags |= AV_386_AMD_SVM;
5359 
5360                 switch (cpi->cpi_vendor) {
5361                 case X86_VENDOR_AMD:
5362                         if (*edx & CPUID_AMD_EDX_TSCP)
5363                                 hwcap_flags |= AV_386_TSCP;
5364                         if (*ecx & CPUID_AMD_ECX_AHF64)
5365                                 hwcap_flags |= AV_386_AHF;
5366                         if (*ecx & CPUID_AMD_ECX_SSE4A)
5367                                 hwcap_flags |= AV_386_AMD_SSE4A;
5368                         if (*ecx & CPUID_AMD_ECX_LZCNT)
5369                                 hwcap_flags |= AV_386_AMD_LZCNT;
5370                         if (*ecx & CPUID_AMD_ECX_MONITORX)
5371                                 hwcap_flags_2 |= AV_386_2_MONITORX;
5372                         break;
5373 
5374                 case X86_VENDOR_Intel:
5375                         if (*edx & CPUID_AMD_EDX_TSCP)
5376                                 hwcap_flags |= AV_386_TSCP;
5377                         if (*ecx & CPUID_AMD_ECX_LZCNT)
5378                                 hwcap_flags |= AV_386_AMD_LZCNT;
5379                         /*
5380                          * Aarrgh.
5381                          * Intel uses a different bit in the same word.
5382                          */
5383                         if (*ecx & CPUID_INTC_ECX_AHF64)
5384                                 hwcap_flags |= AV_386_AHF;
5385                         break;
5386 
5387                 default:
5388                         break;
5389                 }
5390                 break;
5391 
5392         case X86_VENDOR_TM:
5393                 cp.cp_eax = 0x80860001;
5394                 (void) __cpuid_insn(&cp);
5395                 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5396                 break;
5397 
5398         default:
5399                 break;
5400         }
5401 
5402 pass4_done:
5403         cpi->cpi_pass = 4;
5404         if (hwcap_out != NULL) {
5405                 hwcap_out[0] = hwcap_flags;
5406                 hwcap_out[1] = hwcap_flags_2;
5407         }
5408 }
5409 
5410 
5411 /*
5412  * Simulate the cpuid instruction using the data we previously
5413  * captured about this CPU.  We try our best to return the truth
5414  * about the hardware, independently of kernel support.
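      *
      * A minimal illustration (hypothetical caller): to read the kernel's
      * cached copy of standard leaf 1 for the current CPU,
      *
      *        struct cpuid_regs cp = { 0 };
      *        cp.cp_eax = 1;
      *        (void) cpuid_insn(NULL, &cp);
      *
      * after which cp holds the cached register values for that leaf.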
5415  */
5416 uint32_t
5417 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5418 {
5419         struct cpuid_info *cpi;
5420         struct cpuid_regs *xcp;
5421 
5422         if (cpu == NULL)
5423                 cpu = CPU;
5424         cpi = cpu->cpu_m.mcpu_cpi;
5425 
5426         ASSERT(cpuid_checkpass(cpu, 3));
5427 
5428         /*
5429          * CPUID data is cached in two separate places: cpi_std for standard
5430          * CPUID leaves, and cpi_extd for extended CPUID leaves.
5431          */
5432         if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5433                 xcp = &cpi->cpi_std[cp->cp_eax];
5434         } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5435             cp->cp_eax <= cpi->cpi_xmaxeax &&
5436             cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5437                 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5438         } else {
5439                 /*
5440                  * The caller is asking for data from an input parameter which
5441                  * the kernel has not cached.  In this case we go fetch from
5442                  * the hardware and return the data directly to the user.
5443                  */
5444                 return (__cpuid_insn(cp));
5445         }
5446 
5447         cp->cp_eax = xcp->cp_eax;
5448         cp->cp_ebx = xcp->cp_ebx;
5449         cp->cp_ecx = xcp->cp_ecx;
5450         cp->cp_edx = xcp->cp_edx;
5451         return (cp->cp_eax);
5452 }
5453 
5454 int
5455 cpuid_checkpass(cpu_t *cpu, int pass)
5456 {
5457         return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5458             cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5459 }
5460 
5461 int
5462 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5463 {
5464         ASSERT(cpuid_checkpass(cpu, 3));
5465 
5466         return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5467 }
5468 
5469 int
5470 cpuid_is_cmt(cpu_t *cpu)
5471 {
5472         if (cpu == NULL)
5473                 cpu = CPU;
5474 
5475         ASSERT(cpuid_checkpass(cpu, 1));
5476 
5477         return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5478 }
5479 
5480 /*
5481  * AMD and Intel both implement the 64-bit variant of the syscall
5482  * instruction (syscallq), so if there's -any- support for syscall,
5483  * cpuid currently says "yes, we support this".
5484  *
5485  * However, Intel decided to -not- implement the 32-bit variant of the
5486  * syscall instruction, so we provide a predicate to allow our caller
5487  * to test that subtlety here.
5488  *
5489  * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5490  *      even in the case where the hardware would in fact support it.
5491  */
5492 /*ARGSUSED*/
5493 int
5494 cpuid_syscall32_insn(cpu_t *cpu)
5495 {
5496         ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5497 
5498 #if !defined(__xpv)
5499         if (cpu == NULL)
5500                 cpu = CPU;
5501 
5502         /*CSTYLED*/
5503         {
5504                 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5505 
5506                 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5507                     cpi->cpi_xmaxeax >= 0x80000001 &&
5508                     (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5509                         return (1);
5510         }
5511 #endif
5512         return (0);
5513 }
5514 
5515 int
5516 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5517 {
5518         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5519 
5520         static const char fmt[] =
5521             "x86 (%s %X family %d model %d step %d clock %d MHz)";
5522         static const char fmt_ht[] =
5523             "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5524 
5525         ASSERT(cpuid_checkpass(cpu, 1));
5526 
5527         if (cpuid_is_cmt(cpu))
5528                 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5529                     cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5530                     cpi->cpi_family, cpi->cpi_model,
5531                     cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5532         return (snprintf(s, n, fmt,
5533             cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5534             cpi->cpi_family, cpi->cpi_model,
5535             cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5536 }
5537 
5538 const char *
5539 cpuid_getvendorstr(cpu_t *cpu)
5540 {
5541         ASSERT(cpuid_checkpass(cpu, 1));
5542         return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5543 }
5544 
5545 uint_t
5546 cpuid_getvendor(cpu_t *cpu)
5547 {
5548         ASSERT(cpuid_checkpass(cpu, 1));
5549         return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5550 }
5551 
5552 uint_t
5553 cpuid_getfamily(cpu_t *cpu)
5554 {
5555         ASSERT(cpuid_checkpass(cpu, 1));
5556         return (cpu->cpu_m.mcpu_cpi->cpi_family);
5557 }
5558 
5559 uint_t
5560 cpuid_getmodel(cpu_t *cpu)
5561 {
5562         ASSERT(cpuid_checkpass(cpu, 1));
5563         return (cpu->cpu_m.mcpu_cpi->cpi_model);
5564 }
5565 
5566 uint_t
5567 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5568 {
5569         ASSERT(cpuid_checkpass(cpu, 1));
5570         return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5571 }
5572 
5573 uint_t
5574 cpuid_get_ncore_per_chip(cpu_t *cpu)
5575 {
5576         ASSERT(cpuid_checkpass(cpu, 1));
5577         return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5578 }
5579 
5580 uint_t
5581 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5582 {
5583         ASSERT(cpuid_checkpass(cpu, 2));
5584         return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5585 }
5586 
5587 id_t
5588 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5589 {
5590         ASSERT(cpuid_checkpass(cpu, 2));
5591         return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5592 }
5593 
5594 uint_t
5595 cpuid_getstep(cpu_t *cpu)
5596 {
5597         ASSERT(cpuid_checkpass(cpu, 1));
5598         return (cpu->cpu_m.mcpu_cpi->cpi_step);
5599 }
5600 
5601 uint_t
5602 cpuid_getsig(struct cpu *cpu)
5603 {
5604         ASSERT(cpuid_checkpass(cpu, 1));
5605         return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5606 }
5607 
5608 uint32_t
5609 cpuid_getchiprev(struct cpu *cpu)
5610 {
5611         ASSERT(cpuid_checkpass(cpu, 1));
5612         return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5613 }
5614 
5615 const char *
5616 cpuid_getchiprevstr(struct cpu *cpu)
5617 {
5618         ASSERT(cpuid_checkpass(cpu, 1));
5619         return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5620 }
5621 
5622 uint32_t
5623 cpuid_getsockettype(struct cpu *cpu)
5624 {
5625         ASSERT(cpuid_checkpass(cpu, 1));
5626         return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5627 }
5628 
5629 const char *
5630 cpuid_getsocketstr(cpu_t *cpu)
5631 {
5632         static const char *socketstr = NULL;
5633         struct cpuid_info *cpi;
5634 
5635         ASSERT(cpuid_checkpass(cpu, 1));
5636         cpi = cpu->cpu_m.mcpu_cpi;
5637 
5638         /* Assume that socket types are the same across the system */
5639         if (socketstr == NULL)
5640                 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5641                     cpi->cpi_model, cpi->cpi_step);
5642 
5643 
5644         return (socketstr);
5645 }
5646 
5647 int
5648 cpuid_get_chipid(cpu_t *cpu)
5649 {
5650         ASSERT(cpuid_checkpass(cpu, 1));
5651 
5652         if (cpuid_is_cmt(cpu))
5653                 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5654         return (cpu->cpu_id);
5655 }
5656 
5657 id_t
5658 cpuid_get_coreid(cpu_t *cpu)
5659 {
5660         ASSERT(cpuid_checkpass(cpu, 1));
5661         return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5662 }
5663 
5664 int
5665 cpuid_get_pkgcoreid(cpu_t *cpu)
5666 {
5667         ASSERT(cpuid_checkpass(cpu, 1));
5668         return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5669 }
5670 
5671 int
5672 cpuid_get_clogid(cpu_t *cpu)
5673 {
5674         ASSERT(cpuid_checkpass(cpu, 1));
5675         return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5676 }
5677 
5678 int
5679 cpuid_get_cacheid(cpu_t *cpu)
5680 {
5681         ASSERT(cpuid_checkpass(cpu, 1));
5682         return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5683 }
5684 
5685 uint_t
5686 cpuid_get_procnodeid(cpu_t *cpu)
5687 {
5688         ASSERT(cpuid_checkpass(cpu, 1));
5689         return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5690 }
5691 
5692 uint_t
5693 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5694 {
5695         ASSERT(cpuid_checkpass(cpu, 1));
5696         return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5697 }
5698 
5699 uint_t
5700 cpuid_get_compunitid(cpu_t *cpu)
5701 {
5702         ASSERT(cpuid_checkpass(cpu, 1));
5703         return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5704 }
5705 
5706 uint_t
5707 cpuid_get_cores_per_compunit(cpu_t *cpu)
5708 {
5709         ASSERT(cpuid_checkpass(cpu, 1));
5710         return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5711 }
5712 
5713 /*ARGSUSED*/
5714 int
5715 cpuid_have_cr8access(cpu_t *cpu)
5716 {
5717 #if defined(__amd64)
5718         return (1);
5719 #else
5720         struct cpuid_info *cpi;
5721 
5722         ASSERT(cpu != NULL);
5723         cpi = cpu->cpu_m.mcpu_cpi;
5724         if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5725             (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5726                 return (1);
5727         return (0);
5728 #endif
5729 }
5730 
5731 uint32_t
5732 cpuid_get_apicid(cpu_t *cpu)
5733 {
5734         ASSERT(cpuid_checkpass(cpu, 1));
5735         if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5736                 return (UINT32_MAX);
5737         } else {
5738                 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5739         }
5740 }
5741 
5742 void
5743 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5744 {
5745         struct cpuid_info *cpi;
5746 
5747         if (cpu == NULL)
5748                 cpu = CPU;
5749         cpi = cpu->cpu_m.mcpu_cpi;
5750 
5751         ASSERT(cpuid_checkpass(cpu, 1));
5752 
5753         if (pabits)
5754                 *pabits = cpi->cpi_pabits;
5755         if (vabits)
5756                 *vabits = cpi->cpi_vabits;
5757 }
5758 
5759 size_t
5760 cpuid_get_xsave_size()
5761 {
5762         return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5763             sizeof (struct xsave_state)));
5764 }
5765 
5766 /*
5767  * Return true if the CPUs on this system require 'pointer clearing' for the
5768  * floating point error pointer exception handling. In the past, this has been
5769  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5770  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5771  * feature bit and is reflected in the cpi_fp_amd_save member.
5772  */
5773 boolean_t
5774 cpuid_need_fp_excp_handling()
5775 {
5776         return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5777             cpuid_info0.cpi_fp_amd_save != 0);
5778 }
5779 
5780 /*
5781  * Returns the number of data TLB entries for the given pagesize.
5782  * If it can't be computed, or isn't known, the routine returns
5783  * zero.  If you ask about an architecturally impossible pagesize,
5784  * the routine will panic (so that the hat implementor knows that
5785  * things are inconsistent).
5786  */
5787 uint_t
5788 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5789 {
5790         struct cpuid_info *cpi;
5791         uint_t dtlb_nent = 0;
5792 
5793         if (cpu == NULL)
5794                 cpu = CPU;
5795         cpi = cpu->cpu_m.mcpu_cpi;
5796 
5797         ASSERT(cpuid_checkpass(cpu, 1));
5798 
5799         /*
5800          * Check the L2 TLB info
5801          */
5802         if (cpi->cpi_xmaxeax >= 0x80000006) {
5803                 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5804 
5805                 switch (pagesize) {
5806 
5807                 case 4 * 1024:
5808                         /*
5809                          * All zero in the top 16 bits of the register
5810                          * indicates a unified TLB. Size is in low 16 bits.
5811                          */
5812                         if ((cp->cp_ebx & 0xffff0000) == 0)
5813                                 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5814                         else
5815                                 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5816                         break;
5817 
5818                 case 2 * 1024 * 1024:
5819                         if ((cp->cp_eax & 0xffff0000) == 0)
5820                                 dtlb_nent = cp->cp_eax & 0x0000ffff;
5821                         else
5822                                 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5823                         break;
5824 
5825                 default:
5826                         panic("unknown L2 pagesize");
5827                         /*NOTREACHED*/
5828                 }
5829         }
5830 
5831         if (dtlb_nent != 0)
5832                 return (dtlb_nent);
5833 
5834         /*
5835          * No L2 TLB support for this size, try L1.
5836          */
5837         if (cpi->cpi_xmaxeax >= 0x80000005) {
5838                 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5839 
5840                 switch (pagesize) {
5841                 case 4 * 1024:
5842                         dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5843                         break;
5844                 case 2 * 1024 * 1024:
5845                         dtlb_nent = BITX(cp->cp_eax, 23, 16);
5846                         break;
5847                 default:
5848                         panic("unknown L1 d-TLB pagesize");
5849                         /*NOTREACHED*/
5850                 }
5851         }
5852 
5853         return (dtlb_nent);
5854 }
5855 
5856 /*
5857  * Return 0 if the erratum is not present or not applicable, positive
5858  * if it is, and negative if the status of the erratum is unknown.
5859  *
5860  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5861  * Processors" #25759, Rev 3.57, August 2005
5862  */
5863 int
5864 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5865 {
5866         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5867         uint_t eax;
5868 
5869         /*
5870          * Bail out if this CPU isn't an AMD CPU, or if it's
5871          * a legacy (32-bit) AMD CPU.
5872          */
5873         if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5874             cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5875             cpi->cpi_family == 6) {
5876                 return (0);
5877         }
5878 
5879         eax = cpi->cpi_std[1].cp_eax;
5880 
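     /*
      * The macros below match raw leaf-1 %eax signatures (stepping in bits
      * 3:0, model in 7:4, family in 11:8, extended model in 19:16, extended
      * family in 27:20) for specific Opteron/Athlon 64 silicon revisions.
      */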
5881 #define SH_B0(eax)      (eax == 0xf40 || eax == 0xf50)
5882 #define SH_B3(eax)      (eax == 0xf51)
5883 #define B(eax)          (SH_B0(eax) || SH_B3(eax))
5884 
5885 #define SH_C0(eax)      (eax == 0xf48 || eax == 0xf58)
5886 
5887 #define SH_CG(eax)      (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5888 #define DH_CG(eax)      (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5889 #define CH_CG(eax)      (eax == 0xf82 || eax == 0xfb2)
5890 #define CG(eax)         (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5891 
5892 #define SH_D0(eax)      (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5893 #define DH_D0(eax)      (eax == 0x10fc0 || eax == 0x10ff0)
5894 #define CH_D0(eax)      (eax == 0x10f80 || eax == 0x10fb0)
5895 #define D0(eax)         (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5896 
5897 #define SH_E0(eax)      (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5898 #define JH_E1(eax)      (eax == 0x20f10)        /* JH8_E0 had 0x20f30 */
5899 #define DH_E3(eax)      (eax == 0x20fc0 || eax == 0x20ff0)
5900 #define SH_E4(eax)      (eax == 0x20f51 || eax == 0x20f71)
5901 #define BH_E4(eax)      (eax == 0x20fb1)
5902 #define SH_E5(eax)      (eax == 0x20f42)
5903 #define DH_E6(eax)      (eax == 0x20ff2 || eax == 0x20fc2)
5904 #define JH_E6(eax)      (eax == 0x20f12 || eax == 0x20f32)
5905 #define EX(eax)         (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5906                             SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5907                             DH_E6(eax) || JH_E6(eax))
5908 
5909 #define DR_AX(eax)      (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5910 #define DR_B0(eax)      (eax == 0x100f20)
5911 #define DR_B1(eax)      (eax == 0x100f21)
5912 #define DR_BA(eax)      (eax == 0x100f2a)
5913 #define DR_B2(eax)      (eax == 0x100f22)
5914 #define DR_B3(eax)      (eax == 0x100f23)
5915 #define RB_C0(eax)      (eax == 0x100f40)
5916 
5917         switch (erratum) {
5918         case 1:
5919                 return (cpi->cpi_family < 0x10);
5920         case 51:        /* what does the asterisk mean? */
5921                 return (B(eax) || SH_C0(eax) || CG(eax));
5922         case 52:
5923                 return (B(eax));
5924         case 57:
5925                 return (cpi->cpi_family <= 0x11);
5926         case 58:
5927                 return (B(eax));
5928         case 60:
5929                 return (cpi->cpi_family <= 0x11);
5930         case 61:
5931         case 62:
5932         case 63:
5933         case 64:
5934         case 65:
5935         case 66:
5936         case 68:
5937         case 69:
5938         case 70:
5939         case 71:
5940                 return (B(eax));
5941         case 72:
5942                 return (SH_B0(eax));
5943         case 74:
5944                 return (B(eax));
5945         case 75:
5946                 return (cpi->cpi_family < 0x10);
5947         case 76:
5948                 return (B(eax));
5949         case 77:
5950                 return (cpi->cpi_family <= 0x11);
5951         case 78:
5952                 return (B(eax) || SH_C0(eax));
5953         case 79:
5954                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5955         case 80:
5956         case 81:
5957         case 82:
5958                 return (B(eax));
5959         case 83:
5960                 return (B(eax) || SH_C0(eax) || CG(eax));
5961         case 85:
5962                 return (cpi->cpi_family < 0x10);
5963         case 86:
5964                 return (SH_C0(eax) || CG(eax));
5965         case 88:
5966 #if !defined(__amd64)
5967                 return (0);
5968 #else
5969                 return (B(eax) || SH_C0(eax));
5970 #endif
5971         case 89:
5972                 return (cpi->cpi_family < 0x10);
5973         case 90:
5974                 return (B(eax) || SH_C0(eax) || CG(eax));
5975         case 91:
5976         case 92:
5977                 return (B(eax) || SH_C0(eax));
5978         case 93:
5979                 return (SH_C0(eax));
5980         case 94:
5981                 return (B(eax) || SH_C0(eax) || CG(eax));
5982         case 95:
5983 #if !defined(__amd64)
5984                 return (0);
5985 #else
5986                 return (B(eax) || SH_C0(eax));
5987 #endif
5988         case 96:
5989                 return (B(eax) || SH_C0(eax) || CG(eax));
5990         case 97:
5991         case 98:
5992                 return (SH_C0(eax) || CG(eax));
5993         case 99:
5994                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5995         case 100:
5996                 return (B(eax) || SH_C0(eax));
5997         case 101:
5998         case 103:
5999                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6000         case 104:
6001                 return (SH_C0(eax) || CG(eax) || D0(eax));
6002         case 105:
6003         case 106:
6004         case 107:
6005                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6006         case 108:
6007                 return (DH_CG(eax));
6008         case 109:
6009                 return (SH_C0(eax) || CG(eax) || D0(eax));
6010         case 110:
6011                 return (D0(eax) || EX(eax));
6012         case 111:
6013                 return (CG(eax));
6014         case 112:
6015                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6016         case 113:
6017                 return (eax == 0x20fc0);
6018         case 114:
6019                 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6020         case 115:
6021                 return (SH_E0(eax) || JH_E1(eax));
6022         case 116:
6023                 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6024         case 117:
6025                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6026         case 118:
6027                 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6028                     JH_E6(eax));
6029         case 121:
6030                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6031         case 122:
6032                 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6033         case 123:
6034                 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6035         case 131:
6036                 return (cpi->cpi_family < 0x10);
6037         case 6336786:
6038 
6039                 /*
6040                  * Test for AdvPowerMgmtInfo.TscPStateInvariant
6041                  * if this is a K8 family or newer processor. We're testing for
6042                  * this 'erratum' to determine whether or not we have a constant
6043                  * TSC.
6044                  *
6045                  * Our current fix for this is to disable the C1-Clock ramping.
6046                  * However, this doesn't work on newer processor families nor
6047                  * does it work when virtualized as those devices don't exist.
6048                  */
6049                 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6050                         return (0);
6051                 }
6052 
6053                 if (CPI_FAMILY(cpi) == 0xf) {
6054                         struct cpuid_regs regs;
6055                         regs.cp_eax = 0x80000007;
6056                         (void) __cpuid_insn(&regs);
6057                         return (!(regs.cp_edx & 0x100));
6058                 }
6059                 return (0);
6060         case 6323525:
6061                 /*
6062                  * This erratum (K8 #147) is not present on family 10 and newer.
6063                  */
6064                 if (cpi->cpi_family >= 0x10) {
6065                         return (0);
6066                 }
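                     /*
                      * The expression below reassembles the display family
                      * and model ((family << 8) | model, stepping ignored)
                      * from the raw signature and returns true for values
                      * below 0xf40, i.e. family 0xf parts with a model
                      * below 0x40.
                      */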
6067                 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6068                     (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6069 
6070         case 6671130:
6071                 /*
6072                  * Check for processors (pre-Shanghai) that do not provide
6073                  * optimal management of 1GB PTEs in their TLB.
6074                  */
6075                 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6076 
6077         case 298:
6078                 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6079                     DR_B2(eax) || RB_C0(eax));
6080 
6081         case 721:
6082 #if defined(__amd64)
6083                 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6084 #else
6085                 return (0);
6086 #endif
6087 
6088         default:
6089                 return (-1);
6090 
6091         }
6092 }
6093 
6094 /*
6095  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6096  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6097  */
6098 int
6099 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6100 {
6101         struct cpuid_info       *cpi;
6102         uint_t                  osvwid;
6103         static int              osvwfeature = -1;
6104         uint64_t                osvwlength;
6105 
6106 
6107         cpi = cpu->cpu_m.mcpu_cpi;
6108 
6109         /* confirm OSVW supported */
6110         if (osvwfeature == -1) {
6111                 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6112         } else {
6113                 /* assert that osvw feature setting is consistent on all cpus */
6114                 ASSERT(osvwfeature ==
6115                     (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6116         }
6117         if (!osvwfeature)
6118                 return (-1);
6119 
6120         osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6121 
6122         switch (erratum) {
6123         case 298:       /* osvwid is 0 */
6124                 osvwid = 0;
6125                 if (osvwlength <= (uint64_t)osvwid) {
6126                         /* osvwid 0 is unknown */
6127                         return (-1);
6128                 }
6129 
6130                 /*
6131                  * Check the OSVW STATUS MSR to determine the state
6132                  * of the erratum where:
6133                  *   0 - fixed by HW
6134                  *   1 - BIOS has applied the workaround when a BIOS
6135                  *   workaround is available (or, for other errata, an
6136                  *   OS workaround is required).
6137                  * For a value of 1, caller will confirm that the
6138                  * erratum 298 workaround has indeed been applied by BIOS.
6139                  *
6140                  * A 1 may be set on cpus that have a HW fix applied,
6141                  * in a mixed-cpu system. Regarding erratum 298:
6142                  *   In a multiprocessor platform, the workaround above
6143                  *   should be applied to all processors regardless of
6144                  *   silicon revision when an affected processor is
6145                  *   present.
6146                  */
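                     /*
                      * Illustrative arithmetic, assuming the usual layout of 64
                      * status bits per OSVW status MSR (OSVW_ID_CNT_PER_MSR):
                      * for osvwid 0 the read below targets
                      * MSR_AMD_OSVW_STATUS + (0 / 64), i.e. the first status
                      * MSR, and tests bit 0 (0 % 64).
                      */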
6147 
6148                 return (rdmsr(MSR_AMD_OSVW_STATUS +
6149                     (osvwid / OSVW_ID_CNT_PER_MSR)) &
6150                     (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6151 
6152         default:
6153                 return (-1);
6154         }
6155 }
6156 
6157 static const char assoc_str[] = "associativity";
6158 static const char line_str[] = "line-size";
6159 static const char size_str[] = "size";
6160 
6161 static void
6162 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6163     uint32_t val)
6164 {
6165         char buf[128];
6166 
6167         /*
6168          * ndi_prop_update_int() is used because it is desirable for
6169          * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
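              *
              * The resulting property name is "<label>-<type>", e.g.
              * "l2-cache-size".  Names that would not fit in buf are
              * silently dropped by the snprintf() length check below.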
6170          */
6171         if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6172                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6173 }
6174 
6175 /*
6176  * Intel-style cache/tlb description
6177  *
6178  * Standard cpuid level 2 gives a randomly ordered
6179  * selection of tags that index into a table that describes
6180  * cache and tlb properties.
6181  */
6182 
6183 static const char l1_icache_str[] = "l1-icache";
6184 static const char l1_dcache_str[] = "l1-dcache";
6185 static const char l2_cache_str[] = "l2-cache";
6186 static const char l3_cache_str[] = "l3-cache";
6187 static const char itlb4k_str[] = "itlb-4K";
6188 static const char dtlb4k_str[] = "dtlb-4K";
6189 static const char itlb2M_str[] = "itlb-2M";
6190 static const char itlb4M_str[] = "itlb-4M";
6191 static const char dtlb4M_str[] = "dtlb-4M";
6192 static const char dtlb24_str[] = "dtlb0-2M-4M";
6193 static const char itlb424_str[] = "itlb-4K-2M-4M";
6194 static const char itlb24_str[] = "itlb-2M-4M";
6195 static const char dtlb44_str[] = "dtlb-4K-4M";
6196 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6197 static const char sl2_cache_str[] = "sectored-l2-cache";
6198 static const char itrace_str[] = "itrace-cache";
6199 static const char sl3_cache_str[] = "sectored-l3-cache";
6200 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6201 
6202 static const struct cachetab {
6203         uint8_t         ct_code;
6204         uint8_t         ct_assoc;
6205         uint16_t        ct_line_size;
6206         size_t          ct_size;
6207         const char      *ct_label;
6208 } intel_ctab[] = {
6209         /*
6210          * maintain descending order!
6211          *
6212          * Codes ignored - Reason
6213          * ----------------------
6214          * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6215          * f0H/f1H - Currently we do not interpret prefetch size by design
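              *
              * Note: ct_size is in bytes for caches and in entries for TLBs,
              * and ct_line_size is 0 for TLB descriptors.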
6216          */
6217         { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6218         { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6219         { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6220         { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6221         { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6222         { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6223         { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6224         { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6225         { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6226         { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6227         { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6228         { 0xd0, 4, 64, 512*1024, l3_cache_str},
6229         { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6230         { 0xc0, 4, 0, 8, dtlb44_str },
6231         { 0xba, 4, 0, 64, dtlb4k_str },
6232         { 0xb4, 4, 0, 256, dtlb4k_str },
6233         { 0xb3, 4, 0, 128, dtlb4k_str },
6234         { 0xb2, 4, 0, 64, itlb4k_str },
6235         { 0xb0, 4, 0, 128, itlb4k_str },
6236         { 0x87, 8, 64, 1024*1024, l2_cache_str},
6237         { 0x86, 4, 64, 512*1024, l2_cache_str},
6238         { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6239         { 0x84, 8, 32, 1024*1024, l2_cache_str},
6240         { 0x83, 8, 32, 512*1024, l2_cache_str},
6241         { 0x82, 8, 32, 256*1024, l2_cache_str},
6242         { 0x80, 8, 64, 512*1024, l2_cache_str},
6243         { 0x7f, 2, 64, 512*1024, l2_cache_str},
6244         { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6245         { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6246         { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6247         { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6248         { 0x79, 8, 64, 128*1024, sl2_cache_str},
6249         { 0x78, 8, 64, 1024*1024, l2_cache_str},
6250         { 0x73, 8, 0, 64*1024, itrace_str},
6251         { 0x72, 8, 0, 32*1024, itrace_str},
6252         { 0x71, 8, 0, 16*1024, itrace_str},
6253         { 0x70, 8, 0, 12*1024, itrace_str},
6254         { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6255         { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6256         { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6257         { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6258         { 0x5d, 0, 0, 256, dtlb44_str},
6259         { 0x5c, 0, 0, 128, dtlb44_str},
6260         { 0x5b, 0, 0, 64, dtlb44_str},
6261         { 0x5a, 4, 0, 32, dtlb24_str},
6262         { 0x59, 0, 0, 16, dtlb4k_str},
6263         { 0x57, 4, 0, 16, dtlb4k_str},
6264         { 0x56, 4, 0, 16, dtlb4M_str},
6265         { 0x55, 0, 0, 7, itlb24_str},
6266         { 0x52, 0, 0, 256, itlb424_str},
6267         { 0x51, 0, 0, 128, itlb424_str},
6268         { 0x50, 0, 0, 64, itlb424_str},
6269         { 0x4f, 0, 0, 32, itlb4k_str},
6270         { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6271         { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6272         { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6273         { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6274         { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6275         { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6276         { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6277         { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6278         { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6279         { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6280         { 0x44, 4, 32, 1024*1024, l2_cache_str},
6281         { 0x43, 4, 32, 512*1024, l2_cache_str},
6282         { 0x42, 4, 32, 256*1024, l2_cache_str},
6283         { 0x41, 4, 32, 128*1024, l2_cache_str},
6284         { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6285         { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6286         { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6287         { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6288         { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6289         { 0x39, 4, 64, 128*1024, sl2_cache_str},
6290         { 0x30, 8, 64, 32*1024, l1_icache_str},
6291         { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6292         { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6293         { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6294         { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6295         { 0x22, 4, 64, 512*1024, sl3_cache_str},
6296         { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6297         { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6298         { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6299         { 0x0b, 4, 0, 4, itlb4M_str},
6300         { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6301         { 0x08, 4, 32, 16*1024, l1_icache_str},
6302         { 0x06, 4, 32, 8*1024, l1_icache_str},
6303         { 0x05, 4, 0, 32, dtlb4M_str},
6304         { 0x04, 4, 0, 8, dtlb4M_str},
6305         { 0x03, 4, 0, 64, dtlb4k_str},
6306         { 0x02, 4, 0, 2, itlb4M_str},
6307         { 0x01, 4, 0, 32, itlb4k_str},
6308         { 0 }
6309 };
6310 
6311 static const struct cachetab cyrix_ctab[] = {
6312         { 0x70, 4, 0, 32, "tlb-4K" },
6313         { 0x80, 4, 16, 16*1024, "l1-cache" },
6314         { 0 }
6315 };
6316 
6317 /*
6318  * Search a cache table for a matching entry
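      * The table must be kept in descending ct_code order; the scan stops at
      * the first entry whose code is <= the one being looked up, so an exact
      * match is found only if it is present.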
6319  */
6320 static const struct cachetab *
6321 find_cacheent(const struct cachetab *ct, uint_t code)
6322 {
6323         if (code != 0) {
6324                 for (; ct->ct_code != 0; ct++)
6325                         if (ct->ct_code <= code)
6326                                 break;
6327                 if (ct->ct_code == code)
6328                         return (ct);
6329         }
6330         return (NULL);
6331 }
6332 
6333 /*
6334  * Populate cachetab entry with L2 or L3 cache-information using
6335  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6336  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6337  * information is found.
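      *
      * Leaf 4 reports each field minus one, so the size computed below is
      * (ways) * (partitions) * (line size) * (sets), with %ecx holding
      * sets - 1.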
6338  */
6339 static int
6340 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6341 {
6342         uint32_t level, i;
6343         int ret = 0;
6344 
6345         for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6346                 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6347 
6348                 if (level == 2 || level == 3) {
6349                         ct->ct_assoc =
6350                             CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6351                         ct->ct_line_size =
6352                             CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6353                         ct->ct_size = ct->ct_assoc *
6354                             (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6355                             ct->ct_line_size *
6356                             (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6357 
6358                         if (level == 2) {
6359                                 ct->ct_label = l2_cache_str;
6360                         } else if (level == 3) {
6361                                 ct->ct_label = l3_cache_str;
6362                         }
6363                         ret = 1;
6364                 }
6365         }
6366 
6367         return (ret);
6368 }
6369 
6370 /*
6371  * Walk the cacheinfo descriptor list, applying 'func' to every valid
6372  * element.  The walk is terminated if the walker returns non-zero.
6373  */
6374 static void
6375 intel_walk_cacheinfo(struct cpuid_info *cpi,
6376     void *arg, int (*func)(void *, const struct cachetab *))
6377 {
6378         const struct cachetab *ct;
6379         struct cachetab des_49_ct, des_b1_ct;
6380         uint8_t *dp;
6381         int i;
6382 
6383         if ((dp = cpi->cpi_cacheinfo) == NULL)
6384                 return;
6385         for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6386                 /*
6387                  * For overloaded descriptor 0x49 we use cpuid function 4
6388                  * if supported by the current processor, to create
6389                  * cache information.
6390                  * For overloaded descriptor 0xb1 we use X86_PAE flag
6391                  * to disambiguate the cache information.
6392                  */
6393                 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6394                     intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6395                         ct = &des_49_ct;
6396                 } else if (*dp == 0xb1) {
6397                         des_b1_ct.ct_code = 0xb1;
6398                         des_b1_ct.ct_assoc = 4;
6399                         des_b1_ct.ct_line_size = 0;
6400                         if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6401                                 des_b1_ct.ct_size = 8;
6402                                 des_b1_ct.ct_label = itlb2M_str;
6403                         } else {
6404                                 des_b1_ct.ct_size = 4;
6405                                 des_b1_ct.ct_label = itlb4M_str;
6406                         }
6407                         ct = &des_b1_ct;
6408                 } else {
6409                         if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6410                                 continue;
6411                         }
6412                 }
6413 
6414                 if (func(arg, ct) != 0) {
6415                         break;
6416                 }
6417         }
6418 }
6419 
6420 /*
6421  * (Like the Intel one, except for Cyrix CPUs)
6422  */
6423 static void
6424 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6425     void *arg, int (*func)(void *, const struct cachetab *))
6426 {
6427         const struct cachetab *ct;
6428         uint8_t *dp;
6429         int i;
6430 
6431         if ((dp = cpi->cpi_cacheinfo) == NULL)
6432                 return;
6433         for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6434                 /*
6435                  * Search Cyrix-specific descriptor table first ..
6436                  */
6437                 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6438                         if (func(arg, ct) != 0)
6439                                 break;
6440                         continue;
6441                 }
6442                 /*
6443                  * .. else fall back to the Intel one
6444                  */
6445                 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6446                         if (func(arg, ct) != 0)
6447                                 break;
6448                         continue;
6449                 }
6450         }
6451 }
6452 
6453 /*
6454  * A cacheinfo walker that adds associativity, line-size, and size properties
6455  * to the devinfo node it is passed as an argument.
6456  */
6457 static int
6458 add_cacheent_props(void *arg, const struct cachetab *ct)
6459 {
6460         dev_info_t *devi = arg;
6461 
6462         add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6463         if (ct->ct_line_size != 0)
6464                 add_cache_prop(devi, ct->ct_label, line_str,
6465                     ct->ct_line_size);
6466         add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6467         return (0);
6468 }
6469 
6470 
6471 static const char fully_assoc[] = "fully-associative?";
6472 
6473 /*
6474  * AMD style cache/tlb description
6475  *
6476  * Extended functions 5 and 6 directly describe properties of
6477  * tlbs and various cache levels.
6478  */
6479 static void
6480 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6481 {
6482         switch (assoc) {
6483         case 0: /* reserved; ignore */
6484                 break;
6485         default:
6486                 add_cache_prop(devi, label, assoc_str, assoc);
6487                 break;
6488         case 0xff:
6489                 add_cache_prop(devi, label, fully_assoc, 1);
6490                 break;
6491         }
6492 }
6493 
6494 static void
6495 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6496 {
6497         if (size == 0)
6498                 return;
6499         add_cache_prop(devi, label, size_str, size);
6500         add_amd_assoc(devi, label, assoc);
6501 }
6502 
6503 static void
6504 add_amd_cache(dev_info_t *devi, const char *label,
6505     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6506 {
6507         if (size == 0 || line_size == 0)
6508                 return;
6509         add_amd_assoc(devi, label, assoc);
6510         /*
6511          * Most AMD parts have a sectored cache. Multiple cache lines are
6512          * associated with each tag. A sector consists of all cache lines
6513          * associated with a tag. For example, the AMD K6-III has a sector
6514          * size of 2 cache lines per tag.
6515          */
6516         if (lines_per_tag != 0)
6517                 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6518         add_cache_prop(devi, label, line_str, line_size);
6519         add_cache_prop(devi, label, size_str, size * 1024);
6520 }
6521 
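     /*
      * For the L2/L3 caches and TLBs described by extended function 6, the
      * associativity is encoded in a 4-bit field rather than reported
      * directly; only the encodings handled below are mapped (see also the
      * amd_afd[] table further down).
      */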
6522 static void
6523 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6524 {
6525         switch (assoc) {
6526         case 0: /* off */
6527                 break;
6528         case 1:
6529         case 2:
6530         case 4:
6531                 add_cache_prop(devi, label, assoc_str, assoc);
6532                 break;
6533         case 6:
6534                 add_cache_prop(devi, label, assoc_str, 8);
6535                 break;
6536         case 8:
6537                 add_cache_prop(devi, label, assoc_str, 16);
6538                 break;
6539         case 0xf:
6540                 add_cache_prop(devi, label, fully_assoc, 1);
6541                 break;
6542         default: /* reserved; ignore */
6543                 break;
6544         }
6545 }
6546 
6547 static void
6548 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6549 {
6550         if (size == 0 || assoc == 0)
6551                 return;
6552         add_amd_l2_assoc(devi, label, assoc);
6553         add_cache_prop(devi, label, size_str, size);
6554 }
6555 
6556 static void
6557 add_amd_l2_cache(dev_info_t *devi, const char *label,
6558     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6559 {
6560         if (size == 0 || assoc == 0 || line_size == 0)
6561                 return;
6562         add_amd_l2_assoc(devi, label, assoc);
6563         if (lines_per_tag != 0)
6564                 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6565         add_cache_prop(devi, label, line_str, line_size);
6566         add_cache_prop(devi, label, size_str, size * 1024);
6567 }
6568 
6569 static void
6570 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6571 {
6572         struct cpuid_regs *cp;
6573 
6574         if (cpi->cpi_xmaxeax < 0x80000005)
6575                 return;
6576         cp = &cpi->cpi_extd[5];
6577 
6578         /*
6579          * 4M/2M L1 TLB configuration
6580          *
6581          * We report the size for 2M pages because AMD uses two
6582          * TLB entries for one 4M page.
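              *
              * %eax of extended function 5, as consumed below:
              * [31:24] dTLB associativity, [23:16] dTLB entries,
              * [15:8] iTLB associativity, [7:0] iTLB entries
              * (all for 2M/4M pages).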
6583          */
6584         add_amd_tlb(devi, "dtlb-2M",
6585             BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6586         add_amd_tlb(devi, "itlb-2M",
6587             BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6588 
6589         /*
6590          * 4K L1 TLB configuration
6591          */
6592 
6593         switch (cpi->cpi_vendor) {
6594                 uint_t nentries;
6595         case X86_VENDOR_TM:
6596                 if (cpi->cpi_family >= 5) {
6597                         /*
6598                          * Crusoe processors have 256 TLB entries, but
6599                          * cpuid data format constrains them to only
6600                          * reporting 255 of them.
6601                          */
6602                         if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6603                                 nentries = 256;
6604                         /*
6605                          * Crusoe processors also have a unified TLB
6606                          */
6607                         add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6608                             nentries);
6609                         break;
6610                 }
6611                 /*FALLTHROUGH*/
6612         default:
6613                 add_amd_tlb(devi, itlb4k_str,
6614                     BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6615                 add_amd_tlb(devi, dtlb4k_str,
6616                     BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6617                 break;
6618         }
6619 
6620         /*
6621          * data L1 cache configuration
6622          */
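             /*
              * %ecx (data) and %edx (code) of extended function 5, as consumed
              * below: [31:24] size in KB, [23:16] associativity,
              * [15:8] lines per tag, [7:0] line size in bytes.
              */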
6623 
6624         add_amd_cache(devi, l1_dcache_str,
6625             BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6626             BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6627 
6628         /*
6629          * code L1 cache configuration
6630          */
6631 
6632         add_amd_cache(devi, l1_icache_str,
6633             BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6634             BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6635 
6636         if (cpi->cpi_xmaxeax < 0x80000006)
6637                 return;
6638         cp = &cpi->cpi_extd[6];
6639 
6640         /* Check for a unified L2 TLB for large pages */
6641 
6642         if (BITX(cp->cp_eax, 31, 16) == 0)
6643                 add_amd_l2_tlb(devi, "l2-tlb-2M",
6644                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6645         else {
6646                 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6647                     BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6648                 add_amd_l2_tlb(devi, "l2-itlb-2M",
6649                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6650         }
6651 
6652         /* Check for a unified L2 TLB for 4K pages */
6653 
6654         if (BITX(cp->cp_ebx, 31, 16) == 0) {
6655                 add_amd_l2_tlb(devi, "l2-tlb-4K",
6656                     BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6657         } else {
6658                 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6659                     BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
6660                 add_amd_l2_tlb(devi, "l2-itlb-4K",
6661                     BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6662         }
6663 
6664         add_amd_l2_cache(devi, l2_cache_str,
6665             BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6666             BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6667 }
6668 
6669 /*
6670  * There are two basic ways that the x86 world describes its cache
6671  * and tlb architecture - Intel's way and AMD's way.
6672  *
6673  * Return which flavor of cache architecture we should use
6674  */
6675 static int
6676 x86_which_cacheinfo(struct cpuid_info *cpi)
6677 {
6678         switch (cpi->cpi_vendor) {
6679         case X86_VENDOR_Intel:
6680                 if (cpi->cpi_maxeax >= 2)
6681                         return (X86_VENDOR_Intel);
6682                 break;
6683         case X86_VENDOR_AMD:
6684                 /*
6685                  * The K5 model 1 was the first part from AMD that reported
6686                  * cache sizes via extended cpuid functions.
6687                  */
6688                 if (cpi->cpi_family > 5 ||
6689                     (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6690                         return (X86_VENDOR_AMD);
6691                 break;
6692         case X86_VENDOR_TM:
6693                 if (cpi->cpi_family >= 5)
6694                         return (X86_VENDOR_AMD);
6695                 /*FALLTHROUGH*/
6696         default:
6697                 /*
6698                  * If they have extended CPU data for 0x80000005
6699                  * then we assume they have AMD-format cache
6700                  * information.
6701                  *
6702                  * If not, and the vendor happens to be Cyrix,
6703  * then try our Cyrix-specific handler.
6704                  *
6705                  * If we're not Cyrix, then assume we're using Intel's
6706                  * table-driven format instead.
6707                  */
6708                 if (cpi->cpi_xmaxeax >= 0x80000005)
6709                         return (X86_VENDOR_AMD);
6710                 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6711                         return (X86_VENDOR_Cyrix);
6712                 else if (cpi->cpi_maxeax >= 2)
6713                         return (X86_VENDOR_Intel);
6714                 break;
6715         }
6716         return (-1);
6717 }
6718 
6719 void
6720 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6721     struct cpuid_info *cpi)
6722 {
6723         dev_info_t *cpu_devi;
6724         int create;
6725 
6726         cpu_devi = (dev_info_t *)dip;
6727 
6728         /* device_type */
6729         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6730             "device_type", "cpu");
6731 
6732         /* reg */
6733         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6734             "reg", cpu_id);
6735 
6736         /* cpu-mhz, and clock-frequency */
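             /*
              * "clock-frequency" is an int property, so it is omitted when
              * cpu_freq * 1000000 would overflow an int (i.e. for CPUs faster
              * than roughly 2.1 GHz).
              */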
6737         if (cpu_freq > 0) {
6738                 long long mul;
6739 
6740                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6741                     "cpu-mhz", cpu_freq);
6742                 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6743                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6744                             "clock-frequency", (int)mul);
6745         }
6746 
6747         if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6748                 return;
6749         }
6750 
6751         /* vendor-id */
6752         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6753             "vendor-id", cpi->cpi_vendorstr);
6754 
6755         if (cpi->cpi_maxeax == 0) {
6756                 return;
6757         }
6758 
6759         /*
6760          * family, model, and step
6761          */
6762         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6763             "family", CPI_FAMILY(cpi));
6764         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6765             "cpu-model", CPI_MODEL(cpi));
6766         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6767             "stepping-id", CPI_STEP(cpi));
6768 
6769         /* type */
6770         switch (cpi->cpi_vendor) {
6771         case X86_VENDOR_Intel:
6772                 create = 1;
6773                 break;
6774         default:
6775                 create = 0;
6776                 break;
6777         }
6778         if (create)
6779                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6780                     "type", CPI_TYPE(cpi));
6781 
6782         /* ext-family */
6783         switch (cpi->cpi_vendor) {
6784         case X86_VENDOR_Intel:
6785         case X86_VENDOR_AMD:
6786                 create = cpi->cpi_family >= 0xf;
6787                 break;
6788         default:
6789                 create = 0;
6790                 break;
6791         }
6792         if (create)
6793                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6794                     "ext-family", CPI_FAMILY_XTD(cpi));
6795 
6796         /* ext-model */
6797         switch (cpi->cpi_vendor) {
6798         case X86_VENDOR_Intel:
6799                 create = IS_EXTENDED_MODEL_INTEL(cpi);
6800                 break;
6801         case X86_VENDOR_AMD:
6802                 create = CPI_FAMILY(cpi) == 0xf;
6803                 break;
6804         default:
6805                 create = 0;
6806                 break;
6807         }
6808         if (create)
6809                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6810                     "ext-model", CPI_MODEL_XTD(cpi));
6811 
6812         /* generation */
6813         switch (cpi->cpi_vendor) {
6814         case X86_VENDOR_AMD:
6815                 /*
6816                  * AMD K5 model 1 was the first part to support this
6817                  */
6818                 create = cpi->cpi_xmaxeax >= 0x80000001;
6819                 break;
6820         default:
6821                 create = 0;
6822                 break;
6823         }
6824         if (create)
6825                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6826                     "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6827 
6828         /* brand-id */
6829         switch (cpi->cpi_vendor) {
6830         case X86_VENDOR_Intel:
6831                 /*
6832                  * Brand ID first appeared on Pentium III Xeon model 8 and
6833                  * Celeron model 8 processors, and on AMD Opteron.
6834                  */
6835                 create = cpi->cpi_family > 6 ||
6836                     (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6837                 break;
6838         case X86_VENDOR_AMD:
6839                 create = cpi->cpi_family >= 0xf;
6840                 break;
6841         default:
6842                 create = 0;
6843                 break;
6844         }
6845         if (create && cpi->cpi_brandid != 0) {
6846                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6847                     "brand-id", cpi->cpi_brandid);
6848         }
6849 
6850         /* chunks, and apic-id */
6851         switch (cpi->cpi_vendor) {
6852                 /*
6853                  * first available on Pentium IV and Opteron (K8)
6854                  */
6855         case X86_VENDOR_Intel:
6856                 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6857                 break;
6858         case X86_VENDOR_AMD:
6859                 create = cpi->cpi_family >= 0xf;
6860                 break;
6861         default:
6862                 create = 0;
6863                 break;
6864         }
6865         if (create) {
6866                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6867                     "chunks", CPI_CHUNKS(cpi));
6868                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6869                     "apic-id", cpi->cpi_apicid);
6870                 if (cpi->cpi_chipid >= 0) {
6871                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6872                             "chip#", cpi->cpi_chipid);
6873                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6874                             "clog#", cpi->cpi_clogid);
6875                 }
6876         }
6877 
6878         /* cpuid-features */
6879         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6880             "cpuid-features", CPI_FEATURES_EDX(cpi));
6881 
6882 
6883         /* cpuid-features-ecx */
6884         switch (cpi->cpi_vendor) {
6885         case X86_VENDOR_Intel:
6886                 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6887                 break;
6888         case X86_VENDOR_AMD:
6889                 create = cpi->cpi_family >= 0xf;
6890                 break;
6891         default:
6892                 create = 0;
6893                 break;
6894         }
6895         if (create)
6896                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6897                     "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6898 
6899         /* ext-cpuid-features */
6900         switch (cpi->cpi_vendor) {
6901         case X86_VENDOR_Intel:
6902         case X86_VENDOR_AMD:
6903         case X86_VENDOR_Cyrix:
6904         case X86_VENDOR_TM:
6905         case X86_VENDOR_Centaur:
6906                 create = cpi->cpi_xmaxeax >= 0x80000001;
6907                 break;
6908         default:
6909                 create = 0;
6910                 break;
6911         }
6912         if (create) {
6913                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6914                     "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6915                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6916                     "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6917         }
6918 
6919         /*
6920          * Brand String first appeared in Intel Pentium IV, AMD K5
6921          * model 1, and Cyrix GXm.  On earlier models we try to
6922          * simulate something similar .. so this string should always
6923          * say -something- about the processor, however lame.
6924          */
6925         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6926             "brand-string", cpi->cpi_brandstr);
6927 
6928         /*
6929          * Finally, cache and tlb information
6930          */
6931         switch (x86_which_cacheinfo(cpi)) {
6932         case X86_VENDOR_Intel:
6933                 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6934                 break;
6935         case X86_VENDOR_Cyrix:
6936                 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6937                 break;
6938         case X86_VENDOR_AMD:
6939                 amd_cache_info(cpi, cpu_devi);
6940                 break;
6941         default:
6942                 break;
6943         }
6944 }
6945 
6946 struct l2info {
6947         int *l2i_csz;
6948         int *l2i_lsz;
6949         int *l2i_assoc;
6950         int l2i_ret;
6951 };
6952 
6953 /*
6954  * A cacheinfo walker that fetches the size, line-size and associativity
6955  * of the L2 cache
6956  */
6957 static int
6958 intel_l2cinfo(void *arg, const struct cachetab *ct)
6959 {
6960         struct l2info *l2i = arg;
6961         int *ip;
6962 
6963         if (ct->ct_label != l2_cache_str &&
6964             ct->ct_label != sl2_cache_str)
6965                 return (0);     /* not an L2 -- keep walking */
6966 
6967         if ((ip = l2i->l2i_csz) != NULL)
6968                 *ip = ct->ct_size;
6969         if ((ip = l2i->l2i_lsz) != NULL)
6970                 *ip = ct->ct_line_size;
6971         if ((ip = l2i->l2i_assoc) != NULL)
6972                 *ip = ct->ct_assoc;
6973         l2i->l2i_ret = ct->ct_size;
6974         return (1);             /* was an L2 -- terminate walk */
6975 }
6976 
6977 /*
6978  * AMD L2/L3 Cache and TLB Associativity Field Definition:
6979  *
6980  *      Unlike the associativity for the L1 cache and tlb where the 8 bit
6981  *      value is the associativity, the associativity for the L2 cache and
6982  *      tlb is encoded in the following table. The 4 bit L2 value serves as
6983  *      an index into the amd_afd[] array to determine the associativity.
6984  *      -1 is undefined. 0 is fully associative.
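      *      For example, an encoding of 6 maps to amd_afd[6] == 8, i.e. an
      *      8-way associative cache or tlb.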
6985  */
6986 
6987 static int amd_afd[] =
6988         {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
6989 
6990 static void
6991 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6992 {
6993         struct cpuid_regs *cp;
6994         uint_t size, assoc;
6995         int i;
6996         int *ip;
6997 
6998         if (cpi->cpi_xmaxeax < 0x80000006)
6999                 return;
7000         cp = &cpi->cpi_extd[6];
7001 
7002         if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7003             (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7004                 uint_t cachesz = size * 1024;
7005                 assoc = amd_afd[i];
7006 
7007                 ASSERT(assoc != -1);
7008 
7009                 if ((ip = l2i->l2i_csz) != NULL)
7010                         *ip = cachesz;
7011                 if ((ip = l2i->l2i_lsz) != NULL)
7012                         *ip = BITX(cp->cp_ecx, 7, 0);
7013                 if ((ip = l2i->l2i_assoc) != NULL)
7014                         *ip = assoc;
7015                 l2i->l2i_ret = cachesz;
7016         }
7017 }
7018 
7019 int
7020 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7021 {
7022         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7023         struct l2info __l2info, *l2i = &__l2info;
7024 
7025         l2i->l2i_csz = csz;
7026         l2i->l2i_lsz = lsz;
7027         l2i->l2i_assoc = assoc;
7028         l2i->l2i_ret = -1;
7029 
7030         switch (x86_which_cacheinfo(cpi)) {
7031         case X86_VENDOR_Intel:
7032                 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7033                 break;
7034         case X86_VENDOR_Cyrix:
7035                 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7036                 break;
7037         case X86_VENDOR_AMD:
7038                 amd_l2cacheinfo(cpi, l2i);
7039                 break;
7040         default:
7041                 break;
7042         }
7043         return (l2i->l2i_ret);
7044 }
7045 
7046 #if !defined(__xpv)
7047 
7048 uint32_t *
7049 cpuid_mwait_alloc(cpu_t *cpu)
7050 {
7051         uint32_t        *ret;
7052         size_t          mwait_size;
7053 
7054         ASSERT(cpuid_checkpass(CPU, 2));
7055 
7056         mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7057         if (mwait_size == 0)
7058                 return (NULL);
7059 
7060         /*
7061          * kmem_alloc() returns cache line size aligned data for mwait_size
7062          * allocations.  mwait_size is currently cache line sized.  Neither
7063          * of these implementation details is guaranteed to be true in the
7064          * future.
7065          *
7066          * First try allocating mwait_size as kmem_alloc() currently returns
7067          * correctly aligned memory.  If kmem_alloc() does not return
7068          * mwait_size-aligned memory, allocate twice the size and round up.
7069          *
7070          * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7071          * decide to free this memory.
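              *
              * For example, with a (hypothetical) mwait_size of 64: if the
              * kmem pointer is already 64-byte aligned it is used as-is;
              * otherwise we allocate 2 * 64 bytes and P2ROUNDUP() the pointer
              * to the next 64-byte boundary within that buffer.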
7072          */
7073         ret = kmem_zalloc(mwait_size, KM_SLEEP);
7074         if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7075                 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7076                 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7077                 *ret = MWAIT_RUNNING;
7078                 return (ret);
7079         } else {
7080                 kmem_free(ret, mwait_size);
7081                 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7082                 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7083                 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7084                 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7085                 *ret = MWAIT_RUNNING;
7086                 return (ret);
7087         }
7088 }
7089 
7090 void
7091 cpuid_mwait_free(cpu_t *cpu)
7092 {
7093         if (cpu->cpu_m.mcpu_cpi == NULL) {
7094                 return;
7095         }
7096 
7097         if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7098             cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7099                 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7100                     cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7101         }
7102 
7103         cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7104         cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7105 }
7106 
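     /*
      * Overwrite tsc_read() with one of several pre-assembled variants
      * (RDTSC combined with an LFENCE or MFENCE fence, RDTSCP, or a stub used
      * when no usable TSC exists), selected by 'flag'.  The paired
      * _start/_end linker symbols bound each variant so we know how many
      * bytes to copy.
      */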
7107 void
7108 patch_tsc_read(int flag)
7109 {
7110         size_t cnt;
7111 
7112         switch (flag) {
7113         case TSC_NONE:
7114                 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7115                 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7116                 break;
7117         case TSC_RDTSC_MFENCE:
7118                 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
7119                 (void) memcpy((void *)tsc_read,
7120                     (void *)&_tsc_mfence_start, cnt);
7121                 break;
7122         case TSC_RDTSC_LFENCE:
7123                 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7124                 (void) memcpy((void *)tsc_read,
7125                     (void *)&_tsc_lfence_start, cnt);
7126                 break;
7127         case TSC_TSCP:
7128                 cnt = &_tscp_end - &_tscp_start;
7129                 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7130                 break;
7131         default:
7132                 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7133                 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7134                 break;
7135         }
7136         tsc_type = flag;
7137 }
7138 
7139 int
7140 cpuid_deep_cstates_supported(void)
7141 {
7142         struct cpuid_info *cpi;
7143         struct cpuid_regs regs;
7144 
7145         ASSERT(cpuid_checkpass(CPU, 1));
7146 
7147         cpi = CPU->cpu_m.mcpu_cpi;
7148 
7149         if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
7150                 return (0);
7151 
7152         switch (cpi->cpi_vendor) {
7153         case X86_VENDOR_Intel:
7154                 if (cpi->cpi_xmaxeax < 0x80000007)
7155                         return (0);
7156 
7157                 /*
7158                  * Does the TSC run at a constant rate in all ACPI C-states?
7159                  */
7160                 regs.cp_eax = 0x80000007;
7161                 (void) __cpuid_insn(&regs);
7162                 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7163 
7164         default:
7165                 return (0);
7166         }
7167 }
7168 
7169 #endif  /* !__xpv */
7170 
7171 void
7172 post_startup_cpu_fixups(void)
7173 {
7174 #ifndef __xpv
7175         /*
7176          * Some AMD processors support C1E state. Entering this state will
7177          * cause the local APIC timer to stop, which we can't deal with at
7178          * this time.
7179          */
7180         if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7181                 on_trap_data_t otd;
7182                 uint64_t reg;
7183 
7184                 if (!on_trap(&otd, OT_DATA_ACCESS)) {
7185                         reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7186                         /* Disable C1E state if it is enabled by BIOS */
7187                         if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7188                             AMD_ACTONCMPHALT_MASK) {
7189                                 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7190                                     AMD_ACTONCMPHALT_SHIFT);
7191                                 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7192                         }
7193                 }
7194                 no_trap();
7195         }
7196 #endif  /* !__xpv */
7197 }
7198 
7199 void
7200 enable_pcid(void)
7201 {
7202         if (x86_use_pcid == -1)
7203                 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7204 
7205         if (x86_use_invpcid == -1) {
7206                 x86_use_invpcid = is_x86_feature(x86_featureset,
7207                     X86FSET_INVPCID);
7208         }
7209 
7210         if (!x86_use_pcid)
7211                 return;
7212 
7213         /*
7214          * Intel says that on setting PCIDE, it immediately starts using the PCID
7215          * bits; better make sure there's nothing there.
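              *
              * (Architecturally, setting CR4.PCIDE while CR3[11:0] is
              * non-zero raises #GP, which is what the assertion below
              * guards against.)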
7216          */
7217         ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7218 
7219         setcr4(getcr4() | CR4_PCIDE);
7220 }
7221 
7222 /*
7223  * Setup necessary registers to enable XSAVE feature on this processor.
7224  * This function needs to be called early enough, so that no xsave/xrstor
7225  * ops will execute on the processor before the MSRs are properly set up.
7226  *
7227  * Current implementation has the following assumption:
7228  * - cpuid_pass1() is done, so that X86 features are known.
7229  * - fpu_probe() is done, so that fp_save_mech is chosen.
7230  */
7231 void
7232 xsave_setup_msr(cpu_t *cpu)
7233 {
7234         ASSERT(fp_save_mech == FP_XSAVE);
7235         ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7236 
7237         /* Enable OSXSAVE in CR4. */
7238         setcr4(getcr4() | CR4_OSXSAVE);
7239         /*
7240          * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7241          * correct value.
7242          */
7243         cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7244         setup_xfem();
7245 }
7246 
7247 /*
7248  * Starting with the Westmere processor the local
7249  * APIC timer will continue running in all C-states,
7250  * including the deepest C-states.
7251  */
7252 int
7253 cpuid_arat_supported(void)
7254 {
7255         struct cpuid_info *cpi;
7256         struct cpuid_regs regs;
7257 
7258         ASSERT(cpuid_checkpass(CPU, 1));
7259         ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7260 
7261         cpi = CPU->cpu_m.mcpu_cpi;
7262 
7263         switch (cpi->cpi_vendor) {
7264         case X86_VENDOR_Intel:
7265                 /*
7266                  * Always-running Local APIC Timer is
7267                  * indicated by CPUID.6.EAX[2].
7268                  */
7269                 if (cpi->cpi_maxeax >= 6) {
7270                         regs.cp_eax = 6;
7271                         (void) cpuid_insn(NULL, &regs);
7272                         return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7273                 } else {
7274                         return (0);
7275                 }
7276         default:
7277                 return (0);
7278         }
7279 }
7280 
7281 /*
7282  * Check support for Intel ENERGY_PERF_BIAS feature
7283  */
7284 int
7285 cpuid_iepb_supported(struct cpu *cp)
7286 {
7287         struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7288         struct cpuid_regs regs;
7289 
7290         ASSERT(cpuid_checkpass(cp, 1));
7291 
7292         if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7293             !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7294                 return (0);
7295         }
7296 
7297         /*
7298          * Intel ENERGY_PERF_BIAS MSR is indicated by
7299          * capability bit CPUID.6.ECX.3
7300          */
7301         if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7302                 return (0);
7303 
7304         regs.cp_eax = 0x6;
7305         (void) cpuid_insn(NULL, &regs);
7306         return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7307 }
7308 
7309 /*
7310  * Check support for TSC deadline timer
7311  *
7312  * TSC deadline timer provides a superior software programming
7313  * model over local APIC timer that eliminates "time drifts".
7314  * Instead of specifying a relative time, software specifies an
7315  * absolute time as the target at which the processor should
7316  * generate a timer event.
7317  */
7318 int
7319 cpuid_deadline_tsc_supported(void)
7320 {
7321         struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7322         struct cpuid_regs regs;
7323 
7324         ASSERT(cpuid_checkpass(CPU, 1));
7325         ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7326 
7327         switch (cpi->cpi_vendor) {
7328         case X86_VENDOR_Intel:
7329                 if (cpi->cpi_maxeax >= 1) {
7330                         regs.cp_eax = 1;
7331                         (void) cpuid_insn(NULL, &regs);
7332                         return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7333                 } else {
7334                         return (0);
7335                 }
7336         default:
7337                 return (0);
7338         }
7339 }
7340 
7341 #if defined(__amd64) && !defined(__xpv)
7342 /*
7343  * Patch in versions of bcopy for high-performance Intel Nehalem processors
7344  * and later...
7345  */
7346 void
7347 patch_memops(uint_t vendor)
7348 {
7349         size_t cnt, i;
7350         caddr_t to, from;
7351 
7352         if ((vendor == X86_VENDOR_Intel) &&
7353             is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7354                 cnt = &bcopy_patch_end - &bcopy_patch_start;
7355                 to = &bcopy_ck_size;
7356                 from = &bcopy_patch_start;
7357                 for (i = 0; i < cnt; i++) {
7358                         *to++ = *from++;
7359                 }
7360         }
7361 }
7362 #endif  /* __amd64 && !__xpv */
7363 
7364 /*
7365  * We're being asked to tell the system how many bits are required to represent
7366  * the various core and strand IDs. While it's tempting to derive this based
7367  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7368  * correct. Instead, this needs to be based on the number of bits that the APIC
7369  * allows for these different configurations. We only update these to a larger
7370  * value if we find one.
7371  */
7372 void
7373 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7374 {
7375         struct cpuid_info *cpi;
7376 
7377         VERIFY(cpuid_checkpass(CPU, 1));
7378         cpi = cpu->cpu_m.mcpu_cpi;
7379 
7380         if (cpi->cpi_ncore_bits > *core_nbits) {
7381                 *core_nbits = cpi->cpi_ncore_bits;
7382         }
7383 
7384         if (cpi->cpi_nthread_bits > *strand_nbits) {
7385                 *strand_nbits = cpi->cpi_nthread_bits;
7386         }
7387 }
7388 
7389 void
7390 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7391 {
7392         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7393         struct cpuid_regs cp;
7394 
7395         /*
7396          * Reread the CPUID portions that we need for various security
7397          * information.
7398          */
7399         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7400                 /*
7401                  * Check if we now have leaf 7 available to us.
7402                  */
7403                 if (cpi->cpi_maxeax < 7) {
7404                         bzero(&cp, sizeof (cp));
7405                         cp.cp_eax = 0;
7406                         cpi->cpi_maxeax = __cpuid_insn(&cp);
7407                         if (cpi->cpi_maxeax < 7)
7408                                 return;
7409                 }
7410 
7411                 bzero(&cp, sizeof (cp));
7412                 cp.cp_eax = 7;
7413                 cp.cp_ecx = 0;
7414                 (void) __cpuid_insn(&cp);
7415                 cpi->cpi_std[7] = cp;
7416         } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7417                 /* No xcpuid support */
7418                 if (cpi->cpi_family < 5 ||
7419                     (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7420                         return;
7421 
7422                 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7423                         bzero(&cp, sizeof (cp));
7424                         cp.cp_eax = CPUID_LEAF_EXT_0;
7425                         cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7426                         if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7427                                 return;
7428                         }
7429                 }
7430 
7431                 bzero(&cp, sizeof (cp));
7432                 cp.cp_eax = CPUID_LEAF_EXT_8;
7433                 (void) __cpuid_insn(&cp);
7434                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7435                 cpi->cpi_extd[8] = cp;
7436         } else {
7437                 /*
7438                  * Nothing to do here. Return an empty set which has already
7439                  * been zeroed for us.
7440                  */
7441                 return;
7442         }
7443         cpuid_scan_security(cpu, fset);
7444 }
7445 
7446 /* ARGSUSED */
7447 static int
7448 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7449 {
7450         uchar_t *fset;
7451         boolean_t first_pass = (boolean_t)arg1;
7452 
7453         fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7454         if (first_pass && CPU->cpu_id != 0)
7455                 return (0);
7456         if (!first_pass && CPU->cpu_id == 0)
7457                 return (0);
7458         cpuid_pass_ucode(CPU, fset);
7459 
7460         return (0);
7461 }
7462 
7463 /*
7464  * After a microcode update where the version has changed, we need to
7465  * rescan CPUID. To do this we check every CPU to make sure that they have the
7466  * same microcode. Then we perform a cross call to all such CPUs. It's the
7467  * caller's job to make sure that no one else can end up doing an update while
7468  * this is going on.
7469  *
7470  * We assume that the system is microcode capable if we're called.
7471  */
7472 void
7473 cpuid_post_ucodeadm(void)
7474 {
7475         uint32_t rev;
7476         int i;
7477         struct cpu *cpu;
7478         cpuset_t cpuset;
7479         void *argdata;
7480         uchar_t *f0;
7481 
7482         argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
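             /*
              * argdata is treated as an array of NCPU feature sets; each
              * cross-called CPU fills in the slot indexed by its cpu_id
              * (see cpuid_post_ucodeadm_xc() above).
              */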
7483 
7484         mutex_enter(&cpu_lock);
7485         cpu = cpu_get(0);
7486         rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7487         CPUSET_ONLY(cpuset, 0);
7488         for (i = 1; i < max_ncpus; i++) {
7489                 if ((cpu = cpu_get(i)) == NULL)
7490                         continue;
7491 
7492                 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7493                         panic("post microcode update CPU %d has differing "
7494                             "microcode revision (%u) from CPU 0 (%u)",
7495                             i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7496                 }
7497                 CPUSET_ADD(cpuset, i);
7498         }
7499 
7500         /*
7501          * We do the cross calls in two passes. The first pass is only for the
7502          * boot CPU. The second pass is for all of the other CPUs. This allows
7503          * the boot CPU to go through and change behavior related to patching or
7504          * whether or not Enhanced IBRS needs to be enabled, and then lets all
7505          * other CPUs follow suit.
7506          */
7507         kpreempt_disable();
7508         xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7509             cpuid_post_ucodeadm_xc);
7510         xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7511             cpuid_post_ucodeadm_xc);
7512         kpreempt_enable();
7513 
7514         /*
7515          * OK, now look at each CPU and see if their feature sets are equal.
7516          */
7517         f0 = argdata;
7518         for (i = 1; i < max_ncpus; i++) {
7519                 uchar_t *fset;
7520                 if (!CPU_IN_SET(cpuset, i))
7521                         continue;
7522 
7523                 fset = (uchar_t *)((uintptr_t)argdata +
7524                     sizeof (x86_featureset) * i);
7525 
7526                 if (!compare_x86_featureset(f0, fset)) {
7527                         panic("Post microcode update CPU %d has "
7528                             "differing security feature (%p) set from CPU 0 "
7529                             "(%p), not appending to feature set", i,
7530                             (void *)fset, (void *)f0);
7531                 }
7532         }
7533 
7534         mutex_exit(&cpu_lock);
7535 
7536         for (i = 0; i < NUM_X86_FEATURES; i++) {
7537                 if (is_x86_feature(f0, i)) {
7538                         cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7539                             x86_feature_names[i]);
7540                         add_x86_feature(x86_featureset, i);
7541                 }
7542         }
7543         kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7544 }