/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Portions Copyright 2009 Advanced Micro Devices, Inc.
 */
/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * CPU Identification logic
 *
 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
 * with the identification of CPUs, their features, and their topologies. More
 * specifically, this file helps drive the following:
 *
 * 1. Enumeration of features of the processor which are used by the kernel to
 *    determine what features to enable or disable. These may be instruction set
 *    enhancements or features that we use.
 *
 * 2. Enumeration of instruction set architecture (ISA) additions that userland
 *    will be told about through the auxiliary vector.
 *
 * 3. Understanding the physical topology of the CPU such as the number of
 *    caches, how many cores it has, whether or not it supports simultaneous
 *    multi-threading (SMT), etc.
 *
 * ------------------------
 * CPUID History and Basics
 * ------------------------
 *
 * The cpuid instruction was added by Intel roughly around the time that the
 * original Pentium was introduced. The purpose of cpuid was to provide, in a
 * programmatic fashion, information about the CPU that previously had to be
 * guessed at. For example, an important part of cpuid is that we can know what
 * extensions to the ISA exist. If you use an invalid opcode you would get a
 * #UD, so this method allows a program (whether a user program or the kernel)
 * to determine what exists without crashing or getting a SIGILL. Of course,
 * this was also during the era of the clones and the AMD Am5x86. The vendor
 * name shows up first in cpuid for a reason.
 *
 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
 * its own meaning. The different leaves are broken down into different regions:
 *
 *      [ 0, 7fffffff ]                 This region is called the 'basic'
 *                                      region. This region is generally defined
 *                                      by Intel, though some of the original
 *                                      portions have different meanings based
 *                                      on the manufacturer. These days, Intel
 *                                      adds most new features to this region.
 *                                      AMD adds non-Intel compatible
 *                                      information in the third, extended
 *                                      region. Intel uses this for everything
 *                                      including ISA extensions, CPU
 *                                      features, cache information, topology,
 *                                      and more.
 *
 *                                      There is a hole carved out of this
 *                                      region which is reserved for
 *                                      hypervisors.
 *
 *      [ 40000000, 4fffffff ]          This region, which is found in the
 *                                      middle of the previous region, is
 *                                      explicitly promised to never be used by
 *                                      CPUs. Instead, it is used by hypervisors
 *                                      to communicate information about
 *                                      themselves to the operating system. The
 *                                      values and details are unique for each
 *                                      hypervisor.
 *
 *      [ 80000000, ffffffff ]          This region is called the 'extended'
 *                                      region. Some of the low leaves mirror
 *                                      parts of the basic leaves. This region
 *                                      has generally been used by AMD for
 *                                      various extensions. For example, AMD-
 *                                      specific information about caches,
 *                                      features, and topology are found in this
 *                                      region.
 *
 * To request a leaf, you place its number into %eax, zero %ebx, %ecx, and
 * %edx, and then issue the cpuid instruction. At the first leaf in each of
 * the ranges, one of the primary things returned is the maximum valid leaf in
 * that range. This allows for discovery of what range of CPUID is valid.
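 *
 * As a concrete illustration, a minimal sketch of issuing cpuid from C and
 * discovering the maximum valid leaves might look as follows. This is not the
 * code this file uses; the helper name cpuid_raw is purely illustrative:
 *
 *      static void
 *      cpuid_raw(uint32_t leaf, uint32_t subleaf, uint32_t regs[4])
 *      {
 *              __asm__ __volatile__("cpuid"
 *                  : "=a" (regs[0]), "=b" (regs[1]),
 *                    "=c" (regs[2]), "=d" (regs[3])
 *                  : "a" (leaf), "c" (subleaf));
 *      }
 *
 *      uint32_t regs[4];
 *      cpuid_raw(0x0, 0, regs);                // %eax: maximum basic leaf
 *      cpuid_raw(0x80000000u, 0, regs);        // %eax: maximum extended leaf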
 *
 * The CPUs have potentially surprising behavior when using an invalid leaf or
 * unimplemented leaf. If the requested leaf is within the valid basic or
 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
 * set to zero. However, if you specify a leaf that is outside of a valid range,
 * then instead it will be filled with the last valid _basic_ leaf. For example,
 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
 * an invalid extended leaf will return the information for leaf 3.
 *
 * Some leaves are broken down into sub-leaves. This means that the value
 * depends on both the leaf asked for in %eax and a secondary register. For
 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
 * additional information. Or when getting topology information in leaf 0xb, the
 * initial value in %ecx changes which level of the topology you are getting
 * information about.
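 *
 * Continuing the hypothetical cpuid_raw() sketch from above, reading a
 * sub-leaf simply means loading %ecx before issuing the instruction:
 *
 *      uint32_t regs[4];
 *      cpuid_raw(0x7, 0, regs);        // leaf 7, sub-leaf 0: feature flags
 *
 *      // Leaf 0xb: each sub-leaf describes one level of the topology; a
 *      // level type of zero in %ecx[15:8] means there are no more levels.
 *      for (uint32_t lvl = 0; ; lvl++) {
 *              cpuid_raw(0xb, lvl, regs);
 *              if (((regs[2] >> 8) & 0xff) == 0)
 *                      break;
 *      }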
 *
 * cpuid values are always kept to 32 bits regardless of whether or not the
 * program is in 64-bit mode. When executing in 64-bit mode, the upper
 * 32 bits of the register are always set to zero so that the values are the
 * same regardless of execution mode.
 *
 * ----------------------
 * Identifying Processors
 * ----------------------
 *
 * We can identify a processor in two steps. The first step looks at cpuid leaf
 * 0. Leaf 0 contains the processor's vendor information. This is done by
 * placing a 12 character string across %ebx, %edx, and %ecx (in that order).
 * On AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
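 *
 * For example, assembling the vendor string with the hypothetical cpuid_raw()
 * helper from earlier might look like this; note the %ebx, %edx, %ecx order:
 *
 *      char vendor[13];
 *      uint32_t regs[4];
 *      cpuid_raw(0, 0, regs);
 *      bcopy(&regs[1], &vendor[0], 4);         // %ebx: "Genu" or "Auth"
 *      bcopy(&regs[3], &vendor[4], 4);         // %edx: "ineI" or "enti"
 *      bcopy(&regs[2], &vendor[8], 4);         // %ecx: "ntel" or "cAMD"
 *      vendor[12] = '\0';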
 *
 * From there, a processor is identified by a combination of three different
 * values:
 *
 *  1. Family
 *  2. Model
 *  3. Stepping
 *
 * Each vendor uses the family and model to uniquely identify a processor. The
 * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since the
 * Pentium Pro/Pentium II era, often called the P6. The model is used to
 * identify the exact processor. Different models are often used for the client
 * (consumer) and server parts. Even though each processor often has major
 * architectural differences, they still are considered the same family by
 * Intel.
 *
 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
 * family, the model number is used to help identify specific processors.
 *
 * The stepping is used to refer to a revision of a specific microprocessor. The
 * term comes from equipment used to produce masks that are used to create
 * integrated circuits.
 *
 * The information is present in leaf 1, %eax. In technical documentation you
 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. The extended family and
 * extended model use previously reserved bits: when the base family is 0xf,
 * the extended family is added to it, and when the base family is 0xf (or also
 * 0x6 on Intel), the extended model bits are combined with the base model as
 * its upper bits.
 *
 * When we process this information, we store the full family, model, and
 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
 * cpi_step, respectively. Whenever you are performing comparisons with the
 * family, model, and stepping, you should use these members and not the raw
 * values from cpuid. If you must use the raw values from cpuid directly, you
 * must make sure that you combine the extended model and family with the base
 * model and family.
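 *
 * As an illustration, the conventional computation (a sketch, not necessarily
 * the exact code in this file) of the full values from leaf 1's %eax is:
 *
 *      uint32_t eax = regs[0];         // leaf 1 %eax, e.g. cpuid_raw(1, ...)
 *      uint_t stepping = eax & 0xf;
 *      uint_t family = (eax >> 8) & 0xf;
 *      uint_t model = (eax >> 4) & 0xf;
 *
 *      if (family == 0xf)
 *              family += (eax >> 20) & 0xff;   // add the extended family
 *      // The extended model forms the upper bits of the model. Intel
 *      // consults it at base family 0x6 as well; AMD only at 0xf.
 *      if (family == 0x6 || family >= 0xf)
 *              model += ((eax >> 16) & 0xf) << 4;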
 *
 * In general, we do not use information about the family, model, and stepping
 * to determine whether or not a feature is present; that is generally driven by
 * specific leaves. However, when something we care about on the processor is
 * not considered 'architectural', meaning that it is specific to a set of
 * processors and not promised in the architecture model to be consistent from
 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata
 * in the processor, when we are dealing with processor-specific features such
 * as CPU performance counters, or when we want to provide additional
 * information for things such as fault management.
 *
 * While processors also do have a brand string, which is the name that people
 * are familiar with when buying the processor, it is not meant for
 * programmatic consumption. That is what the family, model, and stepping are
 * for.
 *
 * ------------
 * CPUID Passes
 * ------------
 *
 * As part of performing feature detection, we break this into several different
 * passes. The passes are as follows:
 *
 *      Pass 0          This is a primordial pass done in locore.s to deal with
 *                      Cyrix CPUs that don't support cpuid. The reality is that
 *                      we likely don't run on them any more, but there is still
 *                      logic for handling them.
 *
 *      Pass 1          This is the primary pass and is responsible for doing a
 *                      large number of different things:
 *
 *                      1. Determining which vendor manufactured the CPU and
 *                      the family, model, and stepping information.
 *
 *                      2. Gathering a large number of feature flags to
 *                      determine which features the CPU supports and which
 *                      indicate things that we need to do other work in the OS
 *                      to enable. Features detected this way are added to the
 *                      x86_featureset, which can be queried to determine what
 *                      we should do. This includes processing all of the basic
 *                      and extended CPU features that we care about.
 *
 *                      3. Determining the CPU's topology. This includes
 *                      information about how many cores and threads are present
 *                      in the package. It also is responsible for figuring out
 *                      which logical CPUs are potentially part of the same core
 *                      and what other resources they might share. For more
 *                      information see the 'Topology' section.
 *
 *                      4. Determining the set of CPU security-specific features
 *                      that we need to worry about and determining the
 *                      appropriate set of workarounds.
 *
 *                      Pass 1 on the boot CPU occurs before KMDB is started.
 *
 *      Pass 2          The second pass is done after startup(). Here, we check
 *                      other miscellaneous features. Most of this is gathering
 *                      additional basic and extended features that we'll use in
 *                      later passes or for debugging support.
 *
 *      Pass 3          The third pass occurs after the kernel memory allocator
 *                      has been fully initialized. This gathers information
 *                      where we might need dynamic memory available for our
 *                      uses. This includes several varying width leaves that
 *                      have cache information and the processor's brand string.
 *
 *      Pass 4          The fourth and final normal pass is performed after the
 *                      kernel has brought most everything online. This is
 *                      invoked from post_startup(). In this pass, we go through
 *                      the set of features that we have enabled and turn that
 *                      into the hardware auxiliary vector features that
 *                      userland receives. This is used by userland, primarily
 *                      by the run-time link-editor (RTLD), though userland
 *                      software could also refer to it directly.
 *
 *      Microcode       After a microcode update, we do a selective rescan of
 *                      the cpuid leaves to determine what features have
 *                      changed. Microcode updates can provide more details
 *                      about security related features to deal with issues like
 *                      Spectre and L1TF. On occasion, vendors have violated
 *                      their contract and removed bits. However, we don't try
 *                      to detect that because that puts us in a situation that
 *                      we really can't deal with. As such, the only things we
 *                      rescan today are security related features. See
 *                      cpuid_pass_ucode().
 *
 * All of the passes (except pass 0) are run on all CPUs. However, for the most
 * part we only care about what the boot CPU says about this information and use
 * the other CPUs as a rough guide to sanity check that we have the same feature
 * set.
 *
 * We do not support running multiple logical CPUs with disjoint, let alone
 * different, feature sets.
 *
 * ------------------
 * Processor Topology
 * ------------------
 *
 * One of the important things that we need to do is to understand the topology
 * of the underlying processor. When we say topology in this case, we're trying
 * to understand the relationship between the logical CPUs that the operating
 * system sees and the underlying physical layout. Different logical CPUs may
 * share different resources which can have important consequences for the
 * performance of the system. For example, they may share caches, execution
 * units, and more.
 *
 * The topology of the processor changes from generation to generation and
 * vendor to vendor.  Along with that, different vendors use different
 * terminology, and the operating system itself uses occasionally overlapping
 * terminology. It's important to understand what this topology looks like so
 * one can understand the different things that we try to calculate and
 * determine.
 *
 * To get started, let's talk about a little bit of terminology that we've used
 * so far, is used throughout this file, and is fairly generic across multiple
 * vendors:
 *
 * CPU
 *      A central processing unit (CPU) refers to a logical and/or virtual
 *      entity that the operating system can execute instructions on. The
 *      underlying resources for this CPU may be shared between multiple
 *      entities; however, to the operating system it is a discrete unit.
 *
 * PROCESSOR and PACKAGE
 *
 *      Generally, when we use the term 'processor' on its own, we are referring
 *      to the physical entity that one buys and plugs into a board. However,
 *      because processor has been overloaded and one might see it used to mean
 *      multiple different levels, we will instead use the term 'package' for
 *      the rest of this file. The term package comes from the electrical
 *      engineering side and refers to the physical entity that encloses the
 *      electronics inside. Strictly speaking, the package can contain more than
 *      just the CPU, for example, on many processors it may also have what's
 *      called an 'integrated graphics processing unit (GPU)'. Because the
 *      package can encapsulate multiple units, it is the largest physical unit
 *      that we refer to.
 *
 * SOCKET
 *
 *      A socket refers to a unit on a system board (generally the motherboard)
 *      that can receive a package. A single package, or processor, is plugged
 *      into a single socket. A system may have multiple sockets. Oftentimes,
 *      the term socket is used interchangeably with package and refers to the
 *      electrical component that is plugged in, and not the receptacle itself.
 *
 * CORE
 *
 *      A core refers to the physical instantiation of a CPU, generally, with a
 *      full set of hardware resources available to it. A package may contain
 *      multiple cores inside of it or it may just have a single one. A
 *      processor with more than one core is often referred to as 'multi-core'.
 *      In illumos, we will use the feature X86FSET_CMP to refer to a system
 *      that has 'multi-core' processors.
 *
 *      A core may expose a single logical CPU to the operating system, or it
 *      may expose multiple CPUs, which we call threads, defined below.
 *
 *      Some resources may still be shared by cores in the same package. For
 *      example, many processors will share the level 3 cache between cores.
 *      Some AMD generations share hardware resources between cores. For more
 *      information on that see the section 'AMD Topology'.
 *
 * THREAD and STRAND
 *
 *      In this file, a thread generally refers to hardware resources and not
 *      the operating system's logical abstraction. A thread is always exposed
 *      as an independent logical CPU to the operating system. A thread belongs
 *      to a specific core. A core may have more than one thread. When that is
 *      the case, the threads that are part of the same core are often referred
 *      to as 'siblings'.
 *
 *      When multiple threads exist, this is generally referred to as
 *      simultaneous multi-threading (SMT). When Intel introduced this in their
 *      processors they called it hyper-threading (HT). When multiple threads
 *      are active in a core, they split the resources of the core. For example,
 *      two threads may share the same set of hardware execution units.
 *
 *      The operating system often uses the term 'strand' to refer to a thread.
 *      This helps disambiguate it from the software concept.
 *
 * CHIP
 *
 *      Unfortunately, the term 'chip' is dramatically overloaded. At its most
 *      base meaning, it is used to refer to a single integrated circuit, which
 *      may or may not be the only thing in the package. In illumos, when you
 *      see the term 'chip' it is almost always referring to the same thing as
 *      the 'package'. However, many vendors may use chip to refer to one of
 *      many integrated circuits that have been placed in the package. As an
 *      example, see the subsequent definition.
 *
 *      To try and keep things consistent, we will only use chip when referring
 *      to the entire integrated circuit package, with the exception of the
 *      definition of multi-chip module (because it is in the name) and use the
 *      term 'die' when we want the more general, potential sub-component
 *      definition.
 *
 * DIE
 *
 *      A die refers to an integrated circuit. Inside of the package there may
 *      be a single die or multiple dies. This is sometimes called a 'chip' in
 *      vendor's parlance, but in this file, we use the term die to refer to a
 *      subcomponent.
 *
 * MULTI-CHIP MODULE
 *
 *      A multi-chip module (MCM) refers to putting multiple distinct chips that
 *      are connected together in the same package. When a multi-chip design is
 *      used, generally each chip is manufactured independently and then joined
 *      together in the package. For example, on AMD's Zen microarchitecture
 *      (family 0x17), the package contains several dies (the second meaning of
 *      chip from above) that are connected together.
 *
 * CACHE
 *
 *      A cache is a part of the processor that maintains copies of recently
 *      accessed memory. Caches are split into levels and then into types.
 *      Commonly there are one to three levels, called level one, two, and
 *      three. The lower the level, the smaller it is, the closer it is to the
 *      execution units of the CPU, and the faster it is to access. The layout
 *      and design of the cache come in many different flavors, consult other
 *      resources for a discussion of those.
 *
 *      Caches are generally split into two types, the instruction and data
 *      cache. The caches contain what their names suggest, the instruction
 *      cache has executable program text, while the data cache has all other
 *      memory that the processor accesses. As of this writing, data is kept
 *      coherent between all of the caches on x86, so if one modifies program
 *      text before it is executed, that will be in the data cache, and the
 *      instruction cache will be synchronized with that change when the
 *      processor actually executes those instructions. This coherency also
 *      covers the fact that data could show up in multiple caches.
 *
 *      Generally, the lowest level caches are specific to a core. However, the
 *      last level cache is shared between some number of cores. The number of
 *      CPUs sharing this last level cache is important. This has implications
 *      for the choices that the scheduler makes, as accessing memory that might
 *      be in a remote cache after thread migration can be quite expensive.
 *
 *      Sometimes, the word cache is abbreviated with a '$', because in US
 *      English the word cache is pronounced the same as cash. So L1D$ refers to
 *      the L1 data cache, and L2$ would be the L2 cache. This will not be used
 *      in the rest of this theory statement for clarity.
 *
 * MEMORY CONTROLLER
 *
 *      The memory controller is a component that provides access to DRAM. Each
 *      memory controller can access a set number of DRAM channels. Each channel
 *      can have a number of DIMMs (sticks of memory) associated with it. A
 *      given package may have more than one memory controller. The association
 *      of the memory controller to a group of cores is important as it is
 *      cheaper to access memory on the controller that you are associated with.
 *
 * NUMA
 *
 *      NUMA, or non-uniform memory access, describes a way that systems are
 *      built. On x86, any processor core can address all of the memory in the
 *      system. However, when using multiple sockets or possibly within a
 *      multi-chip module, some of that memory is physically closer and some of
 *      it is farther away. Memory that is farther away is more expensive to
 *      access. Consider the following image of multiple sockets with memory:
 *
 *      +--------+                                                +--------+
 *      | DIMM A |         +----------+      +----------+         | DIMM D |
 *      +--------+-+       |          |      |          |       +-+------+-+
 *        | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
 *        +--------+-+     |          |      |          |     +-+------+-+
 *          | DIMM C |     +----------+      +----------+     | DIMM F |
 *          +--------+                                        +--------+
 *
 *      In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
 *      closer to DIMMs D-F. This means that it is cheaper for socket 0 to
 *      access DIMMs A-C and more expensive to access D-F as it has to go
 *      through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
 *      D-F are cheaper than A-C. While the socket form is the most common, when
 *      using multi-chip modules, this can also sometimes occur. For another
 *      example of this that's more involved, see the AMD topology section.
 *
 *
 * Intel Topology
 * --------------
 *
 * Most Intel processors since Nehalem (as of this writing the current
 * generation is Skylake / Cannon Lake) follow a fairly similar pattern. The
 * CPU portion of the package is a single monolithic die. MCMs currently aren't
 * used. Most parts have three levels of caches, with the L3 cache being shared
 * between all of the cores on the package. The L1/L2 cache is generally
 * specific to an individual core. The following image shows at a simplified
 * level what this looks like. The memory controller is commonly part of
 * something called the 'Uncore', which used to be separate physical chips that
 * were not a part of the package, but are now part of the same chip.
 *
 *  +-----------------------------------------------------------------------+
 *  | Package                                                               |
 *  |  +-------------------+  +-------------------+  +-------------------+  |
 *  |  | Core              |  | Core              |  | Core              |  |
 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
 *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
 *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
 *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 *  |  +-------------------+  +-------------------+  +-------------------+  |
 *  | +-------------------------------------------------------------------+ |
 *  | |                         Shared L3 Cache                           | |
 *  | +-------------------------------------------------------------------+ |
 *  | +-------------------------------------------------------------------+ |
 *  | |                        Memory Controller                          | |
 *  | +-------------------------------------------------------------------+ |
 *  +-----------------------------------------------------------------------+
 *
 * A side effect of this current architecture is that what we care about from a
 * scheduling and topology perspective is simplified. In general we care about
 * understanding which logical CPUs are part of the same core and socket.
 *
 * To determine the relationship between threads and cores, Intel initially used
 * the identifier in the advanced programmable interrupt controller (APIC). They
 * also added cpuid leaf 4 to give additional information about the number of
 * threads and CPUs in the processor. With the addition of x2apic (which
 * increased the number of addressable logical CPUs from 8 bits to 32 bits), an
 * additional cpuid topology leaf 0xB was added.
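 *
 * To illustrate, here is a sketch of deriving core and package identifiers
 * from leaf 0xB using the hypothetical cpuid_raw() helper from earlier; the
 * real code in this file is more involved:
 *
 *      uint32_t regs[4], smt_shift = 0, core_shift = 0, type;
 *      for (uint32_t lvl = 0; ; lvl++) {
 *              cpuid_raw(0xb, lvl, regs);
 *              type = (regs[2] >> 8) & 0xff;   // level type in %ecx[15:8]
 *              if (type == 0)
 *                      break;
 *              if (type == 1)                  // SMT level
 *                      smt_shift = regs[0] & 0x1f;
 *              else if (type == 2)             // core level
 *                      core_shift = regs[0] & 0x1f;
 *      }
 *      uint32_t x2apic_id = regs[3];           // %edx on any sub-leaf
 *      uint32_t core_id = x2apic_id >> smt_shift;
 *      uint32_t pkg_id = x2apic_id >> core_shift;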
 *
 * AMD Topology
 * ------------
 *
 * When discussing AMD topology, we want to break this into three distinct
 * generations of topology. There's the basic topology that has been used in
 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
 * with family 0x15 (Bulldozer), and there's the topology that was introduced
 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
 * talking about.
 *
 * Until the introduction of family 0x17 (Zen), AMD did not implement something
 * that they considered SMT. Whether or not the AMD processors have SMT
 * influences many things including scheduling and reliability, availability,
 * and serviceability (RAS) features.
 *
 * NODE
 *
 *      AMD uses the term node to refer to a die that contains a number of cores
 *      and I/O resources. Depending on the processor family and model, more
 *      than one node can be present in the package. When there is more than one
 *      node this indicates a multi-chip module. Usually each node has its own
 *      access to memory and I/O devices. This is important and generally
 *      different from the corresponding Intel Nehalem-Skylake+ processors. As a
 *      result, we track this relationship in the operating system.
 *
 *      In processors with an L3 cache, the L3 cache is generally shared across
 *      the entire node, though the way this is carved up varies from generation
 *      to generation.
 *
 * BULLDOZER
 *
 *      Starting with the Bulldozer family (0x15) and continuing until the
 *      introduction of the Zen microarchitecture, AMD introduced the idea of a
 *      compute unit. In a compute unit, two traditional cores share a number of
 *      hardware resources. Critically, they share the FPU, L1 instruction
 *      cache, and the L2 cache. Several compute units were then combined inside
 *      of a single node.  Because the integer execution units, L1 data cache,
 *      and some other resources were not shared between the cores, AMD never
 *      considered this to be SMT.
 *
 * ZEN
 *
 *      The Zen family (0x17) uses a multi-chip module (MCM) design; the module
 *      is called Zeppelin. These modules are similar to the idea of nodes used
 *      previously. Each of these nodes has two DRAM channels which all of the
 *      cores in the node can access uniformly. These nodes are linked together
 *      in the package, creating a NUMA environment.
 *
 *      The Zeppelin die itself contains two different 'core complexes'. Each
 *      core complex consists of four cores which each have two threads, for a
 *      total of 8 logical CPUs per complex. Unlike other generations,
 *      where all the logical CPUs in a given node share the L3 cache, here each
 *      core complex has its own shared L3 cache.
 *
 *      A further thing that we need to consider is that in some configurations,
 *      particularly with the Threadripper line of processors, not every die
 *      actually has its memory controllers wired up to actual memory channels.
 *      This means that some cores have memory attached to them and others
 *      don't.
 *
 *      To put Zen in perspective, consider the following images:
 *
 *      +--------------------------------------------------------+
 *      | Core Complex                                           |
 *      | +-------------------+    +-------------------+  +---+  |
 *      | | Core       +----+ |    | Core       +----+ |  |   |  |
 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
 *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 *      | +-------------------+    +-------------------+  | C |  |
 *      | +-------------------+    +-------------------+  | a |  |
 *      | | Core       +----+ |    | Core       +----+ |  | c |  |
 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
 *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 *      | +-------------------+    +-------------------+  +---+  |
 *      |                                                        |
 *      +--------------------------------------------------------+
 *
 *  This first image represents a single Zen core complex that consists of four
 *  cores.
 *
 *
 *      +--------------------------------------------------------+
 *      | Zeppelin Die                                           |
 *      |  +--------------------------------------------------+  |
 *      |  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
 *      |  +--------------------------------------------------+  |
 *      |                           HH                           |
 *      |          +-----------+    HH    +-----------+          |
 *      |          |           |    HH    |           |          |
 *      |          |    Core   |==========|    Core   |          |
 *      |          |  Complex  |==========|  Complex  |          |
 *      |          |           |    HH    |           |          |
 *      |          +-----------+    HH    +-----------+          |
 *      |                           HH                           |
 *      |  +--------------------------------------------------+  |
 *      |  |                Memory Controller                 |  |
 *      |  +--------------------------------------------------+  |
 *      |                                                        |
 *      +--------------------------------------------------------+
 *
 *  This image represents a single Zeppelin Die. Note how both cores are
 *  connected to the same memory controller and I/O units. While each core
 *  complex has its own L3 cache as seen in the first image, they both have
 *  uniform access to memory.
 *
 *
 *                      PP                     PP
 *                      PP                     PP
 *           +----------PP---------------------PP---------+
 *           |          PP                     PP         |
 *           |    +-----------+          +-----------+    |
 *           |    |           |          |           |    |
 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 *           |    |           |          |           |    |
 *           |    +-----------+ooo    ...+-----------+    |
 *           |          HH      ooo  ...       HH         |
 *           |          HH        oo..         HH         |
 *           |          HH        ..oo         HH         |
 *           |          HH      ...  ooo       HH         |
 *           |    +-----------+...    ooo+-----------+    |
 *           |    |           |          |           |    |
 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 *           |    |           |          |           |    |
 *           |    +-----------+          +-----------+    |
 *           |          PP                     PP         |
 *           +----------PP---------------------PP---------+
 *                      PP                     PP
 *                      PP                     PP
 *
 *  This image represents a single Zen package. In this example, it has four
 *  Zeppelin dies, though some configurations only have a single one. In this
 *  example, each die is directly connected to the next. Also, each die is
 *  represented as being connected to memory by the 'M' character and connected
 *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
 *  die is made up of two core complexes, we have multiple different NUMA
 *  domains that we care about for these systems.
 *
 * CPUID LEAVES
 *
 * There are a few different CPUID leaves that we can use to try and understand
 * the actual state of the world. As part of the introduction of family 0xf, AMD
 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
 * processors that are in the system. Because families before Zen didn't have
 * SMT, this was always the number of cores that were in the system. However, it
 * should always be thought of as the number of logical threads to be consistent
 * between generations. In addition we also get the size of the APIC ID that is
 * used to represent the number of logical processors. This is important for
 * deriving topology information.
 *
 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
 * bit between Bulldozer and later families, but it is quite useful in
 * determining the topology information. Because this information has changed
 * across family generations, it's worth calling out what these mean
 * explicitly. The registers have the following meanings:
 *
 *      %eax    The APIC ID. The entire register is defined to have a 32-bit
 *              APIC ID, even though on systems without x2apic support, it will
 *              be limited to 8 bits.
 *
 *      %ebx    On Bulldozer-era systems this contains information about the
 *              number of cores that are in a compute unit (cores that share
 *              resources). It also contains a per-package compute unit ID that
 *              identifies which compute unit the logical CPU is a part of.
 *
 *              On Zen-era systems this instead contains the number of threads
 *              per core and the ID of the core that the logical CPU is a part
 *              of. Note, this ID is unique only to the package, it is not
 *              globally unique across the entire system.
 *
 *      %ecx    This contains the number of nodes that exist in the package. It
 *              also contains an ID that identifies which node the logical CPU
 *              is a part of.
 *
 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
 * cache layout to determine which logical CPUs are sharing which caches.
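 *
 * As a rough sketch, decoding leaf 0x8000001E on a Zen-era system might look
 * like the following (the bit positions here are illustrative; consult AMD's
 * documentation for the authoritative layout on a given family):
 *
 *      uint32_t regs[4];
 *      cpuid_raw(0x8000001e, 0, regs);
 *      uint32_t apic_id = regs[0];                     // %eax
 *      uint_t core_id = regs[1] & 0xff;                // %ebx[7:0]
 *      uint_t threads_per_core = ((regs[1] >> 8) & 0xff) + 1;
 *      uint_t node_id = regs[2] & 0xff;                // %ecx[7:0]
 *      uint_t nodes_per_pkg = ((regs[2] >> 8) & 0x7) + 1;
 *
 * On Bulldozer-era systems, the %ebx fields instead describe the compute unit
 * ID and the number of cores per compute unit minus one.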
 *
 * illumos Topology
 * ----------------
 *
 * Based on the above we synthesize the information into several different
 * variables that we store in the 'struct cpuid_info'. We'll go into the details
 * of what each member is supposed to represent and their uniqueness. In
 * general, there are two levels of uniqueness that we care about. We care about
 * an ID that is globally unique. That means that it will be unique across all
 * entities in the system. For example, the default logical CPU ID is globally
 * unique. On the other hand, there is some information that we only care about
 * being unique within the context of a single package / socket. Here are the
 * variables that we keep track of and their meaning.
 *
 * Several of the values that represent an identifier, with the exception of
 * cpi_apicid, are allowed to be synthetic.
 *
 *
 * cpi_apicid
 *
 *      This is the value of the CPU's APIC id. This should be the full 32-bit
 *      ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
 *      APIC ID. This value is globally unique between all logical CPUs across
 *      all packages. This is usually required by the APIC.
 *
 * cpi_chipid
 *
 *      This value indicates the ID of the package that the logical CPU is a
 *      part of. This value is allowed to be synthetic. It is usually derived by
 *      taking the CPU's APIC ID and determining how many bits are used to
 *      represent CPU cores in the package. All logical CPUs that are part of
 *      the same package must have the same value.
 *
 * cpi_coreid
 *
 *      This represents the ID of a CPU core. Two logical CPUs should only have
 *      the same cpi_coreid value if they are part of the same core. These
 *      values may be synthetic. On systems that support SMT, this value is
 *      usually derived from the APIC ID, otherwise it is often synthetic and
 *      just set to the value of the cpu_id in the cpu_t.
 *
 * cpi_pkgcoreid
 *
 *      This is similar to the cpi_coreid in that logical CPUs that are part of
 *      the same core should have the same ID. The main difference is that these
 *      values are only required to be unique to a given socket.
 *
 * cpi_clogid
 *
 *      This represents the logical ID of a logical CPU. This value should be
 *      unique within a given socket for each logical CPU. This is allowed to be
 *      synthetic, though it is usually based off of the CPU's apic ID. The
 *      broader system expects that logical CPUs that are part of the same
 *      core have contiguous numbers. For example, if there were two threads per
 *      core, then sibling CPUs' IDs divided by two should be the same, and
 *      modulo two, the first should be zero and the second should be one. So
 *      IDs 4 and 5 indicate two logical CPUs that are part of the same core,
 *      but IDs 5 and 6 represent two logical CPUs that are part of different
 *      cores (see the sketch below).
 *
 *      While it is common for the cpi_coreid and the cpi_clogid to be derived
 *      from the same source, strictly speaking, they don't have to be and the
 *      two values should be considered logically independent. One should not
 *      try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
 *      some kind of relationship. While this is tempting, we've seen cases on
 *      AMD family 0xf where the system's cpu id is not related to its APIC ID.
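 *
 *      As a tiny sketch of the contiguity expectation described above, with
 *      two threads per core:
 *
 *              // clogid 4 -> core index 2, thread 0
 *              // clogid 5 -> core index 2, thread 1 (sibling of 4)
 *              // clogid 6 -> core index 3, thread 0 (different core)
 *              uint_t core_idx = clogid >> 1;
 *              uint_t thread_idx = clogid & 1;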
 *
 * cpi_ncpu_per_chip
 *
 *      This value indicates the total number of logical CPUs that exist in the
 *      physical package. Critically, this is not the number of logical CPUs
 *      that exist for just the single core.
 *
 *      This value should be the same for all logical CPUs in the same package.
 *
 * cpi_ncore_per_chip
 *
 *      This value indicates the total number of physical CPU cores that exist
 *      in the package. The system compares this value with cpi_ncpu_per_chip to
 *      determine if simultaneous multi-threading (SMT) is enabled. When
 *      cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
 *      the X86FSET_HTT feature is not set. If this value is greater than one,
 *      then we consider the processor to have the feature X86FSET_CMP, to
 *      indicate that there is support for more than one core (see the sketch
 *      below).
 *
 *      This value should be the same for all logical CPUs in the same package.
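 *
 *      A simplified sketch of the comparisons described above (the real logic
 *      in this file handles more cases than this):
 *
 *              if (cpi_ncpu_per_chip > cpi_ncore_per_chip)
 *                      add_x86_feature(featureset, X86FSET_HTT);
 *              if (cpi_ncore_per_chip > 1)
 *                      add_x86_feature(featureset, X86FSET_CMP);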
 *
 * cpi_procnodes_per_pkg
 *
 *      This value indicates the number of 'nodes' that exist in the package.
 *      When a processor is actually a multi-chip module, this represents the
 *      number of such modules that exist in the package. Currently, on Intel
 *      based systems this member is always set to 1.
 *
 *      This value should be the same for all logical CPUs in the same package.
 *
 * cpi_procnodeid
 *
 *      This value indicates the ID of the node that the logical CPU is a part
 *      of. All logical CPUs that are in the same node must have the same value
 *      here. This value must be unique across all of the packages in the
 *      system.  On Intel based systems, this is currently set to the value in
 *      cpi_chipid because there is only one node.
 *
 * cpi_cores_per_compunit
 *
 *      This value indicates the number of cores that are part of a compute
 *      unit. See the AMD topology section for this. This member only has real
 *      meaning currently for AMD Bulldozer family processors. For all other
 *      processors, this should currently be set to 1.
 *
 * cpi_compunitid
 *
 *      This indicates the compute unit that the logical CPU belongs to. For
 *      processors without AMD Bulldozer-style compute units this should be set
 *      to the value of cpi_coreid.
 *
 * cpi_ncpu_shr_last_cache
 *
 *      This indicates the number of logical CPUs that are sharing the same last
 *      level cache. This value should be the same for all CPUs that are sharing
 *      that cache. The last cache refers to the cache that is closest to memory
 *      and furthest away from the CPU.
 *
 * cpi_last_lvl_cacheid
 *
 *      This indicates the ID of the last cache that the logical CPU uses. This
 *      cache is often shared between multiple logical CPUs and is the cache
 *      that is closest to memory and furthest away from the CPU. This value
 *      should be the same for a group of logical CPUs only if they actually
 *      share the same last level cache. IDs should not overlap between
 *      packages.
 *
 * cpi_ncore_bits
 *
 *      This indicates the number of bits that are required to represent all of
 *      the cores in the system. As cores are derived based on their APIC IDs,
 *      we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
 *      this value to be larger than the actual number of IDs that are present
 *      in the system. This is used to size tables by the CMI framework. It is
 *      only filled in for Intel and AMD CPUs.
 *
 * cpi_nthread_bits
 *
 *      This indicates the number of bits required to represent all of the IDs
 *      that cover the logical CPUs that exist on a given core. It's OK for this
 *      value to be larger than the actual number of IDs that are present in the
 *      system.  This is used to size tables by the CMI framework. It is
 *      only filled in for Intel and AMD CPUs.
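 *
 * As a minimal sketch, the number of bits needed to cover a set of IDs can be
 * computed like this (the real code derives these values from the APIC ID
 * layout rather than from a simple count):
 *
 *      static uint_t
 *      id_bits(uint_t nids)
 *      {
 *              uint_t bits = 0;
 *
 *              while ((1U << bits) < nids)
 *                      bits++;
 *              return (bits);
 *      }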
 *
 * -----------
 * Hypervisors
 * -----------
 *
 * If trying to manage the differences between vendors wasn't bad enough, it can
 * get worse thanks to our friend hardware virtualization. Hypervisors are given
 * the ability to interpose on all cpuid instructions and change them to suit
 * their purposes. In general, this is necessary as the hypervisor wants to
 * present a more uniform set of features, or to withhold knowledge of some
 * features from the guest operating system kernel so that the guest can be
 * migrated between systems more easily.
 *
 * When it comes to trying to determine topology information, this can be a
 * double edged sword. When a hypervisor doesn't actually implement a cpuid
 * leaf, it'll often return all zeros. Because of that, you'll often see various
 * checks scattered about that verify fields are non-zero before we assume we
 * can use them.
 *
 * When it comes to topology information, the hypervisor is often incentivized
 * to lie to you about topology. This is because it doesn't always actually
 * guarantee that topology at all. The topology path we take in the system
 * depends on how the CPU advertises itself. If it advertises itself as an Intel
 * or AMD CPU, then we basically do our normal path. However, when it doesn't
 * advertise an actual vendor, then that usually turns into multiple one-core
 * CPUs that we enumerate that are often on different sockets. The actual
 * behavior depends greatly on what the hypervisor actually exposes to us.
 *
 * --------------------
 * Exposing Information
 * --------------------
 *
 * We expose CPUID information in three different forms in the system.
 *
 * The first is through the x86_featureset variable. This is used in conjunction
 * with the is_x86_feature() function. This is queried by x86-specific functions
 * to determine which features are or aren't present in the system and to make
 * decisions based upon them. For example, users of this include everything from
 * parts of the system dedicated to reliability, availability, and
 * serviceability (RAS), to making decisions about how to handle security
 * mitigations, to various x86-specific drivers. General purpose or
 * architecture independent drivers should never be calling this function.
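 *
 * For example, an x86-specific kernel component might guard an optimized code
 * path as follows; the feature chosen here is purely illustrative:
 *
 *      #include <sys/x86_archext.h>
 *
 *      if (is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
 *              // take the SSE 4.2-accelerated path
 *      }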
 *
 * The second means is through the auxiliary vector. The auxiliary vector is a
 * series of tagged data that the kernel passes down to a user program when it
 * begins executing. This information is used to indicate to programs what
 * instruction set extensions are present. For example, information about the
 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
 * since user programs cannot make use of it. However, things like the AVX
 * instruction sets are. Programs use this information to make run-time
 * decisions about what features they should use. As an example, the run-time
 * link-editor (rtld) can relocate different functions depending on the hardware
 * support available.
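 *
 * For example, a user program might query the hardware capabilities with
 * getisax(2) like this; the AV_386_AES flag is chosen purely as an
 * illustration:
 *
 *      #include <sys/auxv.h>
 *
 *      uint32_t hwcap[2];
 *      (void) getisax(hwcap, 2);
 *      if (hwcap[0] & AV_386_AES) {
 *              // use AES-NI accelerated routines
 *      }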
 *
 * The final form is through a series of accessor functions that all have the
 * form cpuid_get*. This is used by a number of different subsystems in the
 * kernel to determine more detailed information about what we're running on,
 * topology information, etc. Some of these subsystems include processor groups
 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
 * microcode, and performance monitoring. These functions all ASSERT that the
 * CPU they're being called on has reached a certain cpuid pass. If the passes
 * are rearranged, then this needs to be adjusted.
 *
 * -----------------------------------------------
 * Speculative Execution CPU Side Channel Security
 * -----------------------------------------------
 *
 * With the advent of the Spectre and Meltdown attacks, which exploit
 * speculative execution in the CPU to create side channels, there have been a
 * number of different attacks and corresponding issues that the operating
 * system needs to mitigate against. The following list covers some of the
 * common, though not exhaustive, issues that we know about, for which we have
 * done some mitigation work in the system or still need to do more:
 *
 *   - Spectre v1
 *   - swapgs (Spectre v1 variant)
 *   - Spectre v2
 *   - Meltdown (Spectre v3)
 *   - Rogue Register Read (Spectre v3a)
 *   - Speculative Store Bypass (Spectre v4)
 *   - ret2spec, SpectreRSB
 *   - L1 Terminal Fault (L1TF)
 *   - Microarchitectural Data Sampling (MDS)
 *
 * Each of these requires different sets of mitigations and has different attack
 * surfaces. For the most part, this discussion is about protecting the kernel
 * from non-kernel executing environments such as user processes and hardware
 * virtual machines. Unfortunately, there are a number of user vs. user
 * scenarios that exist with these. The rest of this section will describe the
 * overall approach that the system has taken to address these as well as their
 * shortcomings. Unfortunately, not all of the above have been handled today.
 *
 * SPECTRE v2, ret2spec, SpectreRSB
 *
 * The second variant of the spectre attack focuses on performing branch target
 * injection. This generally impacts indirect call instructions in the system.
 * There are three different ways to mitigate this issue that are commonly
 * described today:
 *
 *  1. Using Indirect Branch Restricted Speculation (IBRS).
 *  2. Using Retpolines and RSB Stuffing
 *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
 *
 * IBRS uses a feature added to microcode to restrict speculation, among other
 * things. This form of mitigation has not been used as it has been generally
 * seen as too expensive and requires reactivation upon various transitions in
 * the system.
 *
 * As a less impactful alternative to IBRS, retpolines were developed by
 * Google. These basically require one to replace indirect calls with a specific
 * trampoline that will cause speculation to fail and break the attack.
 * Retpolines require compiler support. We always build with retpolines in the
 * external thunk mode. This means that a traditional indirect call is replaced
 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
 * of this is that all indirect function calls are performed through a register.
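 *
 * For illustration, the classic retpoline thunk sequence published by Google
 * looks roughly like the following sketch (not necessarily the exact
 * instructions this codebase emits):
 *
 *      __x86_indirect_thunk_rax:
 *              call    2f              // push a benign return address
 *      1:      pause                   // speculation is trapped here
 *              lfence
 *              jmp     1b
 *      2:      movq    %rax, (%rsp)    // overwrite it with the real target
 *              ret                     // 'return' to the real target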
 *
 * We have to use a common external location for the thunk and not inline it
 * into the callsite so that we have a single place to patch these functions.
 * As it turns out, we actually have three different forms of retpolines that
 * exist in the system:
 958  *
 959  *  1. A full retpoline
 960  *  2. An AMD-specific optimized retpoline
 961  *  3. A no-op version
 962  *
 963  * The first one is used in the general case. The second one is used if we can
 964  * determine that we're on an AMD system and we can successfully toggle the
 965  * lfence serializing MSR that exists on the platform. Basically with this
 966  * present, an lfence is sufficient and we don't need to do anywhere near as
 967  * complicated a dance to successfully use retpolines.
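 *
 * As an illustrative sketch (the labels and exact layout here are ours, not
 * necessarily what the kernel emits), the first two forms look roughly like:
 *
 *      __x86_indirect_thunk_rax:
 *              call    2f
 *      1:      pause
 *              lfence
 *              jmp     1b
 *      2:      movq    %rax, (%rsp)
 *              ret
 *
 * The call pushes a return address, so any speculation through the final ret
 * lands in the benign pause/lfence loop, while the architectural path
 * overwrites that return address with the real target and returns to it. The
 * AMD variant instead relies on the serializing lfence:
 *
 *      __x86_indirect_thunk_rax:
 *              lfence
 *              jmp     *%rax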
 968  *
 969  * The third form described above is the most curious. It turns out that the way
 970  * that retpolines are implemented is that they rely on how speculation is
 971  * performed on a 'ret' instruction. Intel has continued to optimize this
 972  * process (which is partly why we need to have return stack buffer stuffing,
 973  * but more on that in a bit) and in processors starting with Cascade Lake
 974  * on the server side, it's dangerous to rely on retpolines. Instead, a new
 975  * mechanism has been introduced called Enhanced IBRS (EIBRS).
 976  *
 977  * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
 978  * physical core. However, if this is the case, we don't want to use retpolines
 979  * any more. Therefore if EIBRS is present, we end up turning each retpoline
 980  * function (called a thunk) into a jmp instruction. This means that we're still
 981  * paying the cost of an extra jump to the external thunk, but it gives us
 982  * flexibility and the ability to have a single kernel image that works across a
 983  * wide variety of systems and hardware features.
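 *
 * Conceptually, under EIBRS the same thunk is reduced to nothing more than a
 * plain indirect jump (again, an illustrative sketch):
 *
 *      __x86_indirect_thunk_rax:
 *              jmp     *%rax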
 984  *
 985  * Unfortunately, this alone is insufficient. First, Skylake systems have
 986  * additional speculation for the Return Stack Buffer (RSB) which is used to
 987  * return from call instructions which retpolines take advantage of. However,
 988  * this problem is not just limited to Skylake and is actually more pernicious.
 989  * The SpectreRSB paper introduces several more problems that can arise with
 990  * dealing with this. The RSB can be poisoned just like the indirect branch
 991  * predictor. This means that one needs to clear the RSB when transitioning
 992  * between two different privilege domains. Some examples include:
 993  *
 994  *  - Switching between two different user processes
 995  *  - Going between user land and the kernel
 996  *  - Returning to the kernel from a hardware virtual machine
 997  *
 998  * Mitigating this involves combining a couple of different things. The first is
 999  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1000  * Bridge. When an RSB entry refers to a user address and we're executing in the
1001  * kernel, speculation through it will be stopped when SMEP is enabled. This
1002  * protects against a number of the different cases that we would normally be
1003  * worried about such as when we enter the kernel from user land.
1004  *
 * To protect against additional manipulation of the RSB from other contexts,
 * such as a non-root VMX context attacking the kernel, we first look to
 * enhanced IBRS. When EIBRS is present and enabled, then there is nothing else
 * that we need to do to protect the kernel at this time.
1009  *
1010  * On CPUs without EIBRS we need to manually overwrite the contents of the
1011  * return stack buffer. We do this through the x86_rsb_stuff() function.
1012  * Currently this is employed on context switch. The x86_rsb_stuff() function is
1013  * disabled when enhanced IBRS is present because Intel claims on such systems
1014  * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1015  * to user attacks via the RSB.
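 *
 * Conceptually, stuffing executes a run of dummy calls so that every RSB
 * entry points at a benign speculation trap. A sketch of the idea (not the
 * literal x86_rsb_stuff() implementation) in GNU as-style assembly:
 *
 *              .rept   16
 *              call    1f
 *      2:      pause
 *              lfence
 *              jmp     2b
 *      1:
 *              .endr
 *              addq    $(16 * 8), %rsp
 *
 * Each call pushes a benign RSB entry whose target is the pause/lfence loop;
 * the final addq reclaims the stack space the dummy calls consumed.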
1016  *
1017  * If SMEP is not present, then we would have to stuff the RSB every time we
1018  * transitioned from user mode to the kernel, which isn't very practical right
1019  * now.
1020  *
1021  * To fully protect user to user and vmx to vmx attacks from these classes of
1022  * issues, we would also need to allow them to opt into performing an Indirect
1023  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1024  *
1025  * By default, the system will enable RSB stuffing and the required variant of
1026  * retpolines and store that information in the x86_spectrev2_mitigation value.
1027  * This will be evaluated after a microcode update as well, though it is
1028  * expected that microcode updates will not take away features. This may mean
 * that a late microcode load may not end up in the optimal configuration
1030  * (though this should be rare).
1031  *
1032  * Currently we do not build kmdb with retpolines or perform any additional side
1033  * channel security mitigations for it. One complication with kmdb is that it
1034  * requires its own retpoline thunks and it would need to adjust itself based on
1035  * what the kernel does. The threat model of kmdb is more limited and therefore
1036  * it may make more sense to investigate using prediction barriers as the whole
1037  * system is only executing a single instruction at a time while in kmdb.
1038  *
1039  * SPECTRE v1, v4
1040  *
1041  * The v1 and v4 variants of spectre are not currently mitigated in the
1042  * system and require other classes of changes to occur in the code.
1043  *
1044  * SPECTRE v1 (SWAPGS VARIANT)
1045  *
 * The Spectre v1 class of vulnerabilities isn't only about bounds checks; it
 * can generally affect any branch-dependent code. The swapgs issue is one
1048  * variant of this. If we are coming in from userspace, we can have code like
1049  * this:
1050  *
1051  *      cmpw    $KCS_SEL, REGOFF_CS(%rsp)
1052  *      je      1f
1053  *      movq    $0, REGOFF_SAVFP(%rsp)
1054  *      swapgs
1055  *      1:
1056  *      movq    %gs:CPU_THREAD, %rax
1057  *
1058  * If an attacker can cause a mis-speculation of the branch here, we could skip
1059  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1060  * load. If subsequent code can act as the usual Spectre cache gadget, this
1061  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1062  * any use of the %gs override.
1063  *
1064  * The other case is also an issue: if we're coming into a trap from kernel
1065  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1066  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1067  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1068  * case, and the fix is the same in both cases (an lfence at the branch target
1069  * 1: in this example), we'll just do it unconditionally.
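 *
 * With that fix applied, the earlier sketch conceptually becomes:
 *
 *      cmpw    $KCS_SEL, REGOFF_CS(%rsp)
 *      je      1f
 *      movq    $0, REGOFF_SAVFP(%rsp)
 *      swapgs
 *      1:
 *      lfence
 *      movq    %gs:CPU_THREAD, %rax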
1070  *
 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
 * harder for user-space to actually set a useful %gsbase value: although it's
 * not entirely clear, it may still be feasible via lwp_setprivate(), so we
 * mitigate anyway.
1075  *
1076  * MELTDOWN
1077  *
 * Meltdown, or Spectre v3, allowed a user process to read any data mapped into
 * its address space, regardless of whether or not the page tables in question
 * allowed the user the ability to read them. The solution to Meltdown
1081  * is kernel page table isolation. In this world, there are two page tables that
1082  * are used for a process, one in user land and one in the kernel. To implement
1083  * this we use per-CPU page tables and switch between the user and kernel
1084  * variants when entering and exiting the kernel.  For more information about
1085  * this process and how the trampolines work, please see the big theory
1086  * statements and additional comments in:
1087  *
1088  *  - uts/i86pc/ml/kpti_trampolines.s
1089  *  - uts/i86pc/vm/hat_i86.c
1090  *
1091  * While Meltdown only impacted Intel systems and there are also Intel systems
1092  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1093  * kernel page table isolation enabled. While this may at first seem weird, an
1094  * important thing to remember is that you can't speculatively read an address
1095  * if it's never in your page table at all. Having user processes without kernel
1096  * pages present provides us with an important layer of defense in the kernel
1097  * against any other side channel attacks that exist and have yet to be
1098  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1099  * default, no matter the x86 system.
1100  *
1101  * L1 TERMINAL FAULT
1102  *
1103  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1104  * execution uses page table entries. Effectively, it is two different problems.
1105  * The first is that it ignores the not present bit in the page table entries
1106  * when performing speculative execution. This means that something can
1107  * speculatively read the listed physical address if it's present in the L1
1108  * cache under certain conditions (see Intel's documentation for the full set of
1109  * conditions). Secondly, this can be used to bypass hardware virtualization
1110  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1111  * instructions.
1112  *
1113  * For the non-hardware virtualized case, this is relatively easy to deal with.
1114  * We must make sure that all unmapped pages have an address of zero. This means
1115  * that they could read the first 4k of physical memory; however, we never use
1116  * that first page in the operating system and always skip putting it in our
 * memory map, even if firmware tells us we can use it. Unlike some other
 * systems, we also do not try to store extra metadata in a PTE's address and
 * reserved bits, which is what made this problematic in those cases.
1120  *
1121  * For hardware virtual machines things are more complicated. Because they can
1122  * construct their own page tables, it isn't hard for them to perform this
1123  * attack against any physical address. The one wrinkle is that this physical
1124  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1125  * to flush the L1 data cache. We wrap this up in the function
1126  * spec_uarch_flush(). This function is also used in the mitigation of
1127  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1128  * hypervisors such as KVM or bhyve are responsible for performing this before
1129  * entering the guest.
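 *
 * A minimal sketch of the L1D flush itself, assuming the MSR constant names
 * shown here (the real spec_uarch_flush() also covers the MDS cases below):
 *
 *      movl    $MSR_IA32_FLUSH_CMD, %ecx
 *      movl    $IA32_FLUSH_CMD_L1D, %eax
 *      xorl    %edx, %edx
 *      wrmsr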
1130  *
1131  * Because this attack takes place in the L1 cache, there's another wrinkle
1132  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1133  * designs. This means that when a thread enters a hardware virtualized context
1134  * and flushes the L1 data cache, the other thread on the processor may then go
1135  * ahead and put new data in it that can be potentially attacked. While one
1136  * solution is to disable SMT on the system, another option that is available is
1137  * to use a feature for hardware virtualization called 'SMT exclusion'. This
 * goes through and makes sure that if an HVM is being scheduled on one thread,
 * then whatever is running on the other thread is from the same hardware
 * virtual machine.
1140  * If an interrupt comes in or the guest exits to the broader system, then the
1141  * other SMT thread will be kicked out.
1142  *
1143  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1144  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1145  * perform L1TF related mitigations.
1146  *
1147  * MICROARCHITECTURAL DATA SAMPLING
1148  *
 * Microarchitectural data sampling (MDS) is a combination of four discrete,
 * related vulnerabilities affecting various parts of the CPU's
 * microarchitectural implementation around load, store, and fill buffers.
1152  * Specifically it is made up of the following subcomponents:
1153  *
1154  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1155  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1156  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1157  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1158  *
1159  * To begin addressing these, Intel has introduced another feature in microcode
 * called MD_CLEAR. This changes the verw instruction such that executing it
 * also flushes the state of the affected structures. The L1TF L1D flush
 * mechanism is likewise updated to flush this state when this microcode is
 * present.
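 *
 * As an illustrative sketch, the flush boils down to executing verw with the
 * selector of a valid, writable data segment as its memory operand:
 *
 *      subq    $8, %rsp
 *      movw    %ds, (%rsp)
 *      verw    (%rsp)
 *      addq    $8, %rsp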
1164  *
1165  * Primarily we need to flush this state whenever we transition from the kernel
1166  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1167  * little bit different. Here the structures are statically sized when a logical
1168  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1170  * mwait, or another ACPI method. To perform these flushes, we call
1171  * x86_md_clear() at all of these transition points.
1172  *
1173  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1174  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1175  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1176  * a no-op.
1177  *
1178  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1179  * particular, everything we've discussed above is only valid for a single
1180  * thread executing on a core. In the case where you have hyper-threading
1181  * present, this attack can be performed between threads. The theoretical fix
1182  * for this is to ensure that both threads are always in the same security
1183  * domain. This means that they are executing in the same ring and mutually
1184  * trust each other. Practically speaking, this would mean that a system call
1185  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1186  * Rather than implement this, we recommend that one disables hyper-threading
1187  * through the use of psradm -aS.
1188  *
1189  * SUMMARY
1190  *
1191  * The following table attempts to summarize the mitigations for various issues
1192  * and what's done in various places:
1193  *
1194  *  - Spectre v1: Not currently mitigated
1195  *  - swapgs: lfences after swapgs paths
1196  *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1197  *  - Meltdown: Kernel Page Table Isolation
1198  *  - Spectre v3a: Updated CPU microcode
1199  *  - Spectre v4: Not currently mitigated
1200  *  - SpectreRSB: SMEP and RSB Stuffing
1201  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1202  *  - MDS: x86_md_clear, requires microcode, disabling hyper threading
1203  *
1204  * The following table indicates the x86 feature set bits that indicate that a
1205  * given problem has been solved or a notable feature is present:
1206  *
1207  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1208  *  - MDS_NO: All forms of MDS
1209  */
1210 
1211 #include <sys/types.h>
1212 #include <sys/archsystm.h>
1213 #include <sys/x86_archext.h>
1214 #include <sys/kmem.h>
1215 #include <sys/systm.h>
1216 #include <sys/cmn_err.h>
1217 #include <sys/sunddi.h>
1218 #include <sys/sunndi.h>
1219 #include <sys/cpuvar.h>
1220 #include <sys/processor.h>
1221 #include <sys/sysmacros.h>
1222 #include <sys/pg.h>
1223 #include <sys/fp.h>
1224 #include <sys/controlregs.h>
1225 #include <sys/bitmap.h>
1226 #include <sys/auxv_386.h>
1227 #include <sys/memnode.h>
1228 #include <sys/pci_cfgspace.h>
1229 #include <sys/comm_page.h>
1230 #include <sys/mach_mmu.h>
1231 #include <sys/ucode.h>
1232 #include <sys/tsc.h>
1233 #include <sys/kobj.h>
1234 #include <sys/asm_misc.h>
1235 
1236 #ifdef __xpv
1237 #include <sys/hypervisor.h>
1238 #else
1239 #include <sys/ontrap.h>
1240 #endif
1241 
1242 uint_t x86_vendor = X86_VENDOR_IntelClone;
1243 uint_t x86_type = X86_TYPE_OTHER;
1244 uint_t x86_clflush_size = 0;
1245 
1246 #if defined(__xpv)
1247 int x86_use_pcid = 0;
1248 int x86_use_invpcid = 0;
1249 #else
1250 int x86_use_pcid = -1;
1251 int x86_use_invpcid = -1;
1252 #endif
1253 
1254 typedef enum {
1255         X86_SPECTREV2_RETPOLINE,
1256         X86_SPECTREV2_RETPOLINE_AMD,
1257         X86_SPECTREV2_ENHANCED_IBRS,
1258         X86_SPECTREV2_DISABLED
1259 } x86_spectrev2_mitigation_t;
1260 
1261 uint_t x86_disable_spectrev2 = 0;
1262 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1263     X86_SPECTREV2_RETPOLINE;
1264 
1265 uint_t pentiumpro_bug4046376;
1266 
1267 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1268 
1269 static char *x86_feature_names[NUM_X86_FEATURES] = {
1270         "lgpg",
1271         "tsc",
1272         "msr",
1273         "mtrr",
1274         "pge",
1275         "de",
1276         "cmov",
1277         "mmx",
1278         "mca",
1279         "pae",
1280         "cv8",
1281         "pat",
1282         "sep",
1283         "sse",
1284         "sse2",
1285         "htt",
1286         "asysc",
1287         "nx",
1288         "sse3",
1289         "cx16",
1290         "cmp",
1291         "tscp",
1292         "mwait",
1293         "sse4a",
1294         "cpuid",
1295         "ssse3",
1296         "sse4_1",
1297         "sse4_2",
1298         "1gpg",
1299         "clfsh",
1300         "64",
1301         "aes",
1302         "pclmulqdq",
1303         "xsave",
1304         "avx",
1305         "vmx",
1306         "svm",
1307         "topoext",
1308         "f16c",
1309         "rdrand",
1310         "x2apic",
1311         "avx2",
1312         "bmi1",
1313         "bmi2",
1314         "fma",
1315         "smep",
1316         "smap",
1317         "adx",
1318         "rdseed",
1319         "mpx",
1320         "avx512f",
1321         "avx512dq",
1322         "avx512pf",
1323         "avx512er",
1324         "avx512cd",
1325         "avx512bw",
1326         "avx512vl",
1327         "avx512fma",
1328         "avx512vbmi",
1329         "avx512_vpopcntdq",
1330         "avx512_4vnniw",
1331         "avx512_4fmaps",
1332         "xsaveopt",
1333         "xsavec",
1334         "xsaves",
1335         "sha",
1336         "umip",
1337         "pku",
1338         "ospke",
1339         "pcid",
1340         "invpcid",
1341         "ibrs",
1342         "ibpb",
1343         "stibp",
1344         "ssbd",
1345         "ssbd_virt",
1346         "rdcl_no",
1347         "ibrs_all",
1348         "rsba",
1349         "ssb_no",
1350         "stibp_all",
1351         "flush_cmd",
1352         "l1d_vmentry_no",
1353         "fsgsbase",
1354         "clflushopt",
1355         "clwb",
1356         "monitorx",
1357         "clzero",
1358         "xop",
1359         "fma4",
1360         "tbm",
1361         "avx512_vnni",
1362         "amd_pcec",
        "md_clear",
1364         "mds_no",
1365         "core_thermal",
1366         "pkg_thermal"
1367 };
1368 
1369 boolean_t
1370 is_x86_feature(void *featureset, uint_t feature)
1371 {
1372         ASSERT(feature < NUM_X86_FEATURES);
1373         return (BT_TEST((ulong_t *)featureset, feature));
1374 }
1375 
1376 void
1377 add_x86_feature(void *featureset, uint_t feature)
1378 {
1379         ASSERT(feature < NUM_X86_FEATURES);
1380         BT_SET((ulong_t *)featureset, feature);
1381 }
1382 
1383 void
1384 remove_x86_feature(void *featureset, uint_t feature)
1385 {
1386         ASSERT(feature < NUM_X86_FEATURES);
1387         BT_CLEAR((ulong_t *)featureset, feature);
1388 }
1389 
1390 boolean_t
1391 compare_x86_featureset(void *setA, void *setB)
1392 {
1393         /*
1394          * We assume that the unused bits of the bitmap are always zero.
1395          */
1396         if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1397                 return (B_TRUE);
1398         } else {
1399                 return (B_FALSE);
1400         }
1401 }
1402 
1403 void
1404 print_x86_featureset(void *featureset)
1405 {
1406         uint_t i;
1407 
1408         for (i = 0; i < NUM_X86_FEATURES; i++) {
1409                 if (is_x86_feature(featureset, i)) {
1410                         cmn_err(CE_CONT, "?x86_feature: %s\n",
1411                             x86_feature_names[i]);
1412                 }
1413         }
1414 }
1415 
1416 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1417 static size_t xsave_state_size = 0;
1418 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1419 boolean_t xsave_force_disable = B_FALSE;
1420 extern int disable_smap;
1421 
1422 /*
 * This is set to the platform type we are running on.
1424  */
1425 static int platform_type = -1;
1426 
1427 #if !defined(__xpv)
1428 /*
1429  * Variable to patch if hypervisor platform detection needs to be
1430  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1431  */
1432 int enable_platform_detection = 1;
1433 #endif
1434 
1435 /*
1436  * monitor/mwait info.
1437  *
1438  * size_actual and buf_actual are the real address and size allocated to get
 * proper mwait_buf alignment.  buf_actual and size_actual should be passed
 * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
 * processor cache-line alignment, but this is not guaranteed in the future.
1442  */
1443 struct mwait_info {
1444         size_t          mon_min;        /* min size to avoid missed wakeups */
1445         size_t          mon_max;        /* size to avoid false wakeups */
1446         size_t          size_actual;    /* size actually allocated */
1447         void            *buf_actual;    /* memory actually allocated */
1448         uint32_t        support;        /* processor support of monitor/mwait */
1449 };
1450 
1451 /*
1452  * xsave/xrestor info.
1453  *
1454  * This structure contains HW feature bits and the size of the xsave save area.
1455  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1456  * (xsave_state) to describe the xsave layout. However, at runtime the
1457  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1458  * xsave_state structure simply represents the legacy layout of the beginning
1459  * of the xsave area.
1460  */
1461 struct xsave_info {
1462         uint32_t        xsav_hw_features_low;   /* Supported HW features */
1463         uint32_t        xsav_hw_features_high;  /* Supported HW features */
1464         size_t          xsav_max_size;  /* max size save area for HW features */
1465         size_t          ymm_size;       /* AVX: size of ymm save area */
1466         size_t          ymm_offset;     /* AVX: offset for ymm save area */
1467         size_t          bndregs_size;   /* MPX: size of bndregs save area */
1468         size_t          bndregs_offset; /* MPX: offset for bndregs save area */
1469         size_t          bndcsr_size;    /* MPX: size of bndcsr save area */
1470         size_t          bndcsr_offset;  /* MPX: offset for bndcsr save area */
1471         size_t          opmask_size;    /* AVX512: size of opmask save */
1472         size_t          opmask_offset;  /* AVX512: offset for opmask save */
1473         size_t          zmmlo_size;     /* AVX512: size of zmm 256 save */
1474         size_t          zmmlo_offset;   /* AVX512: offset for zmm 256 save */
1475         size_t          zmmhi_size;     /* AVX512: size of zmm hi reg save */
1476         size_t          zmmhi_offset;   /* AVX512: offset for zmm hi reg save */
1477 };
1478 
1479 
1480 /*
1481  * These constants determine how many of the elements of the
1482  * cpuid we cache in the cpuid_info data structure; the
1483  * remaining elements are accessible via the cpuid instruction.
1484  */
1485 
1486 #define NMAX_CPI_STD    8               /* eax = 0 .. 7 */
1487 #define NMAX_CPI_EXTD   0x1f            /* eax = 0x80000000 .. 0x8000001e */
1488 
1489 /*
1490  * See the big theory statement for a more detailed explanation of what some of
1491  * these members mean.
1492  */
1493 struct cpuid_info {
1494         uint_t cpi_pass;                /* last pass completed */
1495         /*
1496          * standard function information
1497          */
1498         uint_t cpi_maxeax;              /* fn 0: %eax */
1499         char cpi_vendorstr[13];         /* fn 0: %ebx:%ecx:%edx */
1500         uint_t cpi_vendor;              /* enum of cpi_vendorstr */
1501 
1502         uint_t cpi_family;              /* fn 1: extended family */
1503         uint_t cpi_model;               /* fn 1: extended model */
1504         uint_t cpi_step;                /* fn 1: stepping */
1505         chipid_t cpi_chipid;            /* fn 1: %ebx:  Intel: chip # */
1506                                         /*              AMD: package/socket # */
1507         uint_t cpi_brandid;             /* fn 1: %ebx: brand ID */
1508         int cpi_clogid;                 /* fn 1: %ebx: thread # */
1509         uint_t cpi_ncpu_per_chip;       /* fn 1: %ebx: logical cpu count */
1510         uint8_t cpi_cacheinfo[16];      /* fn 2: intel-style cache desc */
1511         uint_t cpi_ncache;              /* fn 2: number of elements */
1512         uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1513         id_t cpi_last_lvl_cacheid;      /* fn 4: %eax: derived cache id */
1514         uint_t cpi_cache_leaf_size;     /* Number of cache elements */
1515                                         /* Intel fn: 4, AMD fn: 8000001d */
        struct cpuid_regs **cpi_cache_leaves;  /* Actual leaves from above */
1517         struct cpuid_regs cpi_std[NMAX_CPI_STD];        /* 0 .. 7 */
1518         /*
1519          * extended function information
1520          */
1521         uint_t cpi_xmaxeax;             /* fn 0x80000000: %eax */
1522         char cpi_brandstr[49];          /* fn 0x8000000[234] */
1523         uint8_t cpi_pabits;             /* fn 0x80000006: %eax */
1524         uint8_t cpi_vabits;             /* fn 0x80000006: %eax */
1525         uint8_t cpi_fp_amd_save;        /* AMD: FP error pointer save rqd. */
1526         struct  cpuid_regs cpi_extd[NMAX_CPI_EXTD];     /* 0x800000XX */
1527 
1528         id_t cpi_coreid;                /* same coreid => strands share core */
1529         int cpi_pkgcoreid;              /* core number within single package */
1530         uint_t cpi_ncore_per_chip;      /* AMD: fn 0x80000008: %ecx[7-0] */
1531                                         /* Intel: fn 4: %eax[31-26] */
1532 
1533         /*
1534          * These values represent the number of bits that are required to store
1535          * information about the number of cores and threads.
1536          */
1537         uint_t cpi_ncore_bits;
1538         uint_t cpi_nthread_bits;
1539         /*
1540          * supported feature information
1541          */
1542         uint32_t cpi_support[6];
1543 #define STD_EDX_FEATURES        0
1544 #define AMD_EDX_FEATURES        1
1545 #define TM_EDX_FEATURES         2
1546 #define STD_ECX_FEATURES        3
1547 #define AMD_ECX_FEATURES        4
1548 #define STD_EBX_FEATURES        5
1549         /*
1550          * Synthesized information, where known.
1551          */
1552         uint32_t cpi_chiprev;           /* See X86_CHIPREV_* in x86_archext.h */
1553         const char *cpi_chiprevstr;     /* May be NULL if chiprev unknown */
1554         uint32_t cpi_socket;            /* Chip package/socket type */
1555 
1556         struct mwait_info cpi_mwait;    /* fn 5: monitor/mwait info */
1557         uint32_t cpi_apicid;
1558         uint_t cpi_procnodeid;          /* AMD: nodeID on HT, Intel: chipid */
1559         uint_t cpi_procnodes_per_pkg;   /* AMD: # of nodes in the package */
1560                                         /* Intel: 1 */
1561         uint_t cpi_compunitid;          /* AMD: ComputeUnit ID, Intel: coreid */
1562         uint_t cpi_cores_per_compunit;  /* AMD: # of cores in the ComputeUnit */
1563 
1564         struct xsave_info cpi_xsave;    /* fn D: xsave/xrestor info */
1565 };
1566 
1567 
1568 static struct cpuid_info cpuid_info0;
1569 
1570 /*
1571  * These bit fields are defined by the Intel Application Note AP-485
1572  * "Intel Processor Identification and the CPUID Instruction"
1573  */
1574 #define CPI_FAMILY_XTD(cpi)     BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1575 #define CPI_MODEL_XTD(cpi)      BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1576 #define CPI_TYPE(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1577 #define CPI_FAMILY(cpi)         BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1578 #define CPI_STEP(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1579 #define CPI_MODEL(cpi)          BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1580 
1581 #define CPI_FEATURES_EDX(cpi)           ((cpi)->cpi_std[1].cp_edx)
1582 #define CPI_FEATURES_ECX(cpi)           ((cpi)->cpi_std[1].cp_ecx)
1583 #define CPI_FEATURES_XTD_EDX(cpi)       ((cpi)->cpi_extd[1].cp_edx)
1584 #define CPI_FEATURES_XTD_ECX(cpi)       ((cpi)->cpi_extd[1].cp_ecx)
1585 #define CPI_FEATURES_7_0_EBX(cpi)       ((cpi)->cpi_std[7].cp_ebx)
1586 #define CPI_FEATURES_7_0_ECX(cpi)       ((cpi)->cpi_std[7].cp_ecx)
1587 #define CPI_FEATURES_7_0_EDX(cpi)       ((cpi)->cpi_std[7].cp_edx)
1588 
1589 #define CPI_BRANDID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1590 #define CPI_CHUNKS(cpi)         BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1591 #define CPI_CPU_COUNT(cpi)      BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1592 #define CPI_APIC_ID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1593 
1594 #define CPI_MAXEAX_MAX          0x100           /* sanity control */
1595 #define CPI_XMAXEAX_MAX         0x80000100
1596 #define CPI_FN4_ECX_MAX         0x20            /* sanity: max fn 4 levels */
1597 #define CPI_FNB_ECX_MAX         0x20            /* sanity: max fn B levels */
1598 
1599 /*
1600  * Function 4 (Deterministic Cache Parameters) macros
1601  * Defined by Intel Application Note AP-485
1602  */
1603 #define CPI_NUM_CORES(regs)             BITX((regs)->cp_eax, 31, 26)
1604 #define CPI_NTHR_SHR_CACHE(regs)        BITX((regs)->cp_eax, 25, 14)
1605 #define CPI_FULL_ASSOC_CACHE(regs)      BITX((regs)->cp_eax, 9, 9)
1606 #define CPI_SELF_INIT_CACHE(regs)       BITX((regs)->cp_eax, 8, 8)
1607 #define CPI_CACHE_LVL(regs)             BITX((regs)->cp_eax, 7, 5)
1608 #define CPI_CACHE_TYPE(regs)            BITX((regs)->cp_eax, 4, 0)
1609 #define CPI_CPU_LEVEL_TYPE(regs)        BITX((regs)->cp_ecx, 15, 8)
1610 
1611 #define CPI_CACHE_WAYS(regs)            BITX((regs)->cp_ebx, 31, 22)
1612 #define CPI_CACHE_PARTS(regs)           BITX((regs)->cp_ebx, 21, 12)
1613 #define CPI_CACHE_COH_LN_SZ(regs)       BITX((regs)->cp_ebx, 11, 0)
1614 
1615 #define CPI_CACHE_SETS(regs)            BITX((regs)->cp_ecx, 31, 0)
1616 
1617 #define CPI_PREFCH_STRIDE(regs)         BITX((regs)->cp_edx, 9, 0)
1618 
1619 
1620 /*
1621  * A couple of shorthand macros to identify "later" P6-family chips
1622  * like the Pentium M and Core.  First, the "older" P6-based stuff
1623  * (loosely defined as "pre-Pentium-4"):
1624  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1625  */
1626 #define IS_LEGACY_P6(cpi) (                     \
1627         cpi->cpi_family == 6 &&                      \
1628                 (cpi->cpi_model == 1 ||              \
1629                 cpi->cpi_model == 3 ||               \
1630                 cpi->cpi_model == 5 ||               \
1631                 cpi->cpi_model == 6 ||               \
1632                 cpi->cpi_model == 7 ||               \
1633                 cpi->cpi_model == 8 ||               \
1634                 cpi->cpi_model == 0xA ||     \
1635                 cpi->cpi_model == 0xB)               \
1636 )
1637 
1638 /* A "new F6" is everything with family 6 that's not the above */
1639 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1640 
1641 /* Extended family/model support */
1642 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1643         cpi->cpi_family >= 0xf)
1644 
1645 /*
1646  * Info for monitor/mwait idle loop.
1647  *
1648  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1649  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1650  * 2006.
1651  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1652  * Documentation Updates" #33633, Rev 2.05, December 2006.
1653  */
1654 #define MWAIT_SUPPORT           (0x00000001)    /* mwait supported */
#define MWAIT_EXTENSIONS        (0x00000002)    /* extension supported */
1656 #define MWAIT_ECX_INT_ENABLE    (0x00000004)    /* ecx 1 extension supported */
1657 #define MWAIT_SUPPORTED(cpi)    ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1658 #define MWAIT_INT_ENABLE(cpi)   ((cpi)->cpi_std[5].cp_ecx & 0x2)
1659 #define MWAIT_EXTENSION(cpi)    ((cpi)->cpi_std[5].cp_ecx & 0x1)
1660 #define MWAIT_SIZE_MIN(cpi)     BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1661 #define MWAIT_SIZE_MAX(cpi)     BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1662 /*
1663  * Number of sub-cstates for a given c-state.
1664  */
1665 #define MWAIT_NUM_SUBC_STATES(cpi, c_state)                     \
1666         BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1667 
1668 /*
1669  * XSAVE leaf 0xD enumeration
1670  */
1671 #define CPUID_LEAFD_2_YMM_OFFSET        576
1672 #define CPUID_LEAFD_2_YMM_SIZE          256
1673 
1674 /*
1675  * Common extended leaf names to cut down on typos.
1676  */
1677 #define CPUID_LEAF_EXT_0                0x80000000
1678 #define CPUID_LEAF_EXT_8                0x80000008
1679 #define CPUID_LEAF_EXT_1d               0x8000001d
1680 #define CPUID_LEAF_EXT_1e               0x8000001e
1681 
1682 /*
 * Functions we consume from cpuid_subr.c;  don't publish these in a header
1684  * file to try and keep people using the expected cpuid_* interfaces.
1685  */
1686 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1687 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1688 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1689 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1690 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1691 
1692 /*
 * Apply various platform-dependent restrictions where the
1694  * underlying platform restrictions mean the CPU can be marked
1695  * as less capable than its cpuid instruction would imply.
1696  */
1697 #if defined(__xpv)
1698 static void
1699 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1700 {
1701         switch (eax) {
1702         case 1: {
1703                 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1704                     0 : CPUID_INTC_EDX_MCA;
1705                 cp->cp_edx &=
1706                     ~(mcamask |
1707                     CPUID_INTC_EDX_PSE |
1708                     CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1709                     CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1710                     CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1711                     CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1712                     CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1713                 break;
1714         }
1715 
1716         case 0x80000001:
1717                 cp->cp_edx &=
1718                     ~(CPUID_AMD_EDX_PSE |
1719                     CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1720                     CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1721                     CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1722                     CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1723                     CPUID_AMD_EDX_TSCP);
1724                 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1725                 break;
1726         default:
1727                 break;
1728         }
1729 
1730         switch (vendor) {
1731         case X86_VENDOR_Intel:
1732                 switch (eax) {
1733                 case 4:
1734                         /*
1735                          * Zero out the (ncores-per-chip - 1) field
1736                          */
1737                         cp->cp_eax &= 0x03fffffff;
1738                         break;
1739                 default:
1740                         break;
1741                 }
1742                 break;
1743         case X86_VENDOR_AMD:
1744                 switch (eax) {
1745 
1746                 case 0x80000001:
1747                         cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1748                         break;
1749 
1750                 case CPUID_LEAF_EXT_8:
1751                         /*
1752                          * Zero out the (ncores-per-chip - 1) field
1753                          */
1754                         cp->cp_ecx &= 0xffffff00;
1755                         break;
1756                 default:
1757                         break;
1758                 }
1759                 break;
1760         default:
1761                 break;
1762         }
1763 }
1764 #else
1765 #define platform_cpuid_mangle(vendor, eax, cp)  /* nothing */
1766 #endif
1767 
1768 /*
1769  *  Some undocumented ways of patching the results of the cpuid
1770  *  instruction to permit running Solaris 10 on future cpus that
1771  *  we don't currently support.  Could be set to non-zero values
1772  *  via settings in eeprom.
1773  */
1774 
1775 uint32_t cpuid_feature_ecx_include;
1776 uint32_t cpuid_feature_ecx_exclude;
1777 uint32_t cpuid_feature_edx_include;
1778 uint32_t cpuid_feature_edx_exclude;
1779 
1780 /*
1781  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1782  */
1783 void
1784 cpuid_alloc_space(cpu_t *cpu)
1785 {
1786         /*
1787          * By convention, cpu0 is the boot cpu, which is set up
1788          * before memory allocation is available.  All other cpus get
1789          * their cpuid_info struct allocated here.
1790          */
1791         ASSERT(cpu->cpu_id != 0);
1792         ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1793         cpu->cpu_m.mcpu_cpi =
1794             kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1795 }
1796 
1797 void
1798 cpuid_free_space(cpu_t *cpu)
1799 {
1800         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1801         int i;
1802 
1803         ASSERT(cpi != NULL);
1804         ASSERT(cpi != &cpuid_info0);
1805 
1806         /*
1807          * Free up any cache leaf related dynamic storage. The first entry was
1808          * cached from the standard cpuid storage, so we should not free it.
1809          */
1810         for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1811                 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1812         if (cpi->cpi_cache_leaf_size > 0)
1813                 kmem_free(cpi->cpi_cache_leaves,
1814                     cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1815 
1816         kmem_free(cpi, sizeof (*cpi));
1817         cpu->cpu_m.mcpu_cpi = NULL;
1818 }
1819 
1820 #if !defined(__xpv)
1821 /*
1822  * Determine the type of the underlying platform. This is used to customize
1823  * initialization of various subsystems (e.g. TSC). determine_platform() must
1824  * only ever be called once to prevent two processors from seeing different
1825  * values of platform_type. Must be called before cpuid_pass1(), the earliest
1826  * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1827  */
1828 void
1829 determine_platform(void)
1830 {
1831         struct cpuid_regs cp;
1832         uint32_t base;
1833         uint32_t regs[4];
1834         char *hvstr = (char *)regs;
1835 
1836         ASSERT(platform_type == -1);
1837 
1838         platform_type = HW_NATIVE;
1839 
1840         if (!enable_platform_detection)
1841                 return;
1842 
1843         /*
         * If the Hypervisor CPUID bit is set, try to determine the hypervisor
1845          * vendor signature, and set platform type accordingly.
1846          *
1847          * References:
1848          * http://lkml.org/lkml/2008/10/1/246
1849          * http://kb.vmware.com/kb/1009458
1850          */
1851         cp.cp_eax = 0x1;
1852         (void) __cpuid_insn(&cp);
1853         if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1854                 cp.cp_eax = 0x40000000;
1855                 (void) __cpuid_insn(&cp);
1856                 regs[0] = cp.cp_ebx;
1857                 regs[1] = cp.cp_ecx;
1858                 regs[2] = cp.cp_edx;
1859                 regs[3] = 0;
1860                 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1861                         platform_type = HW_XEN_HVM;
1862                         return;
1863                 }
1864                 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1865                         platform_type = HW_VMWARE;
1866                         return;
1867                 }
1868                 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1869                         platform_type = HW_KVM;
1870                         return;
1871                 }
1872                 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1873                         platform_type = HW_BHYVE;
1874                         return;
1875                 }
1876                 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1877                         platform_type = HW_MICROSOFT;
1878         } else {
1879                 /*
                 * Check older VMware hardware versions. The VMware hypervisor
                 * is detected by performing an IN operation to the VMware
                 * hypervisor port and checking that the value returned in %ebx
                 * is the VMware hypervisor magic value.
1884                  *
1885                  * References: http://kb.vmware.com/kb/1009458
1886                  */
1887                 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1888                 if (regs[1] == VMWARE_HVMAGIC) {
1889                         platform_type = HW_VMWARE;
1890                         return;
1891                 }
1892         }
1893 
1894         /*
1895          * Check Xen hypervisor. In a fully virtualized domain,
1896          * Xen's pseudo-cpuid function returns a string representing the
1897          * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1898          * supported cpuid function. We need at least a (base + 2) leaf value
1899          * to do what we want to do. Try different base values, since the
1900          * hypervisor might use a different one depending on whether Hyper-V
1901          * emulation is switched on by default or not.
1902          */
1903         for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1904                 cp.cp_eax = base;
1905                 (void) __cpuid_insn(&cp);
1906                 regs[0] = cp.cp_ebx;
1907                 regs[1] = cp.cp_ecx;
1908                 regs[2] = cp.cp_edx;
1909                 regs[3] = 0;
1910                 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1911                     cp.cp_eax >= (base + 2)) {
1912                         platform_type &= ~HW_NATIVE;
1913                         platform_type |= HW_XEN_HVM;
1914                         return;
1915                 }
1916         }
1917 }
1918 
1919 int
1920 get_hwenv(void)
1921 {
1922         ASSERT(platform_type != -1);
1923         return (platform_type);
1924 }
1925 
1926 int
1927 is_controldom(void)
1928 {
1929         return (0);
1930 }
1931 
1932 #else
1933 
1934 int
1935 get_hwenv(void)
1936 {
1937         return (HW_XEN_PV);
1938 }
1939 
1940 int
1941 is_controldom(void)
1942 {
1943         return (DOMAIN_IS_INITDOMAIN(xen_info));
1944 }
1945 
1946 #endif  /* __xpv */
1947 
1948 /*
1949  * Make sure that we have gathered all of the CPUID leaves that we might need to
1950  * determine topology. We assume that the standard leaf 1 has already been done
1951  * and that xmaxeax has already been calculated.
1952  */
1953 static void
1954 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1955 {
1956         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1957 
1958         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1959                 struct cpuid_regs *cp;
1960 
1961                 cp = &cpi->cpi_extd[8];
1962                 cp->cp_eax = CPUID_LEAF_EXT_8;
1963                 (void) __cpuid_insn(cp);
1964                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1965         }
1966 
1967         if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1968             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1969                 struct cpuid_regs *cp;
1970 
1971                 cp = &cpi->cpi_extd[0x1e];
1972                 cp->cp_eax = CPUID_LEAF_EXT_1e;
1973                 (void) __cpuid_insn(cp);
1974         }
1975 }
1976 
1977 /*
1978  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1979  * it to everything else. If not, and we're on an AMD system where 8000001e is
 * valid, then we use that. Otherwise, we fall back to the default value for the
1981  * APIC ID in leaf 1.
1982  */
1983 static uint32_t
1984 cpuid_gather_apicid(struct cpuid_info *cpi)
1985 {
1986         /*
         * Leaf B changes based on the arguments to it. Because we don't cache
1988          * it, we need to gather it again.
1989          */
1990         if (cpi->cpi_maxeax >= 0xB) {
1991                 struct cpuid_regs regs;
1992                 struct cpuid_regs *cp;
1993 
1994                 cp = &regs;
1995                 cp->cp_eax = 0xB;
1996                 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1997                 (void) __cpuid_insn(cp);
1998 
1999                 if (cp->cp_ebx != 0) {
2000                         return (cp->cp_edx);
2001                 }
2002         }
2003 
2004         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2005             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2006             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2007                 return (cpi->cpi_extd[0x1e].cp_eax);
2008         }
2009 
2010         return (CPI_APIC_ID(cpi));
2011 }
2012 
2013 /*
2014  * For AMD processors, attempt to calculate the number of chips and cores that
2015  * exist. The way that we do this varies based on the generation, because the
2016  * generations themselves have changed dramatically.
2017  *
2018  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2019  * However, with the advent of family 17h (Zen) it actually tells us the number
2020  * of threads, so we need to look at leaf 0x8000001e if available to determine
2021  * its value. Otherwise, for all prior families, the number of enabled cores is
2022  * the same as threads.
2023  *
2024  * If we do not have leaf 0x80000008, then we assume that this processor does
2025  * not have anything. AMD's older CPUID specification says there's no reason to
2026  * fall back to leaf 1.
2027  *
2028  * In some virtualization cases we will not have leaf 8000001e or it will be
2029  * zero. When that happens we assume the number of threads is one.
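 *
 * As an illustrative example: a family 17h part reporting 15 in leaf
 * 0x80000008 %ecx[7:0] has 16 threads; if leaf 0x8000001e %ebx[15:8] reports 1
 * (two threads per core), we derive 8 cores.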
2030  */
2031 static void
2032 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2033 {
2034         uint_t nthreads, nthread_per_core;
2035 
2036         nthreads = nthread_per_core = 1;
2037 
2038         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2039                 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2040         } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2041                 nthreads = CPI_CPU_COUNT(cpi);
2042         }
2043 
2044         /*
2045          * For us to have threads, and know about it, we have to be at least at
2046          * family 17h and have the cpuid bit that says we have extended
2047          * topology.
2048          */
2049         if (cpi->cpi_family >= 0x17 &&
2050             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2051             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2052                 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2053         }
2054 
2055         *ncpus = nthreads;
2056         *ncores = nthreads / nthread_per_core;
2057 }
2058 
2059 /*
2060  * Seed the initial values for the cores and threads for an Intel based
2061  * processor. These values will be overwritten if we detect that the processor
2062  * supports CPUID leaf 0xb.
2063  */
2064 static void
2065 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2066 {
2067         /*
2068          * Only seed the number of physical cores from the first level leaf 4
2069          * information. The number of threads there indicate how many share the
2070          * L1 cache, which may or may not have anything to do with the number of
2071          * logical CPUs per core.
2072          */
2073         if (cpi->cpi_maxeax >= 4) {
2074                 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2075         } else {
2076                 *ncores = 1;
2077         }
2078 
2079         if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2080                 *ncpus = CPI_CPU_COUNT(cpi);
2081         } else {
2082                 *ncpus = *ncores;
2083         }
2084 }
2085 
2086 static boolean_t
2087 cpuid_leafB_getids(cpu_t *cpu)
2088 {
2089         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2090         struct cpuid_regs regs;
2091         struct cpuid_regs *cp;
2092 
2093         if (cpi->cpi_maxeax < 0xB)
2094                 return (B_FALSE);
2095 
2096         cp = &regs;
2097         cp->cp_eax = 0xB;
2098         cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2099 
2100         (void) __cpuid_insn(cp);
2101 
2102         /*
2103          * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2104          * indicates that the extended topology enumeration leaf is
2105          * available.
2106          */
2107         if (cp->cp_ebx != 0) {
2108                 uint32_t x2apic_id = 0;
2109                 uint_t coreid_shift = 0;
2110                 uint_t ncpu_per_core = 1;
2111                 uint_t chipid_shift = 0;
2112                 uint_t ncpu_per_chip = 1;
2113                 uint_t i;
2114                 uint_t level;
2115 
2116                 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2117                         cp->cp_eax = 0xB;
2118                         cp->cp_ecx = i;
2119 
2120                         (void) __cpuid_insn(cp);
2121                         level = CPI_CPU_LEVEL_TYPE(cp);
2122 
2123                         if (level == 1) {
2124                                 x2apic_id = cp->cp_edx;
2125                                 coreid_shift = BITX(cp->cp_eax, 4, 0);
2126                                 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2127                         } else if (level == 2) {
2128                                 x2apic_id = cp->cp_edx;
2129                                 chipid_shift = BITX(cp->cp_eax, 4, 0);
2130                                 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2131                         }
2132                 }
2133 
2134                 /*
2135                  * cpi_apicid is taken care of in cpuid_gather_apicid.
2136                  */
2137                 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2138                 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2139                     ncpu_per_core;
2140                 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2141                 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2142                 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2143                 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2144                 cpi->cpi_procnodeid = cpi->cpi_chipid;
2145                 cpi->cpi_compunitid = cpi->cpi_coreid;
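                /*
                 * Illustrative example (hypothetical values): with two
                 * threads per core (coreid_shift = 1) and 16 logical CPUs
                 * per chip (chipid_shift = 4), an x2apic_id of 0x13 yields
                 * chipid 1, clogid 3, coreid 0x9, and pkgcoreid 1.
                 */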
2146 
2147                 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2148                         cpi->cpi_nthread_bits = coreid_shift;
2149                         cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2150                 }
2151 
2152                 return (B_TRUE);
2153         } else {
2154                 return (B_FALSE);
2155         }
2156 }
2157 
2158 static void
2159 cpuid_intel_getids(cpu_t *cpu, void *feature)
2160 {
2161         uint_t i;
2162         uint_t chipid_shift = 0;
2163         uint_t coreid_shift = 0;
2164         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2165 
2166         /*
2167          * There are no compute units or processor nodes currently on Intel.
2168          * Always set these to one.
2169          */
2170         cpi->cpi_procnodes_per_pkg = 1;
2171         cpi->cpi_cores_per_compunit = 1;
2172 
2173         /*
2174          * If cpuid Leaf B is present, use that to try and get this information.
2175          * It will be the most accurate for Intel CPUs.
2176          */
2177         if (cpuid_leafB_getids(cpu))
2178                 return;
2179 
2180         /*
2181          * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2182          * and ncore_per_chip. These represent the largest power of two values
2183          * that we need to cover all of the IDs in the system. Therefore, we use
2184          * those values to seed the number of bits needed to cover information
2185          * in the case when leaf B is not available. These values will probably
2186          * be larger than required, but that's OK.
2187          */
2188         cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2189         cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2190 
2191         for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2192                 chipid_shift++;
2193 
2194         cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2195         cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2196 
2197         if (is_x86_feature(feature, X86FSET_CMP)) {
2198                 /*
2199                  * Multi-core (and possibly multi-threaded)
2200                  * processors.
2201                  */
2202                 uint_t ncpu_per_core;
2203                 if (cpi->cpi_ncore_per_chip == 1)
2204                         ncpu_per_core = cpi->cpi_ncpu_per_chip;
2205                 else if (cpi->cpi_ncore_per_chip > 1)
2206                         ncpu_per_core = cpi->cpi_ncpu_per_chip /
2207                             cpi->cpi_ncore_per_chip;
2208                 /*
2209                  * 8bit APIC IDs on dual core Pentiums
2210                  * look like this:
2211                  *
2212                  * +-----------------------+------+------+
2213                  * | Physical Package ID   |  MC  |  HT  |
2214                  * +-----------------------+------+------+
2215                  * <------- chipid -------->
2216                  * <------- coreid --------------->
2217                  *                         <--- clogid -->
2218                  *                         <------>
2219                  *                         pkgcoreid
2220                  *
2221                  * Where the number of bits necessary to
2222                  * represent MC and HT fields together equals
2223                  * to the minimum number of bits necessary to
2224                  * store the value of cpi->cpi_ncpu_per_chip.
2225                  * Of those bits, the MC part uses the number
2226                  * of bits necessary to store the value of
2227                  * cpi->cpi_ncore_per_chip.
2228                  */
2229                 for (i = 1; i < ncpu_per_core; i <<= 1)
2230                         coreid_shift++;
2231                 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2232                 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2233         } else if (is_x86_feature(feature, X86FSET_HTT)) {
2234                 /*
2235                  * Single-core multi-threaded processors.
2236                  */
2237                 cpi->cpi_coreid = cpi->cpi_chipid;
2238                 cpi->cpi_pkgcoreid = 0;
2239         } else {
2240                 /*
2241                  * Single-core single-thread processors.
2242                  */
2243                 cpi->cpi_coreid = cpu->cpu_id;
2244                 cpi->cpi_pkgcoreid = 0;
2245         }
2246         cpi->cpi_procnodeid = cpi->cpi_chipid;
2247         cpi->cpi_compunitid = cpi->cpi_coreid;
2248 }
2249 
2250 /*
2251  * Historically, AMD has had CMP chips with only a single thread per core.
2252  * However, starting in family 17h (Zen), this has changed and they now have
2253  * multiple threads. Our internal core id needs to be a unique value.
2254  *
2255  * To determine the core id of an AMD system, if we're from a family before 17h,
2256  * then we just use the cpu id, as that gives us a good value that will be
2257  * unique for each core. If instead, we're on family 17h or later, then we need
 * to do something more complicated. CPUID leaf 0x8000001e can tell us how many
 * threads share a core. Based on that, we'll shift the APIC ID.
2260  * We can't use the normal core id in that leaf as it's only unique within the
2261  * socket, which is perfect for cpi_pkgcoreid, but not us.
2262  */
2263 static id_t
2264 cpuid_amd_get_coreid(cpu_t *cpu)
2265 {
2266         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2267 
2268         if (cpi->cpi_family >= 0x17 &&
2269             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2270             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2271                 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2272                 if (nthreads > 1) {
2273                         VERIFY3U(nthreads, ==, 2);
2274                         return (cpi->cpi_apicid >> 1);
2275                 }
2276         }
2277 
2278         return (cpu->cpu_id);
2279 }
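     /*
      * For example (hypothetical values): on a family 17h part with SMT
      * enabled, nthreads is 2 and the low APIC ID bit distinguishes the two
      * siblings, so APIC IDs 0x8 and 0x9 both yield core id 0x4. On an older
      * family, the two logical CPUs would instead simply use their cpu_t
      * cpu_id values.
      */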
2280 
2281 /*
2282  * Constructing IDs on AMD is a more challenging task. This is notable
2283  * because of the following two facts:
2284  *
2285  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2286  *     also no way to get an actual unique core id from the system. As such, we
2287  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2288  *     however, guarantee that sibling cores of a chip will have sequential
2289  *     coreids starting at a multiple of the number of cores per chip - that is
2290  *     usually the case, but if the ACPI MADT table is presented in a different
2291  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2292  *
2293  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2294  *     called compute units. These compute units share the L1I cache, L2 cache,
2295  *     and the FPU. To deal with this, a new topology leaf, 0x8000001e, was
2296  *     added. However, parts of this leaf have different meanings
2297  *     once we get to family 0x17.
2298  */
2299 
2300 static void
2301 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2302 {
2303         int i, first_half, coreidsz;
2304         uint32_t nb_caps_reg;
2305         uint_t node2_1;
2306         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2307         struct cpuid_regs *cp;
2308 
2309         /*
2310          * Calculate the core id (this comes from hardware in family 0x17 if it
2311          * hasn't been stripped by virtualization). We always set the compute
2312          * unit id to the same value. Also, initialize the default number of
2313          * cores per compute unit and nodes per package. This will be
2314          * overwritten when we know information about a particular family.
2315          */
2316         cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2317         cpi->cpi_compunitid = cpi->cpi_coreid;
2318         cpi->cpi_cores_per_compunit = 1;
2319         cpi->cpi_procnodes_per_pkg = 1;
2320 
2321         /*
2322          * To construct the logical ID, we need to determine how many APIC IDs
2323          * are dedicated to the cores and threads. This is provided for us in
2324          * 0x80000008. However, if it's not present (say due to virtualization),
2325  * then we assume a one-bit size (a single-core part). This leaf should
2326  * exist on all 64-bit AMD processors (added in family 0xf, Hammer).
2327          */
2328         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2329                 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2330 
2331                 /*
2332                  * In AMD parlance, a chip is really a node, while
2333                  * illumos uses chip as equivalent to socket/package.
2334                  */
2335                 if (coreidsz == 0) {
2336                         /* Use legacy method */
2337                         for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2338                                 coreidsz++;
2339                         if (coreidsz == 0)
2340                                 coreidsz = 1;
2341                 }
2342         } else {
2343                 /* Assume single-core part */
2344                 coreidsz = 1;
2345         }
2346         cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
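         /*
          * As a worked example (hypothetical values): a part reporting an
          * ApicIdCoreIdSize of 4 in CPUID_LEAF_EXT_8 %ecx[15:12] has
          * coreidsz = 4, so an APIC ID of 0x13 yields
          * clogid = 0x13 & 0xf = 3.
          */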
2347 
2348         /*
2349          * The package core ID varies depending on the family. While it may be
2350          * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2351          * this value is the core id in the given node. For non-virtualized
2352          * family 17h, we need to take the logical core id and shift off the
2353          * threads like we do when getting the core id.  Otherwise, we can use
2354  * the clogid as is. When family 17h is virtualized and the leaf lacks
2355  * valid data, we won't believe we have SMT, in which case the clogid
2356  * alone is sufficient.
2358          */
2359         if (cpi->cpi_family >= 0x17 &&
2360             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2361             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2362             cpi->cpi_extd[0x1e].cp_ebx != 0) {
2363                 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2364                 if (nthreads > 1) {
2365                         VERIFY3U(nthreads, ==, 2);
2366                         cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2367                 } else {
2368                         cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2369                 }
2370         } else {
2371                 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2372         }
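         /*
          * Continuing the example above: with SMT (nthreads == 2), a clogid
          * of 3 becomes pkgcoreid 3 >> 1 = 1; without SMT, the pkgcoreid is
          * simply the clogid.
          */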
2373 
2374         /*
2375          * Obtain the node ID and compute unit IDs. If we're on family 0x15
2376  * (Bulldozer) or newer, then we can derive all of this from leaf
2377          * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2378          */
2379         if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2380             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2381                 cp = &cpi->cpi_extd[0x1e];
2382 
2383                 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2384                 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2385 
2386                 /*
2387                  * For Bulldozer-era CPUs, recalculate the compute unit
2388                  * information.
2389                  */
2390                 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2391                         cpi->cpi_cores_per_compunit =
2392                             BITX(cp->cp_ebx, 15, 8) + 1;
2393                         cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2394                             (cpi->cpi_ncore_per_chip /
2395                             cpi->cpi_cores_per_compunit) *
2396                             (cpi->cpi_procnodeid /
2397                             cpi->cpi_procnodes_per_pkg);
2398                 }
2399         } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2400                 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2401         } else if (cpi->cpi_family == 0x10) {
2402                 /*
2403                  * See if we are a multi-node processor.
2404                  * All processors in the system have the same number of nodes.
2405                  */
2406                 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2407                 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2408                         /* Single-node */
2409                         cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2410                             coreidsz);
2411                 } else {
2413                         /*
2414                          * Multi-node revision D (2 nodes per package
2415                          * are supported)
2416                          */
2417                         cpi->cpi_procnodes_per_pkg = 2;
2418 
2419                         first_half = (cpi->cpi_pkgcoreid <=
2420                             (cpi->cpi_ncore_per_chip/2 - 1));
2421 
2422                         if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2423                                 /* We are BSP */
2424                                 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2425                         } else {
2427                                 /* We are AP */
2428                                 /* NodeId[2:1] bits to use for reading F3xe8 */
2429                                 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2430 
2431                                 nb_caps_reg =
2432                                     pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2433 
2434                                 /*
2435                                  * Check IntNodeNum bit (31:30, but bit 31 is
2436                                  * always 0 on dual-node processors)
2437                                  */
2438                                 if (BITX(nb_caps_reg, 30, 30) == 0)
2439                                         cpi->cpi_procnodeid = node2_1 +
2440                                             !first_half;
2441                                 else
2442                                         cpi->cpi_procnodeid = node2_1 +
2443                                             first_half;
2444                         }
2445                 }
2446         } else {
2447                 cpi->cpi_procnodeid = 0;
2448         }
2449 
2450         cpi->cpi_chipid =
2451             cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2452 
2453         cpi->cpi_ncore_bits = coreidsz;
2454         cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2455             cpi->cpi_ncore_per_chip);
2456 }
2457 
2458 static void
2459 spec_uarch_flush_noop(void)
2460 {
2461 }
2462 
2463 /*
2464  * When microcode is present that mitigates MDS, this wrmsr will also flush
2465  * the MDS-related micro-architectural state that would normally be flushed
2466  * by calling x86_md_clear().
2467  */
2468 static void
2469 spec_uarch_flush_msr(void)
2470 {
2471         wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2472 }
2473 
2474 /*
2475  * This function pointer refers to a function that will flush certain
2476  * micro-architectural state on the processor. This flush is used to mitigate
2477  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. The
2478  * pointer can refer to one of three functions:
2479  *
2480  * - A noop, which we use either because we are not vulnerable, or because we
2481  *   are vulnerable but have no microcode available to help deal with the
2482  *   problem.
2483  *
2484  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2485  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2486  *   however, it only flushes the MDS related micro-architectural state on the
2487  *   current hyperthread, it does not do anything for the twin.
2488  *
2489  * - x86_md_clear which will flush the MDS related state. This is done when we
2490  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2491  *   (RDCL_NO is set).
2492  */
2493 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
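     /*
      * The pointer is installed by cpuid_update_l1d_flush() below once the
      * boot CPU's feature set is known; callers elsewhere in the kernel are
      * expected to invoke it, via spec_uarch_flush(), at the transition
      * points described above.
      */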
2494 
2495 static void
2496 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2497 {
2498         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2499 
2500         /*
2501          * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2502          * has been fixed in hardware, it doesn't cover everything related to
2503          * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2504          * need to mitigate this.
2505          */
2506         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2507             is_x86_feature(featureset, X86FSET_MDS_NO)) {
2508                 return;
2509         }
2510 
2511         if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2512                 const uint8_t nop = NOP_INSTR;
2513                 uint8_t *md = (uint8_t *)x86_md_clear;
2514 
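                 /*
                  * Enable x86_md_clear by patching its first instruction.
                  * In the default (disabled) implementation the entry
                  * instruction is assumed to be a ret; replacing it with a
                  * nop allows the verw sequence that follows to execute.
                  */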
2515                 *md = nop;
2516         }
2517 
2518         membar_producer();
2519 }
2520 
2521 static void
2522 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2523 {
2524         boolean_t need_l1d, need_mds;
2525         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2526 
2527         /*
2528          * If we're not on Intel or we've mitigated both RDCL and MDS in
2529          * hardware, then there's nothing left for us to do for enabling the
2530          * flush. We can also go ahead and say that SMT exclusion is
2531          * unnecessary.
2532          */
2533         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2534             (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2535             is_x86_feature(featureset, X86FSET_MDS_NO))) {
2536                 extern int smt_exclusion;
2537                 smt_exclusion = 0;
2538                 spec_uarch_flush = spec_uarch_flush_noop;
2539                 membar_producer();
2540                 return;
2541         }
2542 
2543         /*
2544          * The locations where we need to perform an L1D flush are required both
2545          * for mitigating L1TF and MDS. When verw support is present in
2546          * microcode, then the L1D flush will take care of doing that as well.
2547          * However, if we have a system where RDCL_NO is present, but we don't
2548          * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2549          * L1D flush.
2550          */
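         /*
          * In tabular form, the selection below reduces to the following,
          * where '-' means the bit does not matter and the first matching
          * row wins:
          *
          *  RDCL_NO FLUSH_CMD L1D_VM_NO MDS_NO MD_CLEAR => spec_uarch_flush
          *  0       1         0         -      -           spec_uarch_flush_msr
          *  -       -         -         0      1           x86_md_clear
          *  (anything else)                                spec_uarch_flush_noop
          */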
2551         if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2552             is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2553             !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2554                 need_l1d = B_TRUE;
2555         } else {
2556                 need_l1d = B_FALSE;
2557         }
2558 
2559         if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2560             is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2561                 need_mds = B_TRUE;
2562         } else {
2563                 need_mds = B_FALSE;
2564         }
2565 
2566         if (need_l1d) {
2567                 spec_uarch_flush = spec_uarch_flush_msr;
2568         } else if (need_mds) {
2569                 spec_uarch_flush = x86_md_clear;
2570         } else {
2571                 /*
2572                  * We have no hardware mitigations available to us.
2573                  */
2574                 spec_uarch_flush = spec_uarch_flush_noop;
2575         }
2576         membar_producer();
2577 }
2578 
2579 /*
2580  * We default to enabling RSB mitigations.
2581  */
2582 static void
2583 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2584 {
2585         const uint8_t ret = RET_INSTR;
2586         uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2587 
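         /*
          * Writing a ret over the first byte of x86_rsb_stuff turns it into
          * an immediate return, disabling RSB stuffing when enhanced IBRS is
          * in use or when the mitigation is disabled entirely.
          */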
2588         switch (mit) {
2589         case X86_SPECTREV2_ENHANCED_IBRS:
2590         case X86_SPECTREV2_DISABLED:
2591                 *stuff = ret;
2592                 break;
2593         default:
2594                 break;
2595         }
2596 }
2597 
2598 static void
2599 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2600 {
2601         const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2602             "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2603             "_r14", "_r15" };
2604         const uint_t nthunks = ARRAY_SIZE(thunks);
2605         const char *type;
2606         uint_t i;
2607 
2608         if (mit == x86_spectrev2_mitigation)
2609                 return;
2610 
2611         switch (mit) {
2612         case X86_SPECTREV2_RETPOLINE:
2613                 type = "gen";
2614                 break;
2615         case X86_SPECTREV2_RETPOLINE_AMD:
2616                 type = "amd";
2617                 break;
2618         case X86_SPECTREV2_ENHANCED_IBRS:
2619         case X86_SPECTREV2_DISABLED:
2620                 type = "jmp";
2621                 break;
2622         default:
2623                 panic("asked to update retpoline state with unknown state!");
2624         }
2625 
2626         for (i = 0; i < nthunks; i++) {
2627                 uintptr_t source, dest;
2628                 int ssize, dsize;
2629                 char sourcebuf[64], destbuf[64];
2630                 size_t len;
2631 
2632                 (void) snprintf(destbuf, sizeof (destbuf),
2633                     "__x86_indirect_thunk%s", thunks[i]);
2634                 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2635                     "__x86_indirect_thunk_%s%s", type, thunks[i]);
2636 
2637                 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2638                 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2639                 VERIFY3U(source, !=, 0);
2640                 VERIFY3U(dest, !=, 0);
2641                 VERIFY3S(dsize, >=, ssize);
2642                 bcopy((void *)source, (void *)dest, ssize);
2643         }
2644 }
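     /*
      * To make the patching above concrete with one (assumed) case: with
      * mit == X86_SPECTREV2_RETPOLINE_AMD, the text of
      * __x86_indirect_thunk_amd_rax is copied over __x86_indirect_thunk_rax,
      * and likewise for each of the other registers in thunks[].
      */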
2645 
2646 static void
2647 cpuid_enable_enhanced_ibrs(void)
2648 {
2649         uint64_t val;
2650 
2651         val = rdmsr(MSR_IA32_SPEC_CTRL);
2652         val |= IA32_SPEC_CTRL_IBRS;
2653         wrmsr(MSR_IA32_SPEC_CTRL, val);
2654 }
2655 
2656 #ifndef __xpv
2657 /*
2658  * Determine whether or not we can use the AMD optimized retpoline
2659  * functionality. We use this when we know we're on an AMD system and we can
2660  * successfully verify that lfence is dispatch serializing.
2661  */
2662 static boolean_t
2663 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2664 {
2665         uint64_t val;
2666         on_trap_data_t otd;
2667 
2668         if (cpi->cpi_vendor != X86_VENDOR_AMD)
2669                 return (B_FALSE);
2670 
2671         /*
2672          * We need to determine whether or not lfence is serializing. It always
2673          * is on families 0xf and 0x11. On others, it's controlled by
2674          * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2675  * crazy old family, don't try to do anything.
2676          */
2677         if (cpi->cpi_family < 0xf)
2678                 return (B_FALSE);
2679         if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2680                 return (B_TRUE);
2681 
2682         /*
2683          * While it may be tempting to use get_hwenv(), there are no promises
2684  * that a hypervisor will actually declare itself to be so in a
2685          * friendly way. As such, try to read and set the MSR. If we can then
2686          * read back the value we set (it wasn't just set to zero), then we go
2687          * for it.
2688          */
2689         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2690                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2691                 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2692                 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2693                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2694         } else {
2695                 val = 0;
2696         }
2697         no_trap();
2698 
2699         if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2700                 return (B_TRUE);
2701         return (B_FALSE);
2702 }
2703 #endif  /* !__xpv */
2704 
2705 static void
2706 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2707 {
2708         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2709         x86_spectrev2_mitigation_t v2mit;
2710 
2711         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2712             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2713                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2714                         add_x86_feature(featureset, X86FSET_IBPB);
2715                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2716                         add_x86_feature(featureset, X86FSET_IBRS);
2717                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2718                         add_x86_feature(featureset, X86FSET_STIBP);
2719                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2720                         add_x86_feature(featureset, X86FSET_STIBP_ALL);
2721                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2722                         add_x86_feature(featureset, X86FSET_SSBD);
2723                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2724                         add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2725                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2726                         add_x86_feature(featureset, X86FSET_SSB_NO);
2727                 /*
2728                  * Don't enable enhanced IBRS unless we're told that we should
2729                  * prefer it and it has the same semantics as Intel's. This is
2730                  * split into two bits rather than a single one.
2731                  */
2732                 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2733                     (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2734                         add_x86_feature(featureset, X86FSET_IBRS_ALL);
2735                 }
2736 
2737         } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2738             cpi->cpi_maxeax >= 7) {
2739                 struct cpuid_regs *ecp;
2740                 ecp = &cpi->cpi_std[7];
2741 
2742                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2743                         add_x86_feature(featureset, X86FSET_MD_CLEAR);
2744                 }
2745 
2746                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2747                         add_x86_feature(featureset, X86FSET_IBRS);
2748                         add_x86_feature(featureset, X86FSET_IBPB);
2749                 }
2750 
2751                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2752                         add_x86_feature(featureset, X86FSET_STIBP);
2753                 }
2754 
2755                 /*
2756                  * Don't read the arch caps MSR on xpv where we lack the
2757                  * on_trap().
2758                  */
2759 #ifndef __xpv
2760                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2761                         on_trap_data_t otd;
2762 
2763                         /*
2764                          * Be paranoid and assume we'll get a #GP.
2765                          */
2766                         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2767                                 uint64_t reg;
2768 
2769                                 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2770                                 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2771                                         add_x86_feature(featureset,
2772                                             X86FSET_RDCL_NO);
2773                                 }
2774                                 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2775                                         add_x86_feature(featureset,
2776                                             X86FSET_IBRS_ALL);
2777                                 }
2778                                 if (reg & IA32_ARCH_CAP_RSBA) {
2779                                         add_x86_feature(featureset,
2780                                             X86FSET_RSBA);
2781                                 }
2782                                 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2783                                         add_x86_feature(featureset,
2784                                             X86FSET_L1D_VM_NO);
2785                                 }
2786                                 if (reg & IA32_ARCH_CAP_SSB_NO) {
2787                                         add_x86_feature(featureset,
2788                                             X86FSET_SSB_NO);
2789                                 }
2790                                 if (reg & IA32_ARCH_CAP_MDS_NO) {
2791                                         add_x86_feature(featureset,
2792                                             X86FSET_MDS_NO);
2793                                 }
2794                         }
2795                         no_trap();
2796                 }
2797 #endif  /* !__xpv */
2798 
2799                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2800                         add_x86_feature(featureset, X86FSET_SSBD);
2801 
2802                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2803                         add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2804         }
2805 
2806         if (cpu->cpu_id != 0) {
2807                 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2808                         cpuid_enable_enhanced_ibrs();
2809                 }
2810                 return;
2811         }
2812 
2813         /*
2814          * Go through and initialize the various security mechanisms whose setup
2815          * should only happen on one CPU. This includes Spectre V2, L1TF, and MDS.
2816          */
2817 
2818         /*
2819          * By default we've come in with retpolines enabled. Check whether we
2820          * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2821  * by default, but disabled under enhanced IBRS or when mitigations are off.
2822          */
2823         if (x86_disable_spectrev2 != 0) {
2824                 v2mit = X86_SPECTREV2_DISABLED;
2825         } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2826                 cpuid_enable_enhanced_ibrs();
2827                 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2828 #ifndef __xpv
2829         } else if (cpuid_use_amd_retpoline(cpi)) {
2830                 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2831 #endif  /* !__xpv */
2832         } else {
2833                 v2mit = X86_SPECTREV2_RETPOLINE;
2834         }
2835 
2836         cpuid_patch_retpolines(v2mit);
2837         cpuid_patch_rsb(v2mit);
2838         x86_spectrev2_mitigation = v2mit;
2839         membar_producer();
2840 
2841         /*
2842          * We need to determine what changes are required for mitigating L1TF
2843          * and MDS. If the CPU suffers from either of them, then SMT exclusion
2844          * is required.
2845          *
2846          * If any of these are present, then we need to flush u-arch state at
2847          * various points. For MDS, we need to do so whenever we change to a
2848          * lesser privilege level or we are halting the CPU. For L1TF we need to
2849          * flush the L1D cache at VM entry. When we have microcode that handles
2850  * MDS, the L1D flush also clears the other u-arch state that
2851  * md_clear does.
2852          */
2853 
2854         /*
2855          * Update whether or not we need to be taking explicit action against
2856          * MDS.
2857          */
2858         cpuid_update_md_clear(cpu, featureset);
2859 
2860         /*
2861          * Determine whether SMT exclusion is required and whether or not we
2862          * need to perform an l1d flush.
2863          */
2864         cpuid_update_l1d_flush(cpu, featureset);
2865 }
2866 
2867 /*
2868  * Set up the XFeature_Enabled_Mask register. Required by the xsave feature.
2869  */
2870 void
2871 setup_xfem(void)
2872 {
2873         uint64_t flags = XFEATURE_LEGACY_FP;
2874 
2875         ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2876 
2877         if (is_x86_feature(x86_featureset, X86FSET_SSE))
2878                 flags |= XFEATURE_SSE;
2879 
2880         if (is_x86_feature(x86_featureset, X86FSET_AVX))
2881                 flags |= XFEATURE_AVX;
2882 
2883         if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2884                 flags |= XFEATURE_AVX512;
2885 
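         /*
          * For example, on a CPU with XSAVE, SSE, and AVX but no AVX-512,
          * flags at this point would be XFEATURE_LEGACY_FP | XFEATURE_SSE |
          * XFEATURE_AVX (0x7 in the conventional XCR0 layout, where x87 is
          * bit 0, SSE is bit 1, and AVX is bit 2).
          */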
2886         set_xcr(XFEATURE_ENABLED_MASK, flags);
2887 
2888         xsave_bv_all = flags;
2889 }
2890 
2891 static void
2892 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2893 {
2894         struct cpuid_info *cpi;
2895 
2896         cpi = cpu->cpu_m.mcpu_cpi;
2897 
2898         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2899                 cpuid_gather_amd_topology_leaves(cpu);
2900         }
2901 
2902         cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2903 
2904         /*
2905          * Before we can calculate the IDs that we should assign to this
2906          * processor, we need to understand how many cores and threads it has.
2907          */
2908         switch (cpi->cpi_vendor) {
2909         case X86_VENDOR_Intel:
2910                 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2911                     &cpi->cpi_ncore_per_chip);
2912                 break;
2913         case X86_VENDOR_AMD:
2914                 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2915                     &cpi->cpi_ncore_per_chip);
2916                 break;
2917         default:
2918                 /*
2919                  * If we have some other x86-compatible chip, it's not clear how
2920                  * it would behave. The most common case is virtualization
2921                  * today, though there are also 64-bit VIA chips. Assume that
2922                  * all we can get is the basic Leaf 1 HTT information.
2923                  */
2924                 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2925                         cpi->cpi_ncore_per_chip = 1;
2926                         cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2927                 }
2928                 break;
2929         }
2930 
2931         /*
2932          * Based on the calculated number of threads and cores, potentially
2933          * assign the HTT and CMP features.
2934          */
2935         if (cpi->cpi_ncore_per_chip > 1) {
2936                 add_x86_feature(featureset, X86FSET_CMP);
2937         }
2938 
2939         if (cpi->cpi_ncpu_per_chip > 1 &&
2940             cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2941                 add_x86_feature(featureset, X86FSET_HTT);
2942         }
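         /*
          * For example (hypothetical counts): a chip reporting 8 cores and 16
          * threads gets both X86FSET_CMP and X86FSET_HTT; one reporting 4
          * cores and 4 threads gets only X86FSET_CMP; a single-core,
          * single-thread chip gets neither.
          */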
2943 
2944         /*
2945          * Now that this has been set up, we need to go through and calculate
2946          * the rest of the topology parameters. If we think the CPU doesn't
2947          * have either SMT (HTT) or CMP, then we basically go through and fake
2948          * up information in some way. The most likely case for this is
2949          * virtualization, where we often have only partial topology information.
2950          */
2951         if (!is_x86_feature(featureset, X86FSET_HTT) &&
2952             !is_x86_feature(featureset, X86FSET_CMP)) {
2953                 /*
2954                  * This is a single core, single-threaded processor.
2955                  */
2956                 cpi->cpi_procnodes_per_pkg = 1;
2957                 cpi->cpi_cores_per_compunit = 1;
2958                 cpi->cpi_compunitid = 0;
2959                 cpi->cpi_chipid = -1;
2960                 cpi->cpi_clogid = 0;
2961                 cpi->cpi_coreid = cpu->cpu_id;
2962                 cpi->cpi_pkgcoreid = 0;
2963                 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2964                         cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2965                 } else {
2966                         cpi->cpi_procnodeid = cpi->cpi_chipid;
2967                 }
2968         } else {
2969                 switch (cpi->cpi_vendor) {
2970                 case X86_VENDOR_Intel:
2971                         cpuid_intel_getids(cpu, featureset);
2972                         break;
2973                 case X86_VENDOR_AMD:
2974                         cpuid_amd_getids(cpu, featureset);
2975                         break;
2976                 default:
2977                         /*
2978                          * In this case, it's hard to say what we should do.
2979                          * We model them to the OS as single-core threads. We
2980                          * don't have a good identifier for them, so we just
2981                          * use the cpu id, with everything on a single chip.
2983                          *
2984                          * This case has historically been different from the
2985                          * case above where we don't have HTT or CMP. While they
2986                          * could be combined, we've opted to keep it separate to
2987                          * minimize the risk of topology changes in weird cases.
2988                          */
2989                         cpi->cpi_procnodes_per_pkg = 1;
2990                         cpi->cpi_cores_per_compunit = 1;
2991                         cpi->cpi_chipid = 0;
2992                         cpi->cpi_coreid = cpu->cpu_id;
2993                         cpi->cpi_clogid = cpu->cpu_id;
2994                         cpi->cpi_pkgcoreid = cpu->cpu_id;
2995                         cpi->cpi_procnodeid = cpi->cpi_chipid;
2996                         cpi->cpi_compunitid = cpi->cpi_coreid;
2997                         break;
2998                 }
2999         }
3000 }
3001 
3002 /*
3003  * Gather relevant CPU features from leaf 6, which covers thermal
3004  * information. We always gather leaf 6 if it's supported; however, we only
3005  * look for features on Intel systems, as AMD does not currently define any
3006  * of the features we look for below.
3007  */
3008 static void
3009 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3010 {
3011         struct cpuid_regs *cp;
3012         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3013 
3014         if (cpi->cpi_maxeax < 6) {
3015                 return;
3016         }
3017 
3018         cp = &cpi->cpi_std[6];
3019         cp->cp_eax = 6;
3020         cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3021         (void) __cpuid_insn(cp);
3022         platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3023 
3024         if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3025                 return;
3026         }
3027 
3028         if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3029                 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3030         }
3031 
3032         if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3033                 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3034         }
3035 }
3036 
3037 void
3038 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3039 {
3040         uint32_t mask_ecx, mask_edx;
3041         struct cpuid_info *cpi;
3042         struct cpuid_regs *cp;
3043         int xcpuid;
3044 #if !defined(__xpv)
3045         extern int idle_cpu_prefer_mwait;
3046 #endif
3047 
3048         /*
3049          * Space is statically allocated for the BSP; ensure the pointer is set
3050          */
3051         if (cpu->cpu_id == 0) {
3052                 if (cpu->cpu_m.mcpu_cpi == NULL)
3053                         cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3054         }
3055 
3056         add_x86_feature(featureset, X86FSET_CPUID);
3057 
3058         cpi = cpu->cpu_m.mcpu_cpi;
3059         ASSERT(cpi != NULL);
3060         cp = &cpi->cpi_std[0];
3061         cp->cp_eax = 0;
3062         cpi->cpi_maxeax = __cpuid_insn(cp);
3063         {
3064                 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
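                 /*
                  * Note the %ebx, %edx, %ecx ordering: leaf 0 returns the
                  * vendor string in those registers, spelling out, e.g.,
                  * "GenuineIntel" or "AuthenticAMD".
                  */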
3065                 *iptr++ = cp->cp_ebx;
3066                 *iptr++ = cp->cp_edx;
3067                 *iptr++ = cp->cp_ecx;
3068                 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3069         }
3070 
3071         cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3072         x86_vendor = cpi->cpi_vendor; /* for compatibility */
3073 
3074         /*
3075          * Limit the range in case of weird hardware
3076          */
3077         if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3078                 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3079         if (cpi->cpi_maxeax < 1)
3080                 goto pass1_done;
3081 
3082         cp = &cpi->cpi_std[1];
3083         cp->cp_eax = 1;
3084         (void) __cpuid_insn(cp);
3085 
3086         /*
3087          * Extract identifying constants for easy access.
3088          */
3089         cpi->cpi_model = CPI_MODEL(cpi);
3090         cpi->cpi_family = CPI_FAMILY(cpi);
3091 
3092         if (cpi->cpi_family == 0xf)
3093                 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3094 
3095         /*
3096          * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3097          * Intel, and presumably everyone else, uses model == 0xf, as
3098          * one would expect (max value means possible overflow).  Sigh.
3099          */
3100 
3101         switch (cpi->cpi_vendor) {
3102         case X86_VENDOR_Intel:
3103                 if (IS_EXTENDED_MODEL_INTEL(cpi))
3104                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3105                 break;
3106         case X86_VENDOR_AMD:
3107                 if (CPI_FAMILY(cpi) == 0xf)
3108                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3109                 break;
3110         default:
3111                 if (cpi->cpi_model == 0xf)
3112                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3113                 break;
3114         }
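         /*
          * As a concrete example, an Intel part with base family 0x6, base
          * model 0x5, and extended model 0x5 ends up with cpi_model
          * 0x5 + (0x5 << 4) = 0x55 (the Skylake server model).
          */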
3115 
3116         cpi->cpi_step = CPI_STEP(cpi);
3117         cpi->cpi_brandid = CPI_BRANDID(cpi);
3118 
3119         /*
3120          * *default* assumptions:
3121          * - believe %edx feature word
3122          * - ignore %ecx feature word
3123          * - 32-bit virtual and physical addressing
3124          */
3125         mask_edx = 0xffffffff;
3126         mask_ecx = 0;
3127 
3128         cpi->cpi_pabits = cpi->cpi_vabits = 32;
3129 
3130         switch (cpi->cpi_vendor) {
3131         case X86_VENDOR_Intel:
3132                 if (cpi->cpi_family == 5)
3133                         x86_type = X86_TYPE_P5;
3134                 else if (IS_LEGACY_P6(cpi)) {
3135                         x86_type = X86_TYPE_P6;
3136                         pentiumpro_bug4046376 = 1;
3137                         /*
3138                          * Clear the SEP bit when it was set erroneously
3139                          */
3140                         if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3141                                 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3142                 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3143                         x86_type = X86_TYPE_P4;
3144                         /*
3145                          * We don't currently depend on any of the %ecx
3146                          * features until Prescott, so we'll only check
3147                          * this from P4 onwards.  We might want to revisit
3148                          * that idea later.
3149                          */
3150                         mask_ecx = 0xffffffff;
3151                 } else if (cpi->cpi_family > 0xf)
3152                         mask_ecx = 0xffffffff;
3153                 /*
3154                  * We don't support MONITOR/MWAIT if leaf 5 is not available
3155                  * to obtain the monitor linesize.
3156                  */
3157                 if (cpi->cpi_maxeax < 5)
3158                         mask_ecx &= ~CPUID_INTC_ECX_MON;
3159                 break;
3160         case X86_VENDOR_IntelClone:
3161         default:
3162                 break;
3163         case X86_VENDOR_AMD:
3164 #if defined(OPTERON_ERRATUM_108)
3165                 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3166                         cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3167                         cpi->cpi_model = 0xc;
3168                 } else
3169 #endif
3170                 if (cpi->cpi_family == 5) {
3171                         /*
3172                          * AMD K5 and K6
3173                          *
3174                          * These CPUs have an incomplete implementation
3175                          * of MCA/MCE which we mask away.
3176                          */
3177                         mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3178 
3179                         /*
3180                          * Model 0 uses the wrong (APIC) bit
3181                          * to indicate PGE.  Fix it here.
3182                          */
3183                         if (cpi->cpi_model == 0) {
3184                                 if (cp->cp_edx & 0x200) {
3185                                         cp->cp_edx &= ~0x200;
3186                                         cp->cp_edx |= CPUID_INTC_EDX_PGE;
3187                                 }
3188                         }
3189 
3190                         /*
3191                          * Early models had problems with MMX; disable it.
3192                          */
3193                         if (cpi->cpi_model < 6)
3194                                 mask_edx &= ~CPUID_INTC_EDX_MMX;
3195                 }
3196 
3197                 /*
3198                  * For newer families, SSE3 and CX16, at least, are valid;
3199                  * enable all.
3200                  */
3201                 if (cpi->cpi_family >= 0xf)
3202                         mask_ecx = 0xffffffff;
3203                 /*
3204                  * We don't support MONITOR/MWAIT if leaf 5 is not available
3205                  * to obtain the monitor linesize.
3206                  */
3207                 if (cpi->cpi_maxeax < 5)
3208                         mask_ecx &= ~CPUID_INTC_ECX_MON;
3209 
3210 #if !defined(__xpv)
3211                 /*
3212                  * AMD has not historically used MWAIT in the CPU's idle loop.
3213                  * Pre-family-10h Opterons do not have the MWAIT instruction. We
3214                  * know for certain that in at least family 17h, per AMD, mwait
3215                  * is preferred. For the families in between, we are less certain.
3216                  */
3217                 if (cpi->cpi_family < 0x17) {
3218                         idle_cpu_prefer_mwait = 0;
3219                 }
3220 #endif
3221 
3222                 break;
3223         case X86_VENDOR_TM:
3224                 /*
3225                  * Work around the NT workaround in CMS 4.1.
3226                  */
3227                 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3228                     (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3229                         cp->cp_edx |= CPUID_INTC_EDX_CX8;
3230                 break;
3231         case X86_VENDOR_Centaur:
3232                 /*
3233                  * Work around the NT workarounds again.
3234                  */
3235                 if (cpi->cpi_family == 6)
3236                         cp->cp_edx |= CPUID_INTC_EDX_CX8;
3237                 break;
3238         case X86_VENDOR_Cyrix:
3239                 /*
3240                  * We rely heavily on the probing in locore
3241                  * to actually figure out what parts, if any,
3242                  * of the Cyrix cpuid instruction to believe.
3243                  */
3244                 switch (x86_type) {
3245                 case X86_TYPE_CYRIX_486:
3246                         mask_edx = 0;
3247                         break;
3248                 case X86_TYPE_CYRIX_6x86:
3249                         mask_edx = 0;
3250                         break;
3251                 case X86_TYPE_CYRIX_6x86L:
3252                         mask_edx =
3253                             CPUID_INTC_EDX_DE |
3254                             CPUID_INTC_EDX_CX8;
3255                         break;
3256                 case X86_TYPE_CYRIX_6x86MX:
3257                         mask_edx =
3258                             CPUID_INTC_EDX_DE |
3259                             CPUID_INTC_EDX_MSR |
3260                             CPUID_INTC_EDX_CX8 |
3261                             CPUID_INTC_EDX_PGE |
3262                             CPUID_INTC_EDX_CMOV |
3263                             CPUID_INTC_EDX_MMX;
3264                         break;
3265                 case X86_TYPE_CYRIX_GXm:
3266                         mask_edx =
3267                             CPUID_INTC_EDX_MSR |
3268                             CPUID_INTC_EDX_CX8 |
3269                             CPUID_INTC_EDX_CMOV |
3270                             CPUID_INTC_EDX_MMX;
3271                         break;
3272                 case X86_TYPE_CYRIX_MediaGX:
3273                         break;
3274                 case X86_TYPE_CYRIX_MII:
3275                 case X86_TYPE_VIA_CYRIX_III:
3276                         mask_edx =
3277                             CPUID_INTC_EDX_DE |
3278                             CPUID_INTC_EDX_TSC |
3279                             CPUID_INTC_EDX_MSR |
3280                             CPUID_INTC_EDX_CX8 |
3281                             CPUID_INTC_EDX_PGE |
3282                             CPUID_INTC_EDX_CMOV |
3283                             CPUID_INTC_EDX_MMX;
3284                         break;
3285                 default:
3286                         break;
3287                 }
3288                 break;
3289         }
3290 
3291 #if defined(__xpv)
3292         /*
3293          * Do not support MONITOR/MWAIT under a hypervisor
3294          */
3295         mask_ecx &= ~CPUID_INTC_ECX_MON;
3296         /*
3297          * Do not support XSAVE under a hypervisor for now
3298          */
3299         xsave_force_disable = B_TRUE;
3300 
3301 #endif  /* __xpv */
3302 
3303         if (xsave_force_disable) {
3304                 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3305                 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3306                 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3307                 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3308         }
3309 
3310         /*
3311          * Now we've figured out the masks that determine
3312          * which bits we choose to believe, apply the masks
3313          * to the feature words, then map the kernel's view
3314          * of these feature words into its feature word.
3315          */
3316         cp->cp_edx &= mask_edx;
3317         cp->cp_ecx &= mask_ecx;
3318 
3319         /*
3320          * apply any platform restrictions (we don't call this
3321          * immediately after __cpuid_insn here, because we need the
3322          * workarounds applied above first)
3323          */
3324         platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3325 
3326         /*
3327          * In addition to ecx and edx, Intel and AMD store a bunch of
3328          * instruction set extensions in leaf 7's ebx, ecx, and edx.
3329          */
3330         if (cpi->cpi_maxeax >= 7) {
3331                 struct cpuid_regs *ecp;
3332                 ecp = &cpi->cpi_std[7];
3333                 ecp->cp_eax = 7;
3334                 ecp->cp_ecx = 0;
3335                 (void) __cpuid_insn(ecp);
3336 
3337                 /*
3338                  * If XSAVE has been disabled, just ignore all of the
3339                  * extended-save-area dependent flags here.
3340                  */
3341                 if (xsave_force_disable) {
3342                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3343                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3344                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3345                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3346                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3347                         ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3348                         ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3349                 }
3350 
3351                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3352                         add_x86_feature(featureset, X86FSET_SMEP);
3353 
3354                 /*
3355                  * We check disable_smap here in addition to in startup_smap()
3356                  * to ensure CPUs that aren't the boot CPU don't accidentally
3357                  * include it in the feature set and thus generate a mismatched
3358                  * x86 feature set across CPUs.
3359                  */
3360                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3361                     disable_smap == 0)
3362                         add_x86_feature(featureset, X86FSET_SMAP);
3363 
3364                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3365                         add_x86_feature(featureset, X86FSET_RDSEED);
3366 
3367                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3368                         add_x86_feature(featureset, X86FSET_ADX);
3369 
3370                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3371                         add_x86_feature(featureset, X86FSET_FSGSBASE);
3372 
3373                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3374                         add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3375 
3376                 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3377                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3378                                 add_x86_feature(featureset, X86FSET_INVPCID);
3379 
3380                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3381                                 add_x86_feature(featureset, X86FSET_MPX);
3382 
3383                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3384                                 add_x86_feature(featureset, X86FSET_CLWB);
3385                 }
3386         }
3387 
3388         /*
3389          * fold in overrides from the "eeprom" mechanism
3390          */
3391         cp->cp_edx |= cpuid_feature_edx_include;
3392         cp->cp_edx &= ~cpuid_feature_edx_exclude;
3393 
3394         cp->cp_ecx |= cpuid_feature_ecx_include;
3395         cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3396 
3397         if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3398                 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3399         }
3400         if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3401                 add_x86_feature(featureset, X86FSET_TSC);
3402         }
3403         if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3404                 add_x86_feature(featureset, X86FSET_MSR);
3405         }
3406         if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3407                 add_x86_feature(featureset, X86FSET_MTRR);
3408         }
3409         if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3410                 add_x86_feature(featureset, X86FSET_PGE);
3411         }
3412         if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3413                 add_x86_feature(featureset, X86FSET_CMOV);
3414         }
3415         if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3416                 add_x86_feature(featureset, X86FSET_MMX);
3417         }
3418         if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3419             (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3420                 add_x86_feature(featureset, X86FSET_MCA);
3421         }
3422         if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3423                 add_x86_feature(featureset, X86FSET_PAE);
3424         }
3425         if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3426                 add_x86_feature(featureset, X86FSET_CX8);
3427         }
3428         if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3429                 add_x86_feature(featureset, X86FSET_CX16);
3430         }
3431         if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3432                 add_x86_feature(featureset, X86FSET_PAT);
3433         }
3434         if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3435                 add_x86_feature(featureset, X86FSET_SEP);
3436         }
3437         if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3438                 /*
3439                  * In our implementation, fxsave/fxrstor
3440                  * are prerequisites before we'll even
3441                  * try to do SSE things.
3442                  */
3443                 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3444                         add_x86_feature(featureset, X86FSET_SSE);
3445                 }
3446                 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3447                         add_x86_feature(featureset, X86FSET_SSE2);
3448                 }
3449                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3450                         add_x86_feature(featureset, X86FSET_SSE3);
3451                 }
3452                 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3453                         add_x86_feature(featureset, X86FSET_SSSE3);
3454                 }
3455                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3456                         add_x86_feature(featureset, X86FSET_SSE4_1);
3457                 }
3458                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3459                         add_x86_feature(featureset, X86FSET_SSE4_2);
3460                 }
3461                 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3462                         add_x86_feature(featureset, X86FSET_AES);
3463                 }
3464                 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3465                         add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3466                 }
3467 
3468                 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3469                         add_x86_feature(featureset, X86FSET_SHA);
3470 
3471                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3472                         add_x86_feature(featureset, X86FSET_UMIP);
3473                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3474                         add_x86_feature(featureset, X86FSET_PKU);
3475                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3476                         add_x86_feature(featureset, X86FSET_OSPKE);
3477 
3478                 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3479                         add_x86_feature(featureset, X86FSET_XSAVE);
3480 
3481                         /* We only test AVX & AVX512 when there is XSAVE */
3482 
3483                         if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3484                                 add_x86_feature(featureset,
3485                                     X86FSET_AVX);
3486 
3487                                 /*
3488                                  * Intel says we can't check these without also
3489                                  * checking AVX.
3490                                  */
3491                                 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3492                                         add_x86_feature(featureset,
3493                                             X86FSET_F16C);
3494 
3495                                 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3496                                         add_x86_feature(featureset,
3497                                             X86FSET_FMA);
3498 
3499                                 if (cpi->cpi_std[7].cp_ebx &
3500                                     CPUID_INTC_EBX_7_0_BMI1)
3501                                         add_x86_feature(featureset,
3502                                             X86FSET_BMI1);
3503 
3504                                 if (cpi->cpi_std[7].cp_ebx &
3505                                     CPUID_INTC_EBX_7_0_BMI2)
3506                                         add_x86_feature(featureset,
3507                                             X86FSET_BMI2);
3508 
3509                                 if (cpi->cpi_std[7].cp_ebx &
3510                                     CPUID_INTC_EBX_7_0_AVX2)
3511                                         add_x86_feature(featureset,
3512                                             X86FSET_AVX2);
3513                         }
3514 
3515                         if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3516                             (cpi->cpi_std[7].cp_ebx &
3517                             CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3518                                 add_x86_feature(featureset, X86FSET_AVX512F);
3519 
3520                                 if (cpi->cpi_std[7].cp_ebx &
3521                                     CPUID_INTC_EBX_7_0_AVX512DQ)
3522                                         add_x86_feature(featureset,
3523                                             X86FSET_AVX512DQ);
3524                                 if (cpi->cpi_std[7].cp_ebx &
3525                                     CPUID_INTC_EBX_7_0_AVX512IFMA)
3526                                         add_x86_feature(featureset,
3527                                             X86FSET_AVX512FMA);
3528                                 if (cpi->cpi_std[7].cp_ebx &
3529                                     CPUID_INTC_EBX_7_0_AVX512PF)
3530                                         add_x86_feature(featureset,
3531                                             X86FSET_AVX512PF);
3532                                 if (cpi->cpi_std[7].cp_ebx &
3533                                     CPUID_INTC_EBX_7_0_AVX512ER)
3534                                         add_x86_feature(featureset,
3535                                             X86FSET_AVX512ER);
3536                                 if (cpi->cpi_std[7].cp_ebx &
3537                                     CPUID_INTC_EBX_7_0_AVX512CD)
3538                                         add_x86_feature(featureset,
3539                                             X86FSET_AVX512CD);
3540                                 if (cpi->cpi_std[7].cp_ebx &
3541                                     CPUID_INTC_EBX_7_0_AVX512BW)
3542                                         add_x86_feature(featureset,
3543                                             X86FSET_AVX512BW);
3544                                 if (cpi->cpi_std[7].cp_ebx &
3545                                     CPUID_INTC_EBX_7_0_AVX512VL)
3546                                         add_x86_feature(featureset,
3547                                             X86FSET_AVX512VL);
3548 
3549                                 if (cpi->cpi_std[7].cp_ecx &
3550                                     CPUID_INTC_ECX_7_0_AVX512VBMI)
3551                                         add_x86_feature(featureset,
3552                                             X86FSET_AVX512VBMI);
3553                                 if (cpi->cpi_std[7].cp_ecx &
3554                                     CPUID_INTC_ECX_7_0_AVX512VNNI)
3555                                         add_x86_feature(featureset,
3556                                             X86FSET_AVX512VNNI);
3557                                 if (cpi->cpi_std[7].cp_ecx &
3558                                     CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3559                                         add_x86_feature(featureset,
3560                                             X86FSET_AVX512VPOPCDQ);
3561 
3562                                 if (cpi->cpi_std[7].cp_edx &
3563                                     CPUID_INTC_EDX_7_0_AVX5124NNIW)
3564                                         add_x86_feature(featureset,
3565                                             X86FSET_AVX512NNIW);
3566                                 if (cpi->cpi_std[7].cp_edx &
3567                                     CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3568                                         add_x86_feature(featureset,
3569                                             X86FSET_AVX512FMAPS);
3570                         }
3571                 }
3572         }
3573 
3574         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3575                 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3576                         add_x86_feature(featureset, X86FSET_PCID);
3577                 }
3578         }
3579 
3580         if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3581                 add_x86_feature(featureset, X86FSET_X2APIC);
3582         }
3583         if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3584                 add_x86_feature(featureset, X86FSET_DE);
3585         }
3586 #if !defined(__xpv)
3587         if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3588 
3589                 /*
3590                  * We require the CLFLUSH instruction as part of an
3591                  * erratum workaround before we will use MONITOR/MWAIT.
3592                  */
3593                 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3594                         cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3595                         add_x86_feature(featureset, X86FSET_MWAIT);
3596                 } else {
3597                         extern int idle_cpu_assert_cflush_monitor;
3598 
3599                         /*
3600                          * All processors we are aware of which have
3601                          * MONITOR/MWAIT also have CLFLUSH.
3602                          */
3603                         if (idle_cpu_assert_cflush_monitor) {
3604                                 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3605                                     (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3606                         }
3607                 }
3608         }
3609 #endif  /* __xpv */
3610 
3611         if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3612                 add_x86_feature(featureset, X86FSET_VMX);
3613         }
3614 
3615         if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3616                 add_x86_feature(featureset, X86FSET_RDRAND);
3617 
3618         /*
3619          * We only need to capture this once; the other CPUs are
3620          * expected to report the same value as the boot CPU.
3621          */
3622         if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3623                 add_x86_feature(featureset, X86FSET_CLFSH);
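                     /*
                      * Per the Intel SDM, CPUID.1 %ebx bits 15:8 report the
                      * CLFLUSH line size in 8-byte units, hence the multiply
                      * (e.g. a field value of 8 means a 64-byte line).
                      */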
3624                 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3625         }
3626         if (is_x86_feature(featureset, X86FSET_PAE))
3627                 cpi->cpi_pabits = 36;
3628 
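             /*
              * CPUID leaf 0xD, subleaf 1 (%ecx == 1) enumerates the XSAVE
              * instruction extensions in %eax: bit 0 is XSAVEOPT, bit 1 is
              * XSAVEC and bit 3 is XSAVES (per the Intel SDM).
              */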
3629         if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3630                 struct cpuid_regs r, *ecp;
3631 
3632                 ecp = &r;
3633                 ecp->cp_eax = 0xD;
3634                 ecp->cp_ecx = 1;
3635                 ecp->cp_edx = ecp->cp_ebx = 0;
3636                 (void) __cpuid_insn(ecp);
3637 
3638                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3639                         add_x86_feature(featureset, X86FSET_XSAVEOPT);
3640                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3641                         add_x86_feature(featureset, X86FSET_XSAVEC);
3642                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3643                         add_x86_feature(featureset, X86FSET_XSAVES);
3644         }
3645 
3646         /*
3647          * Work on the "extended" feature information, doing
3648          * some basic initialization for cpuid_pass2()
3649          */
3650         xcpuid = 0;
3651         switch (cpi->cpi_vendor) {
3652         case X86_VENDOR_Intel:
3653                 /*
3654                  * On KVM we know we will have proper support for extended
3655                  * cpuid.
3656                  */
3657                 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3658                     (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3659                     (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3660                         xcpuid++;
3661                 break;
3662         case X86_VENDOR_AMD:
3663                 if (cpi->cpi_family > 5 ||
3664                     (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3665                         xcpuid++;
3666                 break;
3667         case X86_VENDOR_Cyrix:
3668                 /*
3669                  * Only these Cyrix CPUs are -known- to support
3670                  * extended cpuid operations.
3671                  */
3672                 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3673                     x86_type == X86_TYPE_CYRIX_GXm)
3674                         xcpuid++;
3675                 break;
3676         case X86_VENDOR_Centaur:
3677         case X86_VENDOR_TM:
3678         default:
3679                 xcpuid++;
3680                 break;
3681         }
3682 
3683         if (xcpuid) {
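                     /*
                      * Leaf 0x80000000 returns the highest supported extended
                      * leaf in %eax; testing bit 31 below doubles as the
                      * check that extended cpuid is really present.
                      */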
3684                 cp = &cpi->cpi_extd[0];
3685                 cp->cp_eax = CPUID_LEAF_EXT_0;
3686                 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3687         }
3688 
3689         if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3690 
3691                 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3692                         cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3693 
3694                 switch (cpi->cpi_vendor) {
3695                 case X86_VENDOR_Intel:
3696                 case X86_VENDOR_AMD:
3697                         if (cpi->cpi_xmaxeax < 0x80000001)
3698                                 break;
3699                         cp = &cpi->cpi_extd[1];
3700                         cp->cp_eax = 0x80000001;
3701                         (void) __cpuid_insn(cp);
3702 
3703                         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3704                             cpi->cpi_family == 5 &&
3705                             cpi->cpi_model == 6 &&
3706                             cpi->cpi_step == 6) {
3707                                 /*
3708                                  * K6 model 6 uses bit 10 to indicate SYSC.
3709                                  * Later models use bit 11. Fix it here.
3710                                  */
3711                                 if (cp->cp_edx & 0x400) {
3712                                         cp->cp_edx &= ~0x400;
3713                                         cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3714                                 }
3715                         }
3716 
3717                         platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3718 
3719                         /*
3720                          * Compute the additions to the kernel's feature word.
3721                          */
3722                         if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3723                                 add_x86_feature(featureset, X86FSET_NX);
3724                         }
3725 
3726                         /*
3727                          * Regardless of whether or not we boot 64-bit,
3728                          * we should have a way to identify whether
3729                          * the CPU is capable of running 64-bit.
3730                          */
3731                         if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3732                                 add_x86_feature(featureset, X86FSET_64);
3733                         }
3734 
3735                         /* 1 GB large page - enabled only for the 64-bit kernel */
3736                         if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3737                                 add_x86_feature(featureset, X86FSET_1GPG);
3738                         }
3739 
3740                         if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3741                             (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3742                             (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3743                                 add_x86_feature(featureset, X86FSET_SSE4A);
3744                         }
3745 
3746                         /*
3747                          * It's really tricky to support syscall/sysret in
3748                          * the i386 kernel; we rely on sysenter/sysexit
3749                          * instead.  In the amd64 kernel, things are -way-
3750                          * better.
3751                          */
3752                         if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3753                                 add_x86_feature(featureset, X86FSET_ASYSC);
3754                         }
3755 
3756                         /*
3757                          * While we're thinking about system calls, note
3758                          * that AMD processors don't support sysenter
3759                          * in long mode at all, so don't try to program them.
3760                          */
3761                         if (x86_vendor == X86_VENDOR_AMD) {
3762                                 remove_x86_feature(featureset, X86FSET_SEP);
3763                         }
3764 
3765                         if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3766                                 add_x86_feature(featureset, X86FSET_TSCP);
3767                         }
3768 
3769                         if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3770                                 add_x86_feature(featureset, X86FSET_SVM);
3771                         }
3772 
3773                         if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3774                                 add_x86_feature(featureset, X86FSET_TOPOEXT);
3775                         }
3776 
3777                         if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3778                                 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3779                         }
3780 
3781                         if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3782                                 add_x86_feature(featureset, X86FSET_XOP);
3783                         }
3784 
3785                         if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3786                                 add_x86_feature(featureset, X86FSET_FMA4);
3787                         }
3788 
3789                         if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3790                                 add_x86_feature(featureset, X86FSET_TBM);
3791                         }
3792 
3793                         if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3794                                 add_x86_feature(featureset, X86FSET_MONITORX);
3795                         }
3796                         break;
3797                 default:
3798                         break;
3799                 }
3800 
3801                 /*
3802                  * Get CPUID data about processor cores and hyperthreads.
3803                  */
3804                 switch (cpi->cpi_vendor) {
3805                 case X86_VENDOR_Intel:
3806                         if (cpi->cpi_maxeax >= 4) {
3807                                 cp = &cpi->cpi_std[4];
3808                                 cp->cp_eax = 4;
3809                                 cp->cp_ecx = 0;
3810                                 (void) __cpuid_insn(cp);
3811                                 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3812                         }
3813                         /*FALLTHROUGH*/
3814                 case X86_VENDOR_AMD:
3815                         if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3816                                 break;
3817                         cp = &cpi->cpi_extd[8];
3818                         cp->cp_eax = CPUID_LEAF_EXT_8;
3819                         (void) __cpuid_insn(cp);
3820                         platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3821                             cp);
3822 
3823                         /*
3824                          * AMD uses ebx for some extended functions.
3825                          */
3826                         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3827                                 /*
3828                                  * While we're here, check for the AMD "Error
3829                                  * Pointer Zero/Restore" feature. This can be
3830                                  * used to set up the FP save handlers
3831                                  * appropriately.
3832                                  */
3833                                 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3834                                         cpi->cpi_fp_amd_save = 0;
3835                                 } else {
3836                                         cpi->cpi_fp_amd_save = 1;
3837                                 }
3838 
3839                                 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3840                                         add_x86_feature(featureset,
3841                                             X86FSET_CLZERO);
3842                                 }
3843                         }
3844 
3845                         /*
3846                          * Virtual and physical address limits from
3847                          * cpuid override previously guessed values.
3848                          */
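                             /* %eax bits 7:0: physical bits; 15:8: virtual. */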
3849                         cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3850                         cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3851                         break;
3852                 default:
3853                         break;
3854                 }
3855 
3856                 /*
3857                  * Get CPUID data about TSC Invariance in Deep C-State.
3858                  */
3859                 switch (cpi->cpi_vendor) {
3860                 case X86_VENDOR_Intel:
3861                 case X86_VENDOR_AMD:
3862                         if (cpi->cpi_xmaxeax >= 0x80000007) {
3863                                 cp = &cpi->cpi_extd[7];
3864                                 cp->cp_eax = 0x80000007;
3865                                 cp->cp_ecx = 0;
3866                                 (void) __cpuid_insn(cp);
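                                     /*
                                      * %edx bit 8 of this leaf is the
                                      * invariant-TSC flag; the raw regs
                                      * are stashed in cpi_extd[7] for
                                      * later consumers.
                                      */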
3867                         }
3868                         break;
3869                 default:
3870                         break;
3871                 }
3872         }
3873 
3874         cpuid_pass1_topology(cpu, featureset);
3875         cpuid_pass1_thermal(cpu, featureset);
3876 
3877         /*
3878          * Synthesize chip "revision" and socket type
3879          */
3880         cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3881             cpi->cpi_model, cpi->cpi_step);
3882         cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3883             cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3884         cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3885             cpi->cpi_model, cpi->cpi_step);
3886 
3887         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3888                 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3889                     cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3890                         /* Special handling for AMD FP not necessary. */
3891                         cpi->cpi_fp_amd_save = 0;
3892                 } else {
3893                         cpi->cpi_fp_amd_save = 1;
3894                 }
3895         }
3896 
3897         /*
3898          * Check the processor leaves that are used for security features.
3899          */
3900         cpuid_scan_security(cpu, featureset);
3901 
3902 pass1_done:
3903         cpi->cpi_pass = 1;
3904 }
3905 
3906 /*
3907  * Make copies of the cpuid table entries we depend on, in
3908  * part for ease of parsing now, in part so that we have only
3909  * one place to correct any of it, in part for ease of
3910  * later export to userland, and in part so we can look at
3911  * this stuff in a crash dump.
3912  */
3913 
3914 /*ARGSUSED*/
3915 void
3916 cpuid_pass2(cpu_t *cpu)
3917 {
3918         uint_t n, nmax;
3919         int i;
3920         struct cpuid_regs *cp;
3921         uint8_t *dp;
3922         uint32_t *iptr;
3923         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3924 
3925         ASSERT(cpi->cpi_pass == 1);
3926 
3927         if (cpi->cpi_maxeax < 1)
3928                 goto pass2_done;
3929 
3930         if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3931                 nmax = NMAX_CPI_STD;
3932         /*
3933          * (We already handled n == 0 and n == 1 in pass 1)
3934          */
3935         for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3936                 /*
3937                  * leaves 6 and 7 were handled in pass 1
3938                  */
3939                 if (n == 6 || n == 7)
3940                         continue;
3941 
3942                 cp->cp_eax = n;
3943 
3944                 /*
3945                  * CPUID function 4 expects %ecx to be initialized
3946                  * with an index which indicates which cache to return
3947                  * information about. The OS is expected to call function 4
3948                  * with %ecx set to 0, 1, 2, ... until it returns with
3949                  * EAX[4:0] set to 0, which indicates there are no more
3950                  * caches.
3951                  *
3952                  * Here, populate cpi_std[4] with the information returned by
3953                  * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3954                  * when dynamic memory allocation becomes available.
3955                  *
3956                  * Note: we need to explicitly initialize %ecx here, since
3957                  * function 4 may have been previously invoked.
3958                  */
3959                 if (n == 4)
3960                         cp->cp_ecx = 0;
3961 
3962                 (void) __cpuid_insn(cp);
3963                 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3964                 switch (n) {
3965                 case 2:
3966                         /*
3967                          * "the lower 8 bits of the %eax register
3968                          * contain a value that identifies the number
3969                          * of times the cpuid [instruction] has to be
3970                          * executed to obtain a complete image of the
3971                          * processor's caching systems."
3972                          *
3973                          * How *do* they make this stuff up?
3974                          */
3975                         cpi->cpi_ncache = sizeof (*cp) *
3976                             BITX(cp->cp_eax, 7, 0);
3977                         if (cpi->cpi_ncache == 0)
3978                                 break;
3979                         cpi->cpi_ncache--;   /* skip count byte */
3980 
3981                         /*
3982                          * Well, for now, rather than attempt to implement
3983                          * this slightly dubious algorithm, we just look
3984                          * at the first 15 ..
3985                          */
3986                         if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3987                                 cpi->cpi_ncache = sizeof (*cp) - 1;
3988 
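                             /*
                              * Each remaining nonzero byte is an opaque cache
                              * or TLB descriptor (e.g. 0x2c is a 32K, 8-way
                              * L1 data cache on some Intel parts); a register
                              * whose bit 31 is set carries no descriptors,
                              * hence the checks below.
                              */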
3989                         dp = cpi->cpi_cacheinfo;
3990                         if (BITX(cp->cp_eax, 31, 31) == 0) {
3991                                 uint8_t *p = (void *)&cp->cp_eax;
3992                                 for (i = 1; i < 4; i++)
3993                                         if (p[i] != 0)
3994                                                 *dp++ = p[i];
3995                         }
3996                         if (BITX(cp->cp_ebx, 31, 31) == 0) {
3997                                 uint8_t *p = (void *)&cp->cp_ebx;
3998                                 for (i = 0; i < 4; i++)
3999                                         if (p[i] != 0)
4000                                                 *dp++ = p[i];
4001                         }
4002                         if (BITX(cp->cp_ecx, 31, 31) == 0) {
4003                                 uint8_t *p = (void *)&cp->cp_ecx;
4004                                 for (i = 0; i < 4; i++)
4005                                         if (p[i] != 0)
4006                                                 *dp++ = p[i];
4007                         }
4008                         if (BITX(cp->cp_edx, 31, 31) == 0) {
4009                                 uint8_t *p = (void *)&cp->cp_edx;
4010                                 for (i = 0; i < 4; i++)
4011                                         if (p[i] != 0)
4012                                                 *dp++ = p[i];
4013                         }
4014                         break;
4015 
4016                 case 3: /* Processor serial number, if PSN supported */
4017                         break;
4018 
4019                 case 4: /* Deterministic cache parameters */
4020                         break;
4021 
4022                 case 5: /* Monitor/Mwait parameters */
4023                 {
4024                         size_t mwait_size;
4025 
4026                         /*
4027                          * Check cpi_mwait.support, which was set in cpuid_pass1().
4028                          */
4029                         if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4030                                 break;
4031 
4032                         /*
4033                          * Protect ourselves from an insane mwait line size.
4034                          * Workaround for incomplete hardware emulator(s).
4035                          */
4036                         mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4037                         if (mwait_size < sizeof (uint32_t) ||
4038                             !ISP2(mwait_size)) {
4039 #if DEBUG
4040                                 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4041                                     "size %ld", cpu->cpu_id, (long)mwait_size);
4042 #endif
4043                                 break;
4044                         }
4045 
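                             /*
                              * Per the SDM, leaf 5 reports the smallest
                              * monitor-line size in %eax and the largest in
                              * %ebx, both in bytes.
                              */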
4046                         cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4047                         cpi->cpi_mwait.mon_max = mwait_size;
4048                         if (MWAIT_EXTENSION(cpi)) {
4049                                 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4050                                 if (MWAIT_INT_ENABLE(cpi))
4051                                         cpi->cpi_mwait.support |=
4052                                             MWAIT_ECX_INT_ENABLE;
4053                         }
4054                         break;
4055                 }
4056                 default:
4057                         break;
4058                 }
4059         }
4060 
4061         /*
4062          * XSAVE enumeration
4063          */
4064         if (cpi->cpi_maxeax >= 0xD) {
4065                 struct cpuid_regs regs;
4066                 boolean_t cpuid_d_valid = B_TRUE;
4067 
4068                 cp = &regs;
4069                 cp->cp_eax = 0xD;
4070                 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4071 
4072                 (void) __cpuid_insn(cp);
4073 
4074                 /*
4075                  * Sanity checks for debug
4076                  */
4077                 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4078                     (cp->cp_eax & XFEATURE_SSE) == 0) {
4079                         cpuid_d_valid = B_FALSE;
4080                 }
4081 
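                     /*
                      * Subleaf 0 of leaf 0xD: %edx:%eax form the supported
                      * XCR0 feature mask, and %ecx is the save-area size
                      * required were every supported feature enabled.
                      */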
4082                 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4083                 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4084                 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4085 
4086                 /*
4087                  * If the hw supports AVX, get the size and offset in the save
4088                  * area for the ymm state.
4089                  */
4090                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4091                         cp->cp_eax = 0xD;
4092                         cp->cp_ecx = 2;
4093                         cp->cp_edx = cp->cp_ebx = 0;
4094 
4095                         (void) __cpuid_insn(cp);
4096 
4097                         if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4098                             cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4099                                 cpuid_d_valid = B_FALSE;
4100                         }
4101 
4102                         cpi->cpi_xsave.ymm_size = cp->cp_eax;
4103                         cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4104                 }
4105 
4106                 /*
4107                  * If the hw supports MPX, get the size and offset in the
4108                  * save area for BNDREGS and BNDCSR.
4109                  */
4110                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4111                         cp->cp_eax = 0xD;
4112                         cp->cp_ecx = 3;
4113                         cp->cp_edx = cp->cp_ebx = 0;
4114 
4115                         (void) __cpuid_insn(cp);
4116 
4117                         cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4118                         cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4119 
4120                         cp->cp_eax = 0xD;
4121                         cp->cp_ecx = 4;
4122                         cp->cp_edx = cp->cp_ebx = 0;
4123 
4124                         (void) __cpuid_insn(cp);
4125 
4126                         cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4127                         cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4128                 }
4129 
4130                 /*
4131                  * If the hw supports AVX512, get the size and offset in the
4132                  * save area for the opmask registers and zmm state.
4133                  */
4134                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4135                         cp->cp_eax = 0xD;
4136                         cp->cp_ecx = 5;
4137                         cp->cp_edx = cp->cp_ebx = 0;
4138 
4139                         (void) __cpuid_insn(cp);
4140 
4141                         cpi->cpi_xsave.opmask_size = cp->cp_eax;
4142                         cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4143 
4144                         cp->cp_eax = 0xD;
4145                         cp->cp_ecx = 6;
4146                         cp->cp_edx = cp->cp_ebx = 0;
4147 
4148                         (void) __cpuid_insn(cp);
4149 
4150                         cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4151                         cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4152 
4153                         cp->cp_eax = 0xD;
4154                         cp->cp_ecx = 7;
4155                         cp->cp_edx = cp->cp_ebx = 0;
4156 
4157                         (void) __cpuid_insn(cp);
4158 
4159                         cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4160                         cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4161                 }
4162 
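                     /*
                      * Choose the XSAVE area size: zero when XSAVE is
                      * unsupported, the enumerated maximum when leaf 0xD
                      * looked sane, and otherwise fall into the recovery
                      * code below.
                      */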
4163                 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4164                         xsave_state_size = 0;
4165                 } else if (cpuid_d_valid) {
4166                         xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4167                 } else {
4168                         /* Broken CPUID 0xD, probably in HVM */
4169                         cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4170                             "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4171                             ", ymm_size = %d, ymm_offset = %d\n",
4172                             cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4173                             cpi->cpi_xsave.xsav_hw_features_high,
4174                             (int)cpi->cpi_xsave.xsav_max_size,
4175                             (int)cpi->cpi_xsave.ymm_size,
4176                             (int)cpi->cpi_xsave.ymm_offset);
4177 
4178                         if (xsave_state_size != 0) {
4179                                 /*
4180                                  * This must be a non-boot CPU. We cannot
4181                                  * continue, because the boot CPU has already
4182                                  * enabled XSAVE.
4183                                  */
4184                                 ASSERT(cpu->cpu_id != 0);
4185                                 cmn_err(CE_PANIC, "cpu%d: we have already "
4186                                     "enabled XSAVE on boot cpu, cannot "
4187                                     "continue.", cpu->cpu_id);
4188                         } else {
4189                                 /*
4190                                  * If we reached here on the boot CPU, it's also
4191                                  * almost certain that we'll reach here on the
4192                                  * non-boot CPUs. When we're here on a boot CPU
4193                                  * we should disable the feature; on a non-boot
4194                                  * CPU we need to confirm that we have done so.
4195                                  */
4196                                 if (cpu->cpu_id == 0) {
4197                                         remove_x86_feature(x86_featureset,
4198                                             X86FSET_XSAVE);
4199                                         remove_x86_feature(x86_featureset,
4200                                             X86FSET_AVX);
4201                                         remove_x86_feature(x86_featureset,
4202                                             X86FSET_F16C);
4203                                         remove_x86_feature(x86_featureset,
4204                                             X86FSET_BMI1);
4205                                         remove_x86_feature(x86_featureset,
4206                                             X86FSET_BMI2);
4207                                         remove_x86_feature(x86_featureset,
4208                                             X86FSET_FMA);
4209                                         remove_x86_feature(x86_featureset,
4210                                             X86FSET_AVX2);
4211                                         remove_x86_feature(x86_featureset,
4212                                             X86FSET_MPX);
4213                                         remove_x86_feature(x86_featureset,
4214                                             X86FSET_AVX512F);
4215                                         remove_x86_feature(x86_featureset,
4216                                             X86FSET_AVX512DQ);
4217                                         remove_x86_feature(x86_featureset,
4218                                             X86FSET_AVX512PF);
4219                                         remove_x86_feature(x86_featureset,
4220                                             X86FSET_AVX512ER);
4221                                         remove_x86_feature(x86_featureset,
4222                                             X86FSET_AVX512CD);
4223                                         remove_x86_feature(x86_featureset,
4224                                             X86FSET_AVX512BW);
4225                                         remove_x86_feature(x86_featureset,
4226                                             X86FSET_AVX512VL);
4227                                         remove_x86_feature(x86_featureset,
4228                                             X86FSET_AVX512FMA);
4229                                         remove_x86_feature(x86_featureset,
4230                                             X86FSET_AVX512VBMI);
4231                                         remove_x86_feature(x86_featureset,
4232                                             X86FSET_AVX512VNNI);
4233                                         remove_x86_feature(x86_featureset,
4234                                             X86FSET_AVX512VPOPCDQ);
4235                                         remove_x86_feature(x86_featureset,
4236                                             X86FSET_AVX512NNIW);
4237                                         remove_x86_feature(x86_featureset,
4238                                             X86FSET_AVX512FMAPS);
4239 
4240                                         CPI_FEATURES_ECX(cpi) &=
4241                                             ~CPUID_INTC_ECX_XSAVE;
4242                                         CPI_FEATURES_ECX(cpi) &=
4243                                             ~CPUID_INTC_ECX_AVX;
4244                                         CPI_FEATURES_ECX(cpi) &=
4245                                             ~CPUID_INTC_ECX_F16C;
4246                                         CPI_FEATURES_ECX(cpi) &=
4247                                             ~CPUID_INTC_ECX_FMA;
4248                                         CPI_FEATURES_7_0_EBX(cpi) &=
4249                                             ~CPUID_INTC_EBX_7_0_BMI1;
4250                                         CPI_FEATURES_7_0_EBX(cpi) &=
4251                                             ~CPUID_INTC_EBX_7_0_BMI2;
4252                                         CPI_FEATURES_7_0_EBX(cpi) &=
4253                                             ~CPUID_INTC_EBX_7_0_AVX2;
4254                                         CPI_FEATURES_7_0_EBX(cpi) &=
4255                                             ~CPUID_INTC_EBX_7_0_MPX;
4256                                         CPI_FEATURES_7_0_EBX(cpi) &=
4257                                             ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4258 
4259                                         CPI_FEATURES_7_0_ECX(cpi) &=
4260                                             ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4261 
4262                                         CPI_FEATURES_7_0_EDX(cpi) &=
4263                                             ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4264 
4265                                         xsave_force_disable = B_TRUE;
4266                                 } else {
4267                                         VERIFY(is_x86_feature(x86_featureset,
4268                                             X86FSET_XSAVE) == B_FALSE);
4269                                 }
4270                         }
4271                 }
4272         }
4273 
4274 
4275         if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4276                 goto pass2_done;
4277 
4278         if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4279                 nmax = NMAX_CPI_EXTD;
4280         /*
4281          * Copy the extended properties, fixing them as we go.
4282          * (We already handled n == 0 and n == 1 in pass 1)
4283          */
4284         iptr = (void *)cpi->cpi_brandstr;
4285         for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4286                 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4287                 (void) __cpuid_insn(cp);
4288                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4289                     cp);
4290                 switch (n) {
4291                 case 2:
4292                 case 3:
4293                 case 4:
4294                         /*
4295                          * Extract the brand string
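                              * (leaves 0x80000002-4 each supply 16 bytes of
                              * the 48-byte string in %eax..%edx)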
4296                          */
4297                         *iptr++ = cp->cp_eax;
4298                         *iptr++ = cp->cp_ebx;
4299                         *iptr++ = cp->cp_ecx;
4300                         *iptr++ = cp->cp_edx;
4301                         break;
4302                 case 5:
4303                         switch (cpi->cpi_vendor) {
4304                         case X86_VENDOR_AMD:
4305                                 /*
4306                                  * The Athlon and Duron were the first
4307                                  * parts to report the sizes of the
4308                                  * TLB for large pages. Before then,
4309                                  * we don't trust the data.
4310                                  */
4311                                 if (cpi->cpi_family < 6 ||
4312                                     (cpi->cpi_family == 6 &&
4313                                     cpi->cpi_model < 1))
4314                                         cp->cp_eax = 0;
4315                                 break;
4316                         default:
4317                                 break;
4318                         }
4319                         break;
4320                 case 6:
4321                         switch (cpi->cpi_vendor) {
4322                         case X86_VENDOR_AMD:
4323                                 /*
4324                                  * The Athlon and Duron were the first
4325                                  * AMD parts with L2 TLB's.
4326                                  * Before then, don't trust the data.
4327                                  */
4328                                 if (cpi->cpi_family < 6 ||
4329                                     (cpi->cpi_family == 6 &&
4330                                     cpi->cpi_model < 1))
4331                                         cp->cp_eax = cp->cp_ebx = 0;
4332                                 /*
4333                                  * AMD Duron rev A0 reports L2
4334                                  * cache size incorrectly as 1K
4335                                  * when it is really 64K
4336                                  */
4337                                 if (cpi->cpi_family == 6 &&
4338                                     cpi->cpi_model == 3 &&
4339                                     cpi->cpi_step == 0) {
4340                                         cp->cp_ecx &= 0xffff;
4341                                         cp->cp_ecx |= 0x400000;
4342                                 }
4343                                 break;
4344                         case X86_VENDOR_Cyrix:  /* VIA C3 */
4345                                 /*
4346                                  * VIA C3 processors are a bit messed
4347                                  * up w.r.t. encoding cache sizes in %ecx
4348                                  */
4349                                 if (cpi->cpi_family != 6)
4350                                         break;
4351                                 /*
4352                                  * models 7 and 8 were incorrectly encoded
4353                                  *
4354                                  * xxx is model 8 really broken?
4355                                  */
4356                                 if (cpi->cpi_model == 7 ||
4357                                     cpi->cpi_model == 8)
4358                                         cp->cp_ecx =
4359                                             BITX(cp->cp_ecx, 31, 24) << 16 |
4360                                             BITX(cp->cp_ecx, 23, 16) << 12 |
4361                                             BITX(cp->cp_ecx, 15, 8) << 8 |
4362                                             BITX(cp->cp_ecx, 7, 0);
4363                                 /*
4364                                  * model 9 stepping 1 has wrong associativity
4365                                  */
4366                                 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4367                                         cp->cp_ecx |= 8 << 12;
4368                                 break;
4369                         case X86_VENDOR_Intel:
4370                                 /*
4371                                  * Extended L2 Cache features function.
4372                                  * First appeared on Prescott.
4373                                  */
4374                         default:
4375                                 break;
4376                         }
4377                         break;
4378                 default:
4379                         break;
4380                 }
4381         }
4382 
4383 pass2_done:
4384         cpi->cpi_pass = 2;
4385 }
4386 
4387 static const char *
4388 intel_cpubrand(const struct cpuid_info *cpi)
4389 {
4390         int i;
4391 
4392         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4393             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4394                 return ("i486");
4395 
4396         switch (cpi->cpi_family) {
4397         case 5:
4398                 return ("Intel Pentium(r)");
4399         case 6:
4400                 switch (cpi->cpi_model) {
4401                         uint_t celeron, xeon;
4402                         const struct cpuid_regs *cp;
4403                 case 0:
4404                 case 1:
4405                 case 2:
4406                         return ("Intel Pentium(r) Pro");
4407                 case 3:
4408                 case 4:
4409                         return ("Intel Pentium(r) II");
4410                 case 6:
4411                         return ("Intel Celeron(r)");
4412                 case 5:
4413                 case 7:
4414                         celeron = xeon = 0;
4415                         cp = &cpi->cpi_std[2];   /* cache info */
4416 
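                             /*
                              * Scan the leaf-2 cache descriptors: 0x40 means
                              * "no L2 cache" (a Celeron tell), while 0x44 and
                              * 0x45 (1M and 2M L2) appeared only on the Xeons
                              * of this era.
                              */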
4417                         for (i = 1; i < 4; i++) {
4418                                 uint_t tmp;
4419 
4420                                 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4421                                 if (tmp == 0x40)
4422                                         celeron++;
4423                                 if (tmp >= 0x44 && tmp <= 0x45)
4424                                         xeon++;
4425                         }
4426 
4427                         for (i = 0; i < 2; i++) {
4428                                 uint_t tmp;
4429 
4430                                 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4431                                 if (tmp == 0x40)
4432                                         celeron++;
4433                                 else if (tmp >= 0x44 && tmp <= 0x45)
4434                                         xeon++;
4435                         }
4436 
4437                         for (i = 0; i < 4; i++) {
4438                                 uint_t tmp;
4439 
4440                                 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4441                                 if (tmp == 0x40)
4442                                         celeron++;
4443                                 else if (tmp >= 0x44 && tmp <= 0x45)
4444                                         xeon++;
4445                         }
4446 
4447                         for (i = 0; i < 4; i++) {
4448                                 uint_t tmp;
4449 
4450                                 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4451                                 if (tmp == 0x40)
4452                                         celeron++;
4453                                 else if (tmp >= 0x44 && tmp <= 0x45)
4454                                         xeon++;
4455                         }
4456 
4457                         if (celeron)
4458                                 return ("Intel Celeron(r)");
4459                         if (xeon)
4460                                 return (cpi->cpi_model == 5 ?
4461                                     "Intel Pentium(r) II Xeon(tm)" :
4462                                     "Intel Pentium(r) III Xeon(tm)");
4463                         return (cpi->cpi_model == 5 ?
4464                             "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4465                             "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4466                 default:
4467                         break;
4468                 }
4469         default:
4470                 break;
4471         }
4472 
4473         /* BrandID is present if the field is nonzero */
4474         if (cpi->cpi_brandid != 0) {
4475                 static const struct {
4476                         uint_t bt_bid;
4477                         const char *bt_str;
4478                 } brand_tbl[] = {
4479                         { 0x1,  "Intel(r) Celeron(r)" },
4480                         { 0x2,  "Intel(r) Pentium(r) III" },
4481                         { 0x3,  "Intel(r) Pentium(r) III Xeon(tm)" },
4482                         { 0x4,  "Intel(r) Pentium(r) III" },
4483                         { 0x6,  "Mobile Intel(r) Pentium(r) III" },
4484                         { 0x7,  "Mobile Intel(r) Celeron(r)" },
4485                         { 0x8,  "Intel(r) Pentium(r) 4" },
4486                         { 0x9,  "Intel(r) Pentium(r) 4" },
4487                         { 0xa,  "Intel(r) Celeron(r)" },
4488                         { 0xb,  "Intel(r) Xeon(tm)" },
4489                         { 0xc,  "Intel(r) Xeon(tm) MP" },
4490                         { 0xe,  "Mobile Intel(r) Pentium(r) 4" },
4491                         { 0xf,  "Mobile Intel(r) Celeron(r)" },
4492                         { 0x11, "Mobile Genuine Intel(r)" },
4493                         { 0x12, "Intel(r) Celeron(r) M" },
4494                         { 0x13, "Mobile Intel(r) Celeron(r)" },
4495                         { 0x14, "Intel(r) Celeron(r)" },
4496                         { 0x15, "Mobile Genuine Intel(r)" },
4497                         { 0x16, "Intel(r) Pentium(r) M" },
4498                         { 0x17, "Mobile Intel(r) Celeron(r)" }
4499                 };
4500                 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4501                 uint_t sgn;
4502 
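                     /*
                      * Pack family/model/stepping into the 0xFMS form used
                      * by the special cases below (e.g. 0x6b1 is family 6,
                      * model 0xb, stepping 1).
                      */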
4503                 sgn = (cpi->cpi_family << 8) |
4504                     (cpi->cpi_model << 4) | cpi->cpi_step;
4505 
4506                 for (i = 0; i < btblmax; i++)
4507                         if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4508                                 break;
4509                 if (i < btblmax) {
4510                         if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4511                                 return ("Intel(r) Celeron(r)");
4512                         if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4513                                 return ("Intel(r) Xeon(tm) MP");
4514                         if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4515                                 return ("Intel(r) Xeon(tm)");
4516                         return (brand_tbl[i].bt_str);
4517                 }
4518         }
4519 
4520         return (NULL);
4521 }
4522 
4523 static const char *
4524 amd_cpubrand(const struct cpuid_info *cpi)
4525 {
4526         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4527             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4528                 return ("i486 compatible");
4529 
4530         switch (cpi->cpi_family) {
4531         case 5:
4532                 switch (cpi->cpi_model) {
4533                 case 0:
4534                 case 1:
4535                 case 2:
4536                 case 3:
4537                 case 4:
4538                 case 5:
4539                         return ("AMD-K5(r)");
4540                 case 6:
4541                 case 7:
4542                         return ("AMD-K6(r)");
4543                 case 8:
4544                         return ("AMD-K6(r)-2");
4545                 case 9:
4546                         return ("AMD-K6(r)-III");
4547                 default:
4548                         return ("AMD (family 5)");
4549                 }
4550         case 6:
4551                 switch (cpi->cpi_model) {
4552                 case 1:
4553                         return ("AMD-K7(tm)");
4554                 case 0:
4555                 case 2:
4556                 case 4:
4557                         return ("AMD Athlon(tm)");
4558                 case 3:
4559                 case 7:
4560                         return ("AMD Duron(tm)");
4561                 case 6:
4562                 case 8:
4563                 case 10:
4564                         /*
4565                          * Use the L2 cache size to distinguish
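                              * (leaf 0x80000006 reports it in %ecx bits
                              * 31:16, in KB)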
4566                          */
4567                         return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4568                             "AMD Athlon(tm)" : "AMD Duron(tm)");
4569                 default:
4570                         return ("AMD (family 6)");
4571                 }
4572         default:
4573                 break;
4574         }
4575 
4576         if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4577             cpi->cpi_brandid != 0) {
4578                 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4579                 case 3:
4580                         return ("AMD Opteron(tm) UP 1xx");
4581                 case 4:
4582                         return ("AMD Opteron(tm) DP 2xx");
4583                 case 5:
4584                         return ("AMD Opteron(tm) MP 8xx");
4585                 default:
4586                         return ("AMD Opteron(tm)");
4587                 }
4588         }
4589 
4590         return (NULL);
4591 }
4592 
4593 static const char *
4594 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4595 {
4596         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4597             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4598             type == X86_TYPE_CYRIX_486)
4599                 return ("i486 compatible");
4600 
4601         switch (type) {
4602         case X86_TYPE_CYRIX_6x86:
4603                 return ("Cyrix 6x86");
4604         case X86_TYPE_CYRIX_6x86L:
4605                 return ("Cyrix 6x86L");
4606         case X86_TYPE_CYRIX_6x86MX:
4607                 return ("Cyrix 6x86MX");
4608         case X86_TYPE_CYRIX_GXm:
4609                 return ("Cyrix GXm");
4610         case X86_TYPE_CYRIX_MediaGX:
4611                 return ("Cyrix MediaGX");
4612         case X86_TYPE_CYRIX_MII:
4613                 return ("Cyrix M2");
4614         case X86_TYPE_VIA_CYRIX_III:
4615                 return ("VIA Cyrix M3");
4616         default:
4617                 /*
4618                  * Have another wild guess ..
4619                  */
4620                 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4621                         return ("Cyrix 5x86");
4622                 else if (cpi->cpi_family == 5) {
4623                         switch (cpi->cpi_model) {
4624                         case 2:
4625                                 return ("Cyrix 6x86");  /* Cyrix M1 */
4626                         case 4:
4627                                 return ("Cyrix MediaGX");
4628                         default:
4629                                 break;
4630                         }
4631                 } else if (cpi->cpi_family == 6) {
4632                         switch (cpi->cpi_model) {
4633                         case 0:
4634                                 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4635                         case 5:
4636                         case 6:
4637                         case 7:
4638                         case 8:
4639                         case 9:
4640                                 return ("VIA C3");
4641                         default:
4642                                 break;
4643                         }
4644                 }
4645                 break;
4646         }
4647         return (NULL);
4648 }
4649 
4650 /*
4651  * This only gets called in the case that the CPU extended
4652  * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
4653  * aren't available, or contain null bytes for some reason.
4654  */
4655 static void
4656 fabricate_brandstr(struct cpuid_info *cpi)
4657 {
4658         const char *brand = NULL;
4659 
4660         switch (cpi->cpi_vendor) {
4661         case X86_VENDOR_Intel:
4662                 brand = intel_cpubrand(cpi);
4663                 break;
4664         case X86_VENDOR_AMD:
4665                 brand = amd_cpubrand(cpi);
4666                 break;
4667         case X86_VENDOR_Cyrix:
4668                 brand = cyrix_cpubrand(cpi, x86_type);
4669                 break;
4670         case X86_VENDOR_NexGen:
4671                 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4672                         brand = "NexGen Nx586";
4673                 break;
4674         case X86_VENDOR_Centaur:
4675                 if (cpi->cpi_family == 5)
4676                         switch (cpi->cpi_model) {
4677                         case 4:
4678                                 brand = "Centaur C6";
4679                                 break;
4680                         case 8:
4681                                 brand = "Centaur C2";
4682                                 break;
4683                         case 9:
4684                                 brand = "Centaur C3";
4685                                 break;
4686                         default:
4687                                 break;
4688                         }
4689                 break;
4690         case X86_VENDOR_Rise:
4691                 if (cpi->cpi_family == 5 &&
4692                     (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4693                         brand = "Rise mP6";
4694                 break;
4695         case X86_VENDOR_SiS:
4696                 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4697                         brand = "SiS 55x";
4698                 break;
4699         case X86_VENDOR_TM:
4700                 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4701                         brand = "Transmeta Crusoe TM3x00 or TM5x00";
4702                 break;
4703         case X86_VENDOR_NSC:
4704         case X86_VENDOR_UMC:
4705         default:
4706                 break;
4707         }
4708         if (brand) {
4709                 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4710                 return;
4711         }
4712 
4713         /*
4714          * If all else fails ...
4715          */
4716         (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4717             "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4718             cpi->cpi_model, cpi->cpi_step);
4719 }
4720 
4721 /*
4722  * This routine is called just after kernel memory allocation
4723  * becomes available on cpu0, and as part of mp_startup() on
4724  * the other cpus.
4725  *
4726  * Fixup the brand string, and collect any information from cpuid
4727  * that requires dynamically allocated storage to represent.
4728  */
4729 /*ARGSUSED*/
4730 void
4731 cpuid_pass3(cpu_t *cpu)
4732 {
4733         int     i, max, shft, level, size;
4734         struct cpuid_regs regs;
4735         struct cpuid_regs *cp;
4736         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4737 
4738         ASSERT(cpi->cpi_pass == 2);
4739 
4740         /*
4741          * Deterministic cache parameters
4742          *
4743          * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4744          * values that are present are currently defined to be the same. This
4745          * means we can use the same logic to parse it as long as we use the
4746          * appropriate leaf to get the data. If you're updating this, make sure
4747          * you're careful about which vendor supports which aspect.
4748          *
4749          * Take this opportunity to detect the number of threads sharing the
4750          * last level cache, and construct a corresponding cache id. The
4751          * respective cpuid_info members are initialized to the default case of
4752          * "no last level cache sharing".
4753          */
4754         cpi->cpi_ncpu_shr_last_cache = 1;
4755         cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4756 
4757         if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4758             (cpi->cpi_vendor == X86_VENDOR_AMD &&
4759             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4760             is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4761                 uint32_t leaf;
4762 
4763                 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4764                         leaf = 4;
4765                 } else {
4766                         leaf = CPUID_LEAF_EXT_1d;
4767                 }
4768 
4769                 /*
4770                  * Find the # of elements (size) returned by the leaf and along
4771                  * the way detect last level cache sharing details.
4772                  */
4773                 bzero(&regs, sizeof (regs));
4774                 cp = &regs;
4775                 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4776                         cp->cp_eax = leaf;
4777                         cp->cp_ecx = i;
4778 
4779                         (void) __cpuid_insn(cp);
4780 
4781                         if (CPI_CACHE_TYPE(cp) == 0)
4782                                 break;
4783                         level = CPI_CACHE_LVL(cp);
4784                         if (level > max) {
4785                                 max = level;
4786                                 cpi->cpi_ncpu_shr_last_cache =
4787                                     CPI_NTHR_SHR_CACHE(cp) + 1;
4788                         }
4789                 }
4790                 cpi->cpi_cache_leaf_size = size = i;
4791 
4792                 /*
4793                  * Allocate the cpi_cache_leaves array. The first element
4794                  * references the regs for the corresponding leaf with %ecx set
4795                  * to 0. This was gathered in cpuid_pass2().
4796                  */
4797                 if (size > 0) {
4798                         cpi->cpi_cache_leaves =
4799                             kmem_alloc(size * sizeof (cp), KM_SLEEP);
4800                         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4801                                 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4802                         } else {
4803                                 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4804                         }
4805 
4806                         /*
4807                          * Allocate storage to hold the additional regs
4808                          * for the leaf, %ecx == 1 .. cpi_cache_leaf_size - 1.
4809                          *
4810                          * The regs for the leaf with %ecx == 0 have
4811                          * already been allocated as indicated above.
4812                          */
4813                         for (i = 1; i < size; i++) {
4814                                 cp = cpi->cpi_cache_leaves[i] =
4815                                     kmem_zalloc(sizeof (regs), KM_SLEEP);
4816                                 cp->cp_eax = leaf;
4817                                 cp->cp_ecx = i;
4818 
4819                                 (void) __cpuid_insn(cp);
4820                         }
4821                 }
4822                 /*
4823                  * Determine the number of bits needed to represent
4824                  * the number of CPUs sharing the last level cache.
4825                  *
4826                  * Shift off that number of bits from the APIC id to
4827                  * derive the cache id.
4828                  */
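                     /*
                      * For example, with 6 CPUs sharing the last level cache
                      * the loop below leaves shft == 3 (i steps 1 -> 2 -> 4
                      * -> 8), so APIC ids 0 through 5 all map to the same
                      * cache id.
                      */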
4829                 shft = 0;
4830                 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4831                         shft++;
4832                 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4833         }
4834 
4835         /*
4836          * Now fixup the brand string
4837          */
4838         if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4839                 fabricate_brandstr(cpi);
4840         } else {
4841 
4842                 /*
4843                  * If we successfully extracted a brand string from the cpuid
4844                  * instruction, clean it up by removing leading spaces and
4845                  * similar junk.
4846                  */
4847                 if (cpi->cpi_brandstr[0]) {
4848                         size_t maxlen = sizeof (cpi->cpi_brandstr);
4849                         char *src, *dst;
4850 
4851                         dst = src = (char *)cpi->cpi_brandstr;
4852                         src[maxlen - 1] = '\0';
4853                         /*
4854                          * strip leading spaces
4855                          */
4856                         while (*src == ' ')
4857                                 src++;
4858                         /*
4859                          * Remove any 'Genuine' or "Authentic" prefixes
4860                          */
4861                         if (strncmp(src, "Genuine ", 8) == 0)
4862                                 src += 8;
4863                         if (strncmp(src, "Authentic ", 10) == 0)
4864                                 src += 10;
4865 
4866                         /*
4867                          * Now do an in-place copy.
4868                          * Map (R) to (r) and (TM) to (tm).
4869                          * The era of teletypes is long gone, and there's
4870                          * -really- no need to shout.
4871                          */
4872                         while (*src != '\0') {
4873                                 if (src[0] == '(') {
4874                                         if (strncmp(src + 1, "R)", 2) == 0) {
4875                                                 (void) strncpy(dst, "(r)", 3);
4876                                                 src += 3;
4877                                                 dst += 3;
4878                                                 continue;
4879                                         }
4880                                         if (strncmp(src + 1, "TM)", 3) == 0) {
4881                                                 (void) strncpy(dst, "(tm)", 4);
4882                                                 src += 4;
4883                                                 dst += 4;
4884                                                 continue;
4885                                         }
4886                                 }
4887                                 *dst++ = *src++;
4888                         }
4889                         *dst = '\0';
4890 
4891                         /*
4892                          * Finally, remove any trailing spaces
4893                          */
4894                         while (--dst > cpi->cpi_brandstr)
4895                                 if (*dst == ' ')
4896                                         *dst = '\0';
4897                                 else
4898                                         break;
4899                 } else
4900                         fabricate_brandstr(cpi);
4901         }
4902         cpi->cpi_pass = 3;
4903 }
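
     /*
      * As an illustration of the cleanup above, a raw brand string such as
      * "  Genuine Intel(R) Xeon(TM) CPU  " comes out as
      * "Intel(r) Xeon(tm) CPU".
      */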
4904 
4905 /*
4906  * This routine is called out of bind_hwcap() much later in the life
4907  * of the kernel (post_startup()).  The job of this routine is to resolve
4908  * the hardware feature support and kernel support for those features into
4909  * what we're actually going to tell applications via the aux vector.
4910  */
4911 void
4912 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4913 {
4914         struct cpuid_info *cpi;
4915         uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4916 
4917         if (cpu == NULL)
4918                 cpu = CPU;
4919         cpi = cpu->cpu_m.mcpu_cpi;
4920 
4921         ASSERT(cpi->cpi_pass == 3);
4922 
4923         if (cpi->cpi_maxeax >= 1) {
4924                 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4925                 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4926                 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4927 
4928                 *edx = CPI_FEATURES_EDX(cpi);
4929                 *ecx = CPI_FEATURES_ECX(cpi);
4930                 *ebx = CPI_FEATURES_7_0_EBX(cpi);
4931 
4932                 /*
4933                  * [these require explicit kernel support]
4934                  */
4935                 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4936                         *edx &= ~CPUID_INTC_EDX_SEP;
4937 
4938                 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4939                         *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4940                 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4941                         *edx &= ~CPUID_INTC_EDX_SSE2;
4942 
4943                 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4944                         *edx &= ~CPUID_INTC_EDX_HTT;
4945 
4946                 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4947                         *ecx &= ~CPUID_INTC_ECX_SSE3;
4948 
4949                 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4950                         *ecx &= ~CPUID_INTC_ECX_SSSE3;
4951                 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4952                         *ecx &= ~CPUID_INTC_ECX_SSE4_1;
4953                 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4954                         *ecx &= ~CPUID_INTC_ECX_SSE4_2;
4955                 if (!is_x86_feature(x86_featureset, X86FSET_AES))
4956                         *ecx &= ~CPUID_INTC_ECX_AES;
4957                 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4958                         *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4959                 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4960                         *ecx &= ~(CPUID_INTC_ECX_XSAVE |
4961                             CPUID_INTC_ECX_OSXSAVE);
4962                 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4963                         *ecx &= ~CPUID_INTC_ECX_AVX;
4964                 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4965                         *ecx &= ~CPUID_INTC_ECX_F16C;
4966                 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4967                         *ecx &= ~CPUID_INTC_ECX_FMA;
4968                 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4969                         *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4970                 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4971                         *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4972                 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4973                         *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4974                 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4975                         *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4976                 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4977                         *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4978 
4979                 /*
4980                  * [no explicit support required beyond x87 fp context]
4981                  */
4982                 if (!fpu_exists)
4983                         *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4984 
4985                 /*
4986                  * Now map the supported feature vector to things that we
4987                  * think userland will care about.
4988                  */
4989                 if (*edx & CPUID_INTC_EDX_SEP)
4990                         hwcap_flags |= AV_386_SEP;
4991                 if (*edx & CPUID_INTC_EDX_SSE)
4992                         hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4993                 if (*edx & CPUID_INTC_EDX_SSE2)
4994                         hwcap_flags |= AV_386_SSE2;
4995                 if (*ecx & CPUID_INTC_ECX_SSE3)
4996                         hwcap_flags |= AV_386_SSE3;
4997                 if (*ecx & CPUID_INTC_ECX_SSSE3)
4998                         hwcap_flags |= AV_386_SSSE3;
4999                 if (*ecx & CPUID_INTC_ECX_SSE4_1)
5000                         hwcap_flags |= AV_386_SSE4_1;
5001                 if (*ecx & CPUID_INTC_ECX_SSE4_2)
5002                         hwcap_flags |= AV_386_SSE4_2;
5003                 if (*ecx & CPUID_INTC_ECX_MOVBE)
5004                         hwcap_flags |= AV_386_MOVBE;
5005                 if (*ecx & CPUID_INTC_ECX_AES)
5006                         hwcap_flags |= AV_386_AES;
5007                 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5008                         hwcap_flags |= AV_386_PCLMULQDQ;
5009                 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5010                     (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5011                         hwcap_flags |= AV_386_XSAVE;
5012 
5013                         if (*ecx & CPUID_INTC_ECX_AVX) {
5014                                 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5015                                 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5016 
5017                                 hwcap_flags |= AV_386_AVX;
5018                                 if (*ecx & CPUID_INTC_ECX_F16C)
5019                                         hwcap_flags_2 |= AV_386_2_F16C;
5020                                 if (*ecx & CPUID_INTC_ECX_FMA)
5021                                         hwcap_flags_2 |= AV_386_2_FMA;
5022 
5023                                 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5024                                         hwcap_flags_2 |= AV_386_2_BMI1;
5025                                 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5026                                         hwcap_flags_2 |= AV_386_2_BMI2;
5027                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5028                                         hwcap_flags_2 |= AV_386_2_AVX2;
5029                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5030                                         hwcap_flags_2 |= AV_386_2_AVX512F;
5031                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5032                                         hwcap_flags_2 |= AV_386_2_AVX512DQ;
5033                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5034                                         hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5035                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5036                                         hwcap_flags_2 |= AV_386_2_AVX512PF;
5037                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5038                                         hwcap_flags_2 |= AV_386_2_AVX512ER;
5039                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5040                                         hwcap_flags_2 |= AV_386_2_AVX512CD;
5041                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5042                                         hwcap_flags_2 |= AV_386_2_AVX512BW;
5043                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5044                                         hwcap_flags_2 |= AV_386_2_AVX512VL;
5045 
5046                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5047                                         hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5048                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5049                                         hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5050                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5051                                         hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5052 
5053                                 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5054                                         hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5055                                 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5056                                         hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5057                         }
5058                 }
5059                 if (*ecx & CPUID_INTC_ECX_VMX)
5060                         hwcap_flags |= AV_386_VMX;
5061                 if (*ecx & CPUID_INTC_ECX_POPCNT)
5062                         hwcap_flags |= AV_386_POPCNT;
5063                 if (*edx & CPUID_INTC_EDX_FPU)
5064                         hwcap_flags |= AV_386_FPU;
5065                 if (*edx & CPUID_INTC_EDX_MMX)
5066                         hwcap_flags |= AV_386_MMX;
5067 
5068                 if (*edx & CPUID_INTC_EDX_TSC)
5069                         hwcap_flags |= AV_386_TSC;
5070                 if (*edx & CPUID_INTC_EDX_CX8)
5071                         hwcap_flags |= AV_386_CX8;
5072                 if (*edx & CPUID_INTC_EDX_CMOV)
5073                         hwcap_flags |= AV_386_CMOV;
5074                 if (*ecx & CPUID_INTC_ECX_CX16)
5075                         hwcap_flags |= AV_386_CX16;
5076 
5077                 if (*ecx & CPUID_INTC_ECX_RDRAND)
5078                         hwcap_flags_2 |= AV_386_2_RDRAND;
5079                 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5080                         hwcap_flags_2 |= AV_386_2_ADX;
5081                 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5082                         hwcap_flags_2 |= AV_386_2_RDSEED;
5083                 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5084                         hwcap_flags_2 |= AV_386_2_SHA;
5085                 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5086                         hwcap_flags_2 |= AV_386_2_FSGSBASE;
5087                 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5088                         hwcap_flags_2 |= AV_386_2_CLWB;
5089                 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5090                         hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5091 
5092         }
5093         /*
5094          * Check a few miscellaneous features.
5095          */
5096         if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5097                 hwcap_flags_2 |= AV_386_2_CLZERO;
5098 
5099         if (cpi->cpi_xmaxeax < 0x80000001)
5100                 goto pass4_done;
5101 
5102         switch (cpi->cpi_vendor) {
5103                 struct cpuid_regs cp;
5104                 uint32_t *edx, *ecx;
5105 
5106         case X86_VENDOR_Intel:
5107                 /*
5108                  * Seems like Intel duplicated what was necessary
5109                  * here to make the initial crop of 64-bit OSes work.
5110                  * Hopefully, those are the only "extended" bits
5111                  * they'll add.
5112                  */
5113                 /*FALLTHROUGH*/
5114 
5115         case X86_VENDOR_AMD:
5116                 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5117                 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5118 
5119                 *edx = CPI_FEATURES_XTD_EDX(cpi);
5120                 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5121 
5122                 /*
5123                  * [these features require explicit kernel support]
5124                  */
5125                 switch (cpi->cpi_vendor) {
5126                 case X86_VENDOR_Intel:
5127                         if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5128                                 *edx &= ~CPUID_AMD_EDX_TSCP;
5129                         break;
5130 
5131                 case X86_VENDOR_AMD:
5132                         if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5133                                 *edx &= ~CPUID_AMD_EDX_TSCP;
5134                         if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5135                                 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5136                         break;
5137 
5138                 default:
5139                         break;
5140                 }
5141 
5142                 /*
5143                  * [no explicit support required beyond
5144                  * x87 fp context and exception handlers]
5145                  */
5146                 if (!fpu_exists)
5147                         *edx &= ~(CPUID_AMD_EDX_MMXamd |
5148                             CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5149 
5150                 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5151                         *edx &= ~CPUID_AMD_EDX_NX;
5152 #if !defined(__amd64)
5153                 *edx &= ~CPUID_AMD_EDX_LM;
5154 #endif
5155                 /*
5156                  * Now map the supported feature vector to
5157                  * things that we think userland will care about.
5158                  */
5159 #if defined(__amd64)
5160                 if (*edx & CPUID_AMD_EDX_SYSC)
5161                         hwcap_flags |= AV_386_AMD_SYSC;
5162 #endif
5163                 if (*edx & CPUID_AMD_EDX_MMXamd)
5164                         hwcap_flags |= AV_386_AMD_MMX;
5165                 if (*edx & CPUID_AMD_EDX_3DNow)
5166                         hwcap_flags |= AV_386_AMD_3DNow;
5167                 if (*edx & CPUID_AMD_EDX_3DNowx)
5168                         hwcap_flags |= AV_386_AMD_3DNowx;
5169                 if (*ecx & CPUID_AMD_ECX_SVM)
5170                         hwcap_flags |= AV_386_AMD_SVM;
5171 
5172                 switch (cpi->cpi_vendor) {
5173                 case X86_VENDOR_AMD:
5174                         if (*edx & CPUID_AMD_EDX_TSCP)
5175                                 hwcap_flags |= AV_386_TSCP;
5176                         if (*ecx & CPUID_AMD_ECX_AHF64)
5177                                 hwcap_flags |= AV_386_AHF;
5178                         if (*ecx & CPUID_AMD_ECX_SSE4A)
5179                                 hwcap_flags |= AV_386_AMD_SSE4A;
5180                         if (*ecx & CPUID_AMD_ECX_LZCNT)
5181                                 hwcap_flags |= AV_386_AMD_LZCNT;
5182                         if (*ecx & CPUID_AMD_ECX_MONITORX)
5183                                 hwcap_flags_2 |= AV_386_2_MONITORX;
5184                         break;
5185 
5186                 case X86_VENDOR_Intel:
5187                         if (*edx & CPUID_AMD_EDX_TSCP)
5188                                 hwcap_flags |= AV_386_TSCP;
5189                         if (*ecx & CPUID_AMD_ECX_LZCNT)
5190                                 hwcap_flags |= AV_386_AMD_LZCNT;
5191                         /*
5192                          * Aarrgh.
5193                          * Intel uses a different bit in the same word.
5194                          */
5195                         if (*ecx & CPUID_INTC_ECX_AHF64)
5196                                 hwcap_flags |= AV_386_AHF;
5197                         break;
5198 
5199                 default:
5200                         break;
5201                 }
5202                 break;
5203 
5204         case X86_VENDOR_TM:
5205                 cp.cp_eax = 0x80860001;
5206                 (void) __cpuid_insn(&cp);
5207                 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5208                 break;
5209 
5210         default:
5211                 break;
5212         }
5213 
5214 pass4_done:
5215         cpi->cpi_pass = 4;
5216         if (hwcap_out != NULL) {
5217                 hwcap_out[0] = hwcap_flags;
5218                 hwcap_out[1] = hwcap_flags_2;
5219         }
5220 }
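
     /*
      * A minimal sketch of how a caller such as bind_hwcap() might consume
      * these flags (illustrative only, not a copy of that code):
      *
      *         uint_t hwcaps[2];
      *
      *         cpuid_pass4(NULL, hwcaps);
      *
      * hwcaps[0] then holds the AV_386_* bits and hwcaps[1] the AV_386_2_*
      * bits that are advertised to userland.
      */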
5221
5223 /*
5224  * Simulate the cpuid instruction using the data we previously
5225  * captured about this CPU.  We try our best to return the truth
5226  * about the hardware, independently of kernel support.
5227  */
5228 uint32_t
5229 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5230 {
5231         struct cpuid_info *cpi;
5232         struct cpuid_regs *xcp;
5233 
5234         if (cpu == NULL)
5235                 cpu = CPU;
5236         cpi = cpu->cpu_m.mcpu_cpi;
5237 
5238         ASSERT(cpuid_checkpass(cpu, 3));
5239 
5240         /*
5241          * CPUID data is cached in two separate places: cpi_std for standard
5242          * CPUID leaves, and cpi_extd for extended CPUID leaves.
5243          */
5244         if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5245                 xcp = &cpi->cpi_std[cp->cp_eax];
5246         } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5247             cp->cp_eax <= cpi->cpi_xmaxeax &&
5248             cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5249                 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5250         } else {
5251                 /*
5252                  * The caller is asking for data from an input parameter which
5253                  * the kernel has not cached.  In this case we go fetch from
5254                  * the hardware and return the data directly to the user.
5255                  */
5256                 return (__cpuid_insn(cp));
5257         }
5258 
5259         cp->cp_eax = xcp->cp_eax;
5260         cp->cp_ebx = xcp->cp_ebx;
5261         cp->cp_ecx = xcp->cp_ecx;
5262         cp->cp_edx = xcp->cp_edx;
5263         return (cp->cp_eax);
5264 }
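
     /*
      * For example, a caller wanting the cached view of basic leaf 1 might
      * do (illustrative only):
      *
      *         struct cpuid_regs regs = { 0 };
      *
      *         regs.cp_eax = 1;
      *         (void) cpuid_insn(NULL, &regs);
      *
      * Leaf 1 falls within the cached cpi_std range, so this returns the
      * stored register values without re-executing the cpuid instruction.
      */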
5265 
5266 int
5267 cpuid_checkpass(cpu_t *cpu, int pass)
5268 {
5269         return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5270             cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5271 }
5272 
5273 int
5274 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5275 {
5276         ASSERT(cpuid_checkpass(cpu, 3));
5277 
5278         return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5279 }
5280 
5281 int
5282 cpuid_is_cmt(cpu_t *cpu)
5283 {
5284         if (cpu == NULL)
5285                 cpu = CPU;
5286 
5287         ASSERT(cpuid_checkpass(cpu, 1));
5288 
5289         return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5290 }
5291 
5292 /*
5293  * AMD and Intel both implement the 64-bit variant of the syscall
5294  * instruction (syscallq), so if there's -any- support for syscall,
5295  * cpuid currently says "yes, we support this".
5296  *
5297  * However, Intel decided to -not- implement the 32-bit variant of the
5298  * syscall instruction, so we provide a predicate to allow our caller
5299  * to test that subtlety here.
5300  *
5301  * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5302  *      even in the case where the hardware would in fact support it.
5303  */
5304 /*ARGSUSED*/
5305 int
5306 cpuid_syscall32_insn(cpu_t *cpu)
5307 {
5308         ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5309 
5310 #if !defined(__xpv)
5311         if (cpu == NULL)
5312                 cpu = CPU;
5313 
5314         /*CSTYLED*/
5315         {
5316                 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5317 
5318                 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5319                     cpi->cpi_xmaxeax >= 0x80000001 &&
5320                     (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5321                         return (1);
5322         }
5323 #endif
5324         return (0);
5325 }
5326 
5327 int
5328 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5329 {
5330         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5331 
5332         static const char fmt[] =
5333             "x86 (%s %X family %d model %d step %d clock %d MHz)";
5334         static const char fmt_ht[] =
5335             "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5336 
5337         ASSERT(cpuid_checkpass(cpu, 1));
5338 
5339         if (cpuid_is_cmt(cpu))
5340                 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5341                     cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5342                     cpi->cpi_family, cpi->cpi_model,
5343                     cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5344         return (snprintf(s, n, fmt,
5345             cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5346             cpi->cpi_family, cpi->cpi_model,
5347             cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5348 }
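
     /*
      * As an illustration, a hypothetical non-CMT CPU with vendor string
      * "GenuineIntel", signature 0x6f2 (family 6, model 15, step 2) and a
      * 2327 MHz clock would be rendered as:
      *
      *         x86 (GenuineIntel 6F2 family 6 model 15 step 2 clock 2327 MHz)
      */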
5349 
5350 const char *
5351 cpuid_getvendorstr(cpu_t *cpu)
5352 {
5353         ASSERT(cpuid_checkpass(cpu, 1));
5354         return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5355 }
5356 
5357 uint_t
5358 cpuid_getvendor(cpu_t *cpu)
5359 {
5360         ASSERT(cpuid_checkpass(cpu, 1));
5361         return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5362 }
5363 
5364 uint_t
5365 cpuid_getfamily(cpu_t *cpu)
5366 {
5367         ASSERT(cpuid_checkpass(cpu, 1));
5368         return (cpu->cpu_m.mcpu_cpi->cpi_family);
5369 }
5370 
5371 uint_t
5372 cpuid_getmodel(cpu_t *cpu)
5373 {
5374         ASSERT(cpuid_checkpass(cpu, 1));
5375         return (cpu->cpu_m.mcpu_cpi->cpi_model);
5376 }
5377 
5378 uint_t
5379 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5380 {
5381         ASSERT(cpuid_checkpass(cpu, 1));
5382         return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5383 }
5384 
5385 uint_t
5386 cpuid_get_ncore_per_chip(cpu_t *cpu)
5387 {
5388         ASSERT(cpuid_checkpass(cpu, 1));
5389         return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5390 }
5391 
5392 uint_t
5393 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5394 {
5395         ASSERT(cpuid_checkpass(cpu, 2));
5396         return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5397 }
5398 
5399 id_t
5400 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5401 {
5402         ASSERT(cpuid_checkpass(cpu, 2));
5403         return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5404 }
5405 
5406 uint_t
5407 cpuid_getstep(cpu_t *cpu)
5408 {
5409         ASSERT(cpuid_checkpass(cpu, 1));
5410         return (cpu->cpu_m.mcpu_cpi->cpi_step);
5411 }
5412 
5413 uint_t
5414 cpuid_getsig(struct cpu *cpu)
5415 {
5416         ASSERT(cpuid_checkpass(cpu, 1));
5417         return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5418 }
5419 
5420 uint32_t
5421 cpuid_getchiprev(struct cpu *cpu)
5422 {
5423         ASSERT(cpuid_checkpass(cpu, 1));
5424         return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5425 }
5426 
5427 const char *
5428 cpuid_getchiprevstr(struct cpu *cpu)
5429 {
5430         ASSERT(cpuid_checkpass(cpu, 1));
5431         return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5432 }
5433 
5434 uint32_t
5435 cpuid_getsockettype(struct cpu *cpu)
5436 {
5437         ASSERT(cpuid_checkpass(cpu, 1));
5438         return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5439 }
5440 
5441 const char *
5442 cpuid_getsocketstr(cpu_t *cpu)
5443 {
5444         static const char *socketstr = NULL;
5445         struct cpuid_info *cpi;
5446 
5447         ASSERT(cpuid_checkpass(cpu, 1));
5448         cpi = cpu->cpu_m.mcpu_cpi;
5449 
5450         /* Assume that socket types are the same across the system */
5451         if (socketstr == NULL)
5452                 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5453                     cpi->cpi_model, cpi->cpi_step);
5454
5456         return (socketstr);
5457 }
5458 
5459 int
5460 cpuid_get_chipid(cpu_t *cpu)
5461 {
5462         ASSERT(cpuid_checkpass(cpu, 1));
5463 
5464         if (cpuid_is_cmt(cpu))
5465                 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5466         return (cpu->cpu_id);
5467 }
5468 
5469 id_t
5470 cpuid_get_coreid(cpu_t *cpu)
5471 {
5472         ASSERT(cpuid_checkpass(cpu, 1));
5473         return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5474 }
5475 
5476 int
5477 cpuid_get_pkgcoreid(cpu_t *cpu)
5478 {
5479         ASSERT(cpuid_checkpass(cpu, 1));
5480         return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5481 }
5482 
5483 int
5484 cpuid_get_clogid(cpu_t *cpu)
5485 {
5486         ASSERT(cpuid_checkpass(cpu, 1));
5487         return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5488 }
5489 
5490 int
5491 cpuid_get_cacheid(cpu_t *cpu)
5492 {
5493         ASSERT(cpuid_checkpass(cpu, 1));
5494         return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5495 }
5496 
5497 uint_t
5498 cpuid_get_procnodeid(cpu_t *cpu)
5499 {
5500         ASSERT(cpuid_checkpass(cpu, 1));
5501         return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5502 }
5503 
5504 uint_t
5505 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5506 {
5507         ASSERT(cpuid_checkpass(cpu, 1));
5508         return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5509 }
5510 
5511 uint_t
5512 cpuid_get_compunitid(cpu_t *cpu)
5513 {
5514         ASSERT(cpuid_checkpass(cpu, 1));
5515         return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5516 }
5517 
5518 uint_t
5519 cpuid_get_cores_per_compunit(cpu_t *cpu)
5520 {
5521         ASSERT(cpuid_checkpass(cpu, 1));
5522         return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5523 }
5524 
5525 /*ARGSUSED*/
5526 int
5527 cpuid_have_cr8access(cpu_t *cpu)
5528 {
5529 #if defined(__amd64)
5530         return (1);
5531 #else
5532         struct cpuid_info *cpi;
5533 
5534         ASSERT(cpu != NULL);
5535         cpi = cpu->cpu_m.mcpu_cpi;
5536         if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5537             (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5538                 return (1);
5539         return (0);
5540 #endif
5541 }
5542 
5543 uint32_t
5544 cpuid_get_apicid(cpu_t *cpu)
5545 {
5546         ASSERT(cpuid_checkpass(cpu, 1));
5547         if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5548                 return (UINT32_MAX);
5549         } else {
5550                 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5551         }
5552 }
5553 
5554 void
5555 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5556 {
5557         struct cpuid_info *cpi;
5558 
5559         if (cpu == NULL)
5560                 cpu = CPU;
5561         cpi = cpu->cpu_m.mcpu_cpi;
5562 
5563         ASSERT(cpuid_checkpass(cpu, 1));
5564 
5565         if (pabits)
5566                 *pabits = cpi->cpi_pabits;
5567         if (vabits)
5568                 *vabits = cpi->cpi_vabits;
5569 }
5570 
5571 size_t
5572 cpuid_get_xsave_size()
5573 {
5574         return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5575             sizeof (struct xsave_state)));
5576 }
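
     /*
      * The MAX() above ensures the returned size is never smaller than the
      * kernel's own struct xsave_state, even if the hardware-enumerated
      * maximum (xsav_max_size) happens to be.
      */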
5577 
5578 /*
5579  * Return true if the CPUs on this system require 'pointer clearing' for the
5580  * floating point error pointer exception handling. In the past, this has been
5581  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5582  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5583  * feature bit and is reflected in the cpi_fp_amd_save member.
5584  */
5585 boolean_t
5586 cpuid_need_fp_excp_handling()
5587 {
5588         return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5589             cpuid_info0.cpi_fp_amd_save != 0);
5590 }
5591 
5592 /*
5593  * Returns the number of data TLB entries for a corresponding
5594  * pagesize.  If it can't be computed, or isn't known, the
5595  * routine returns zero.  If you ask about an architecturally
5596  * impossible pagesize, the routine will panic (so that the
5597  * hat implementor knows that things are inconsistent.)
5598  */
5599 uint_t
5600 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5601 {
5602         struct cpuid_info *cpi;
5603         uint_t dtlb_nent = 0;
5604 
5605         if (cpu == NULL)
5606                 cpu = CPU;
5607         cpi = cpu->cpu_m.mcpu_cpi;
5608 
5609         ASSERT(cpuid_checkpass(cpu, 1));
5610 
5611         /*
5612          * Check the L2 TLB info
5613          */
5614         if (cpi->cpi_xmaxeax >= 0x80000006) {
5615                 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5616 
5617                 switch (pagesize) {
5618 
5619                 case 4 * 1024:
5620                         /*
5621                          * All zero in the top 16 bits of the register
5622                          * indicates a unified TLB. Size is in low 16 bits.
5623                          */
5624                         if ((cp->cp_ebx & 0xffff0000) == 0)
5625                                 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5626                         else
5627                                 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5628                         break;
5629 
5630                 case 2 * 1024 * 1024:
5631                         if ((cp->cp_eax & 0xffff0000) == 0)
5632                                 dtlb_nent = cp->cp_eax & 0x0000ffff;
5633                         else
5634                                 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5635                         break;
5636 
5637                 default:
5638                         panic("unknown L2 pagesize");
5639                         /*NOTREACHED*/
5640                 }
5641         }
5642 
5643         if (dtlb_nent != 0)
5644                 return (dtlb_nent);
5645 
5646         /*
5647          * No L2 TLB support for this size, try L1.
5648          */
5649         if (cpi->cpi_xmaxeax >= 0x80000005) {
5650                 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5651 
5652                 switch (pagesize) {
5653                 case 4 * 1024:
5654                         dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5655                         break;
5656                 case 2 * 1024 * 1024:
5657                         dtlb_nent = BITX(cp->cp_eax, 23, 16);
5658                         break;
5659                 default:
5660                         panic("unknown L1 d-TLB pagesize");
5661                         /*NOTREACHED*/
5662                 }
5663         }
5664 
5665         return (dtlb_nent);
5666 }
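
     /*
      * Worked example with a made-up register value: were extended leaf
      * 0x80000006 to report cp_ebx == 0x02000200, the non-zero upper half
      * would indicate separate I and D TLBs for 4K pages, and
      * BITX(cp_ebx, 27, 16) == 0x200 would mean a 512-entry L2 d-TLB.
      */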
5667 
5668 /*
5669  * Return 0 if the erratum is not present or not applicable, positive
5670  * if it is, and negative if the status of the erratum is unknown.
5671  *
5672  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5673  * Processors" #25759, Rev 3.57, August 2005
5674  */
5675 int
5676 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5677 {
5678         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5679         uint_t eax;
5680 
5681         /*
5682          * Bail out if this CPU isn't an AMD CPU, or if it's
5683          * a legacy (32-bit) AMD CPU.
5684          */
5685         if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5686             cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5687             cpi->cpi_family == 6) {
5688                 return (0);
5689         }
5690 
5691         eax = cpi->cpi_std[1].cp_eax;
5692 
5693 #define SH_B0(eax)      (eax == 0xf40 || eax == 0xf50)
5694 #define SH_B3(eax)      (eax == 0xf51)
5695 #define B(eax)          (SH_B0(eax) || SH_B3(eax))
5696 
5697 #define SH_C0(eax)      (eax == 0xf48 || eax == 0xf58)
5698 
5699 #define SH_CG(eax)      (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5700 #define DH_CG(eax)      (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5701 #define CH_CG(eax)      (eax == 0xf82 || eax == 0xfb2)
5702 #define CG(eax)         (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5703 
5704 #define SH_D0(eax)      (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5705 #define DH_D0(eax)      (eax == 0x10fc0 || eax == 0x10ff0)
5706 #define CH_D0(eax)      (eax == 0x10f80 || eax == 0x10fb0)
5707 #define D0(eax)         (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5708 
5709 #define SH_E0(eax)      (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5710 #define JH_E1(eax)      (eax == 0x20f10)        /* JH8_E0 had 0x20f30 */
5711 #define DH_E3(eax)      (eax == 0x20fc0 || eax == 0x20ff0)
5712 #define SH_E4(eax)      (eax == 0x20f51 || eax == 0x20f71)
5713 #define BH_E4(eax)      (eax == 0x20fb1)
5714 #define SH_E5(eax)      (eax == 0x20f42)
5715 #define DH_E6(eax)      (eax == 0x20ff2 || eax == 0x20fc2)
5716 #define JH_E6(eax)      (eax == 0x20f12 || eax == 0x20f32)
5717 #define EX(eax)         (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5718                             SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5719                             DH_E6(eax) || JH_E6(eax))
5720 
5721 #define DR_AX(eax)      (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5722 #define DR_B0(eax)      (eax == 0x100f20)
5723 #define DR_B1(eax)      (eax == 0x100f21)
5724 #define DR_BA(eax)      (eax == 0x100f2a)
5725 #define DR_B2(eax)      (eax == 0x100f22)
5726 #define DR_B3(eax)      (eax == 0x100f23)
5727 #define RB_C0(eax)      (eax == 0x100f40)
5728 
5729         switch (erratum) {
5730         case 1:
5731                 return (cpi->cpi_family < 0x10);
5732         case 51:        /* what does the asterisk mean? */
5733                 return (B(eax) || SH_C0(eax) || CG(eax));
5734         case 52:
5735                 return (B(eax));
5736         case 57:
5737                 return (cpi->cpi_family <= 0x11);
5738         case 58:
5739                 return (B(eax));
5740         case 60:
5741                 return (cpi->cpi_family <= 0x11);
5742         case 61:
5743         case 62:
5744         case 63:
5745         case 64:
5746         case 65:
5747         case 66:
5748         case 68:
5749         case 69:
5750         case 70:
5751         case 71:
5752                 return (B(eax));
5753         case 72:
5754                 return (SH_B0(eax));
5755         case 74:
5756                 return (B(eax));
5757         case 75:
5758                 return (cpi->cpi_family < 0x10);
5759         case 76:
5760                 return (B(eax));
5761         case 77:
5762                 return (cpi->cpi_family <= 0x11);
5763         case 78:
5764                 return (B(eax) || SH_C0(eax));
5765         case 79:
5766                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5767         case 80:
5768         case 81:
5769         case 82:
5770                 return (B(eax));
5771         case 83:
5772                 return (B(eax) || SH_C0(eax) || CG(eax));
5773         case 85:
5774                 return (cpi->cpi_family < 0x10);
5775         case 86:
5776                 return (SH_C0(eax) || CG(eax));
5777         case 88:
5778 #if !defined(__amd64)
5779                 return (0);
5780 #else
5781                 return (B(eax) || SH_C0(eax));
5782 #endif
5783         case 89:
5784                 return (cpi->cpi_family < 0x10);
5785         case 90:
5786                 return (B(eax) || SH_C0(eax) || CG(eax));
5787         case 91:
5788         case 92:
5789                 return (B(eax) || SH_C0(eax));
5790         case 93:
5791                 return (SH_C0(eax));
5792         case 94:
5793                 return (B(eax) || SH_C0(eax) || CG(eax));
5794         case 95:
5795 #if !defined(__amd64)
5796                 return (0);
5797 #else
5798                 return (B(eax) || SH_C0(eax));
5799 #endif
5800         case 96:
5801                 return (B(eax) || SH_C0(eax) || CG(eax));
5802         case 97:
5803         case 98:
5804                 return (SH_C0(eax) || CG(eax));
5805         case 99:
5806                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5807         case 100:
5808                 return (B(eax) || SH_C0(eax));
5809         case 101:
5810         case 103:
5811                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5812         case 104:
5813                 return (SH_C0(eax) || CG(eax) || D0(eax));
5814         case 105:
5815         case 106:
5816         case 107:
5817                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5818         case 108:
5819                 return (DH_CG(eax));
5820         case 109:
5821                 return (SH_C0(eax) || CG(eax) || D0(eax));
5822         case 110:
5823                 return (D0(eax) || EX(eax));
5824         case 111:
5825                 return (CG(eax));
5826         case 112:
5827                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5828         case 113:
5829                 return (eax == 0x20fc0);
5830         case 114:
5831                 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5832         case 115:
5833                 return (SH_E0(eax) || JH_E1(eax));
5834         case 116:
5835                 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5836         case 117:
5837                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5838         case 118:
5839                 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5840                     JH_E6(eax));
5841         case 121:
5842                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5843         case 122:
5844                 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5845         case 123:
5846                 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5847         case 131:
5848                 return (cpi->cpi_family < 0x10);
5849         case 6336786:
5850 
5851                 /*
5852                  * Test for AdvPowerMgmtInfo.TscPStateInvariant
5853                  * if this is a K8 family or newer processor. We're testing for
5854                  * this 'erratum' to determine whether or not we have a constant
5855                  * TSC.
5856                  *
5857                  * Our current fix for this is to disable the C1-Clock ramping.
5858                  * However, this doesn't work on newer processor families nor
5859                  * does it work when virtualized as those devices don't exist.
5860                  */
5861                 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5862                         return (0);
5863                 }
5864 
5865                 if (CPI_FAMILY(cpi) == 0xf) {
5866                         struct cpuid_regs regs;
5867                         regs.cp_eax = 0x80000007;
5868                         (void) __cpuid_insn(&regs);
5869                         return (!(regs.cp_edx & 0x100));
5870                 }
5871                 return (0);
5872         case 6323525:
5873                 /*
5874                  * This erratum (K8 #147) is not present on family 10 and newer.
5875                  */
5876                 if (cpi->cpi_family >= 0x10) {
5877                         return (0);
5878                 }
5879                 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5880                     (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5881 
5882         case 6671130:
5883                 /*
5884                  * Check for processors (pre-Shanghai) that do not provide
5885                  * optimal management of 1GB PTEs in their TLB.
5886                  */
5887                 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5888 
5889         case 298:
5890                 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5891                     DR_B2(eax) || RB_C0(eax));
5892 
5893         case 721:
5894 #if defined(__amd64)
5895                 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5896 #else
5897                 return (0);
5898 #endif
5899 
5900         default:
5901                 return (-1);
5902 
5903         }
5904 }
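
     /*
      * Callers typically test for a positive return, e.g. (illustrative):
      *
      *         if (cpuid_opteron_erratum(cpu, 95) > 0) {
      *                 ... apply the erratum 95 workaround ...
      *         }
      *
      * A negative return means the status is unknown, which callers may
      * wish to report rather than silently ignore.
      */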
5905 
5906 /*
5907  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5908  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5909  */
5910 int
5911 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5912 {
5913         struct cpuid_info       *cpi;
5914         uint_t                  osvwid;
5915         static int              osvwfeature = -1;
5916         uint64_t                osvwlength;
5917
5919         cpi = cpu->cpu_m.mcpu_cpi;
5920 
5921         /* confirm OSVW supported */
5922         if (osvwfeature == -1) {
5923                 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5924         } else {
5925                 /* assert that osvw feature setting is consistent on all cpus */
5926                 ASSERT(osvwfeature ==
5927                     (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5928         }
5929         if (!osvwfeature)
5930                 return (-1);
5931 
5932         osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5933 
5934         switch (erratum) {
5935         case 298:       /* osvwid is 0 */
5936                 osvwid = 0;
5937                 if (osvwlength <= (uint64_t)osvwid) {
5938                         /* osvwid 0 is unknown */
5939                         return (-1);
5940                 }
5941 
5942                 /*
5943                  * Check the OSVW STATUS MSR to determine the state
5944                  * of the erratum where:
5945                  *   0 - fixed by HW
5946                  *   1 - BIOS has applied the workaround when BIOS
5947                  *   workaround is available. (Or for other errata,
5948                  *   OS workaround is required.)
5949                  * For a value of 1, caller will confirm that the
5950                  * erratum 298 workaround has indeed been applied by BIOS.
5951                  *
5952                  * A 1 may be set in cpus that have a HW fix
5953                  * in a mixed cpu system. Regarding erratum 298:
5954                  *   In a multiprocessor platform, the workaround above
5955                  *   should be applied to all processors regardless of
5956                  *   silicon revision when an affected processor is
5957                  *   present.
5958                  */
5959 
5960                 return (rdmsr(MSR_AMD_OSVW_STATUS +
5961                     (osvwid / OSVW_ID_CNT_PER_MSR)) &
5962                     (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5963 
5964         default:
5965                 return (-1);
5966         }
5967 }
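
     /*
      * For erratum 298, osvwid 0 selects bit 0 of MSR_AMD_OSVW_STATUS
      * itself, since 0 / OSVW_ID_CNT_PER_MSR == 0 and
      * 1ULL << (0 % OSVW_ID_CNT_PER_MSR) is bit 0; larger ids would index
      * into subsequent status MSRs.
      */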
5968 
5969 static const char assoc_str[] = "associativity";
5970 static const char line_str[] = "line-size";
5971 static const char size_str[] = "size";
5972 
5973 static void
5974 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5975     uint32_t val)
5976 {
5977         char buf[128];
5978 
5979         /*
5980          * ndi_prop_update_int() is used because it is desirable for
5981          * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
5982          */
5983         if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5984                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5985 }
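
     /*
      * For example, add_cache_prop(devi, l2_cache_str, size_str, 512 * 1024)
      * creates an integer property named "l2-cache-size" with value 524288.
      */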
5986 
5987 /*
5988  * Intel-style cache/tlb description
5989  *
5990  * Standard cpuid level 2 gives a randomly ordered
5991  * selection of tags that index into a table that describes
5992  * cache and tlb properties.
5993  */
5994 
5995 static const char l1_icache_str[] = "l1-icache";
5996 static const char l1_dcache_str[] = "l1-dcache";
5997 static const char l2_cache_str[] = "l2-cache";
5998 static const char l3_cache_str[] = "l3-cache";
5999 static const char itlb4k_str[] = "itlb-4K";
6000 static const char dtlb4k_str[] = "dtlb-4K";
6001 static const char itlb2M_str[] = "itlb-2M";
6002 static const char itlb4M_str[] = "itlb-4M";
6003 static const char dtlb4M_str[] = "dtlb-4M";
6004 static const char dtlb24_str[] = "dtlb0-2M-4M";
6005 static const char itlb424_str[] = "itlb-4K-2M-4M";
6006 static const char itlb24_str[] = "itlb-2M-4M";
6007 static const char dtlb44_str[] = "dtlb-4K-4M";
6008 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6009 static const char sl2_cache_str[] = "sectored-l2-cache";
6010 static const char itrace_str[] = "itrace-cache";
6011 static const char sl3_cache_str[] = "sectored-l3-cache";
6012 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6013 
6014 static const struct cachetab {
6015         uint8_t         ct_code;
6016         uint8_t         ct_assoc;
6017         uint16_t        ct_line_size;
6018         size_t          ct_size;
6019         const char      *ct_label;
6020 } intel_ctab[] = {
6021         /*
6022          * maintain descending order!
6023          *
6024          * Codes ignored - Reason
6025          * ----------------------
6026          * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6027          * f0H/f1H - Currently we do not interpret prefetch size by design
6028          */
6029         { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6030         { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6031         { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6032         { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6033         { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6034         { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6035         { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6036         { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6037         { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6038         { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6039         { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6040         { 0xd0, 4, 64, 512*1024, l3_cache_str},
6041         { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6042         { 0xc0, 4, 0, 8, dtlb44_str },
6043         { 0xba, 4, 0, 64, dtlb4k_str },
6044         { 0xb4, 4, 0, 256, dtlb4k_str },
6045         { 0xb3, 4, 0, 128, dtlb4k_str },
6046         { 0xb2, 4, 0, 64, itlb4k_str },
6047         { 0xb0, 4, 0, 128, itlb4k_str },
6048         { 0x87, 8, 64, 1024*1024, l2_cache_str},
6049         { 0x86, 4, 64, 512*1024, l2_cache_str},
6050         { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6051         { 0x84, 8, 32, 1024*1024, l2_cache_str},
6052         { 0x83, 8, 32, 512*1024, l2_cache_str},
6053         { 0x82, 8, 32, 256*1024, l2_cache_str},
6054         { 0x80, 8, 64, 512*1024, l2_cache_str},
6055         { 0x7f, 2, 64, 512*1024, l2_cache_str},
6056         { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6057         { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6058         { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6059         { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6060         { 0x79, 8, 64, 128*1024, sl2_cache_str},
6061         { 0x78, 8, 64, 1024*1024, l2_cache_str},
6062         { 0x73, 8, 0, 64*1024, itrace_str},
6063         { 0x72, 8, 0, 32*1024, itrace_str},
6064         { 0x71, 8, 0, 16*1024, itrace_str},
6065         { 0x70, 8, 0, 12*1024, itrace_str},
6066         { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6067         { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6068         { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6069         { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6070         { 0x5d, 0, 0, 256, dtlb44_str},
6071         { 0x5c, 0, 0, 128, dtlb44_str},
6072         { 0x5b, 0, 0, 64, dtlb44_str},
6073         { 0x5a, 4, 0, 32, dtlb24_str},
6074         { 0x59, 0, 0, 16, dtlb4k_str},
6075         { 0x57, 4, 0, 16, dtlb4k_str},
6076         { 0x56, 4, 0, 16, dtlb4M_str},
6077         { 0x55, 0, 0, 7, itlb24_str},
6078         { 0x52, 0, 0, 256, itlb424_str},
6079         { 0x51, 0, 0, 128, itlb424_str},
6080         { 0x50, 0, 0, 64, itlb424_str},
6081         { 0x4f, 0, 0, 32, itlb4k_str},
6082         { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6083         { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6084         { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6085         { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6086         { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6087         { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6088         { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6089         { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6090         { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6091         { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6092         { 0x44, 4, 32, 1024*1024, l2_cache_str},
6093         { 0x43, 4, 32, 512*1024, l2_cache_str},
6094         { 0x42, 4, 32, 256*1024, l2_cache_str},
6095         { 0x41, 4, 32, 128*1024, l2_cache_str},
6096         { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6097         { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6098         { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6099         { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6100         { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6101         { 0x39, 4, 64, 128*1024, sl2_cache_str},
6102         { 0x30, 8, 64, 32*1024, l1_icache_str},
6103         { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6104         { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6105         { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6106         { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6107         { 0x22, 4, 64, 512*1024, sl3_cache_str},
6108         { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6109         { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6110         { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6111         { 0x0b, 4, 0, 4, itlb4M_str},
6112         { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6113         { 0x08, 4, 32, 16*1024, l1_icache_str},
6114         { 0x06, 4, 32, 8*1024, l1_icache_str},
6115         { 0x05, 4, 0, 32, dtlb4M_str},
6116         { 0x04, 4, 0, 8, dtlb4M_str},
6117         { 0x03, 4, 0, 64, dtlb4k_str},
6118         { 0x02, 4, 0, 2, itlb4M_str},
6119         { 0x01, 4, 0, 32, itlb4k_str},
6120         { 0 }
6121 };
6122 
6123 static const struct cachetab cyrix_ctab[] = {
6124         { 0x70, 4, 0, 32, "tlb-4K" },
6125         { 0x80, 4, 16, 16*1024, "l1-cache" },
6126         { 0 }
6127 };
6128 
6129 /*
6130  * Search a cache table for a matching entry
6131  */
6132 static const struct cachetab *
6133 find_cacheent(const struct cachetab *ct, uint_t code)
6134 {
6135         if (code != 0) {
6136                 for (; ct->ct_code != 0; ct++)
6137                         if (ct->ct_code <= code)
6138                                 break;
6139                 if (ct->ct_code == code)
6140                         return (ct);
6141         }
6142         return (NULL);
6143 }
6144 
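/*
 * Purely illustrative sketch (not part of the kernel logic): because
 * intel_ctab above is sorted in descending ct_code order, a lookup for
 * descriptor 0x2c stops at the first entry whose code is <= 0x2c, which
 * is exactly the 0x2c entry (8-way, 64-byte lines, 32K l1-dcache):
 *
 *	const struct cachetab *ct = find_cacheent(intel_ctab, 0x2c);
 *	ASSERT(ct != NULL && ct->ct_assoc == 8 && ct->ct_size == 32*1024);
 *
 * A code that falls between two table entries stops on a smaller code,
 * fails the equality check, and yields NULL.
 */
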
6145 /*
6146  * Populate the cachetab entry with L2 or L3 cache information using
6147  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6148  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6149  * information is found.
6150  */
6151 static int
6152 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6153 {
6154         uint32_t level, i;
6155         int ret = 0;
6156 
6157         for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6158                 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6159 
6160                 if (level == 2 || level == 3) {
6161                         ct->ct_assoc =
6162                             CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6163                         ct->ct_line_size =
6164                             CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6165                         ct->ct_size = ct->ct_assoc *
6166                             (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6167                             ct->ct_line_size *
6168                             (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6169 
6170                         if (level == 2) {
6171                                 ct->ct_label = l2_cache_str;
6172                         } else if (level == 3) {
6173                                 ct->ct_label = l3_cache_str;
6174                         }
6175                         ret = 1;
6176                 }
6177         }
6178 
6179         return (ret);
6180 }
6181 
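/*
 * A worked example of the size computation above, using illustrative
 * leaf 4 values rather than those of any particular processor: for a
 * leaf reporting ways = 7, partitions = 0 and line size = 63 (all
 * "minus one" encoded) with %ecx (sets - 1) = 511, we compute
 *
 *	ct_size = (7 + 1) * (0 + 1) * (63 + 1) * (511 + 1) = 256K
 *
 * i.e. ways * partitions * line size * sets, each field decoded by
 * adding one.
 */
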
6182 /*
6183  * Walk the cacheinfo descriptors, applying 'func' to every valid element.
6184  * The walk is terminated if the walker returns non-zero.
6185  */
6186 static void
6187 intel_walk_cacheinfo(struct cpuid_info *cpi,
6188     void *arg, int (*func)(void *, const struct cachetab *))
6189 {
6190         const struct cachetab *ct;
6191         struct cachetab des_49_ct, des_b1_ct;
6192         uint8_t *dp;
6193         int i;
6194 
6195         if ((dp = cpi->cpi_cacheinfo) == NULL)
6196                 return;
6197         for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6198                 /*
6199                  * For overloaded descriptor 0x49 we use cpuid function 4,
6200                  * if supported by the current processor, to create
6201                  * cache information.
6202                  * For overloaded descriptor 0xb1 we use X86_PAE flag
6203                  * to disambiguate the cache information.
6204                  */
6205                 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6206                     intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6207                         ct = &des_49_ct;
6208                 } else if (*dp == 0xb1) {
6209                         des_b1_ct.ct_code = 0xb1;
6210                         des_b1_ct.ct_assoc = 4;
6211                         des_b1_ct.ct_line_size = 0;
6212                         if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6213                                 des_b1_ct.ct_size = 8;
6214                                 des_b1_ct.ct_label = itlb2M_str;
6215                         } else {
6216                                 des_b1_ct.ct_size = 4;
6217                                 des_b1_ct.ct_label = itlb4M_str;
6218                         }
6219                         ct = &des_b1_ct;
6220                 } else {
6221                         if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6222                                 continue;
6223                         }
6224                 }
6225 
6226                 if (func(arg, ct) != 0) {
6227                         break;
6228                 }
6229         }
6230 }
6231 
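/*
 * The walker protocol, shown with a minimal hypothetical callback that
 * merely counts recognized descriptors; returning non-zero from the
 * callback terminates the walk early:
 *
 *	static int
 *	count_cacheents(void *arg, const struct cachetab *ct)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 * add_cacheent_props() and intel_l2cinfo() below are the real walkers
 * used by this file.
 */
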
6232 /*
6233  * (Like the Intel one, except for Cyrix CPUs)
6234  */
6235 static void
6236 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6237     void *arg, int (*func)(void *, const struct cachetab *))
6238 {
6239         const struct cachetab *ct;
6240         uint8_t *dp;
6241         int i;
6242 
6243         if ((dp = cpi->cpi_cacheinfo) == NULL)
6244                 return;
6245         for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6246                 /*
6247                  * Search Cyrix-specific descriptor table first ..
6248                  */
6249                 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6250                         if (func(arg, ct) != 0)
6251                                 break;
6252                         continue;
6253                 }
6254                 /*
6255                  * .. else fall back to the Intel one
6256                  */
6257                 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6258                         if (func(arg, ct) != 0)
6259                                 break;
6260                         continue;
6261                 }
6262         }
6263 }
6264 
6265 /*
6266  * A cacheinfo walker that adds associativity, line-size, and size properties
6267  * to the devinfo node it is passed as an argument.
6268  */
6269 static int
6270 add_cacheent_props(void *arg, const struct cachetab *ct)
6271 {
6272         dev_info_t *devi = arg;
6273 
6274         add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6275         if (ct->ct_line_size != 0)
6276                 add_cache_prop(devi, ct->ct_label, line_str,
6277                     ct->ct_line_size);
6278         add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6279         return (0);
6280 }
6281 
6282 
6283 static const char fully_assoc[] = "fully-associative?";
6284 
6285 /*
6286  * AMD style cache/tlb description
6287  *
6288  * Extended functions 5 and 6 directly describe properties of
6289  * tlbs and various cache levels.
6290  */
6291 static void
6292 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6293 {
6294         switch (assoc) {
6295         case 0: /* reserved; ignore */
6296                 break;
6297         case 0xff:
6298                 add_cache_prop(devi, label, fully_assoc, 1);
6299                 break;
6300         default:
6301                 add_cache_prop(devi, label, assoc_str, assoc);
6302                 break;
6303         }
6304 }
6305 
6306 static void
6307 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6308 {
6309         if (size == 0)
6310                 return;
6311         add_cache_prop(devi, label, size_str, size);
6312         add_amd_assoc(devi, label, assoc);
6313 }
6314 
6315 static void
6316 add_amd_cache(dev_info_t *devi, const char *label,
6317     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6318 {
6319         if (size == 0 || line_size == 0)
6320                 return;
6321         add_amd_assoc(devi, label, assoc);
6322         /*
6323          * Most AMD parts have a sectored cache. Multiple cache lines are
6324          * associated with each tag. A sector consists of all cache lines
6325          * associated with a tag. For example, the AMD K6-III has a sector
6326          * size of 2 cache lines per tag.
6327          */
6328         if (lines_per_tag != 0)
6329                 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6330         add_cache_prop(devi, label, line_str, line_size);
6331         add_cache_prop(devi, label, size_str, size * 1024);
6332 }
6333 
6334 static void
6335 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6336 {
6337         switch (assoc) {
6338         case 0: /* off */
6339                 break;
6340         case 1:
6341         case 2:
6342         case 4:
6343                 add_cache_prop(devi, label, assoc_str, assoc);
6344                 break;
6345         case 6:
6346                 add_cache_prop(devi, label, assoc_str, 8);
6347                 break;
6348         case 8:
6349                 add_cache_prop(devi, label, assoc_str, 16);
6350                 break;
6351         case 0xf:
6352                 add_cache_prop(devi, label, fully_assoc, 1);
6353                 break;
6354         default: /* reserved; ignore */
6355                 break;
6356         }
6357 }
6358 
6359 static void
6360 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6361 {
6362         if (size == 0 || assoc == 0)
6363                 return;
6364         add_amd_l2_assoc(devi, label, assoc);
6365         add_cache_prop(devi, label, size_str, size);
6366 }
6367 
6368 static void
6369 add_amd_l2_cache(dev_info_t *devi, const char *label,
6370     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6371 {
6372         if (size == 0 || assoc == 0 || line_size == 0)
6373                 return;
6374         add_amd_l2_assoc(devi, label, assoc);
6375         if (lines_per_tag != 0)
6376                 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6377         add_cache_prop(devi, label, line_str, line_size);
6378         add_cache_prop(devi, label, size_str, size * 1024);
6379 }
6380 
6381 static void
6382 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6383 {
6384         struct cpuid_regs *cp;
6385 
6386         if (cpi->cpi_xmaxeax < 0x80000005)
6387                 return;
6388         cp = &cpi->cpi_extd[5];
6389 
6390         /*
6391          * 4M/2M L1 TLB configuration
6392          *
6393          * We report the size for 2M pages because AMD uses two
6394          * TLB entries for one 4M page.
6395          */
6396         add_amd_tlb(devi, "dtlb-2M",
6397             BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6398         add_amd_tlb(devi, "itlb-2M",
6399             BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6400 
6401         /*
6402          * 4K L1 TLB configuration
6403          */
6404 
6405         switch (cpi->cpi_vendor) {
6406                 uint_t nentries;
6407         case X86_VENDOR_TM:
6408                 if (cpi->cpi_family >= 5) {
6409                         /*
6410                          * Crusoe processors have 256 TLB entries, but
6411                          * the cpuid data format constrains them to only
6412                          * reporting 255 of them.
6413                          */
6414                         if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6415                                 nentries = 256;
6416                         /*
6417                          * Crusoe processors also have a unified TLB
6418                          */
6419                         add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6420                             nentries);
6421                         break;
6422                 }
6423                 /*FALLTHROUGH*/
6424         default:
6425                 add_amd_tlb(devi, itlb4k_str,
6426                     BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6427                 add_amd_tlb(devi, dtlb4k_str,
6428                     BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6429                 break;
6430         }
6431 
6432         /*
6433          * data L1 cache configuration
6434          */
6435 
6436         add_amd_cache(devi, l1_dcache_str,
6437             BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6438             BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6439 
6440         /*
6441          * code L1 cache configuration
6442          */
6443 
6444         add_amd_cache(devi, l1_icache_str,
6445             BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6446             BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6447 
6448         if (cpi->cpi_xmaxeax < 0x80000006)
6449                 return;
6450         cp = &cpi->cpi_extd[6];
6451 
6452         /* Check for a unified L2 TLB for large pages */
6453 
6454         if (BITX(cp->cp_eax, 31, 16) == 0)
6455                 add_amd_l2_tlb(devi, "l2-tlb-2M",
6456                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6457         else {
6458                 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6459                     BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6460                 add_amd_l2_tlb(devi, "l2-itlb-2M",
6461                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6462         }
6463 
6464         /* Check for a unified L2 TLB for 4K pages */
6465 
6466         if (BITX(cp->cp_ebx, 31, 16) == 0) {
6467                 add_amd_l2_tlb(devi, "l2-tlb-4K",
6468                     BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6469         } else {
6470                 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6471                     BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
6472                 add_amd_l2_tlb(devi, "l2-itlb-4K",
6473                     BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6474         }
6475 
6476         add_amd_l2_cache(devi, l2_cache_str,
6477             BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6478             BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6479 }
6480 
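/*
 * Example decode of the extended function 5 fields used above, with the
 * illustrative value %ecx = 0x40020140 (not taken from a real part):
 *
 *	BITX(ecx, 31, 24) = 0x40 -> 64K L1 data cache
 *	BITX(ecx, 23, 16) = 0x02 -> 2-way set associative
 *	BITX(ecx, 15,  8) = 0x01 -> 1 line per tag
 *	BITX(ecx,  7,  0) = 0x40 -> 64-byte lines
 *
 * which add_amd_cache() would publish as associativity, line-size and
 * size (64 * 1024) properties.
 */
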
6481 /*
6482  * There are two basic ways that the x86 world describes its cache
6483  * and tlb architecture - Intel's way and AMD's way.
6484  *
6485  * Return which flavor of cache architecture we should use
6486  */
6487 static int
6488 x86_which_cacheinfo(struct cpuid_info *cpi)
6489 {
6490         switch (cpi->cpi_vendor) {
6491         case X86_VENDOR_Intel:
6492                 if (cpi->cpi_maxeax >= 2)
6493                         return (X86_VENDOR_Intel);
6494                 break;
6495         case X86_VENDOR_AMD:
6496                 /*
6497                  * The K5 model 1 was the first part from AMD that reported
6498                  * cache sizes via extended cpuid functions.
6499                  */
6500                 if (cpi->cpi_family > 5 ||
6501                     (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6502                         return (X86_VENDOR_AMD);
6503                 break;
6504         case X86_VENDOR_TM:
6505                 if (cpi->cpi_family >= 5)
6506                         return (X86_VENDOR_AMD);
6507                 /*FALLTHROUGH*/
6508         default:
6509                 /*
6510                  * If they have extended CPU data for 0x80000005
6511                  * then we assume they have AMD-format cache
6512                  * information.
6513                  *
6514                  * If not, and the vendor happens to be Cyrix,
6515                  * then try our Cyrix-specific handler.
6516                  *
6517                  * If we're not Cyrix, then assume we're using Intel's
6518                  * table-driven format instead.
6519                  */
6520                 if (cpi->cpi_xmaxeax >= 0x80000005)
6521                         return (X86_VENDOR_AMD);
6522                 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6523                         return (X86_VENDOR_Cyrix);
6524                 else if (cpi->cpi_maxeax >= 2)
6525                         return (X86_VENDOR_Intel);
6526                 break;
6527         }
6528         return (-1);
6529 }
6530 
6531 void
6532 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6533     struct cpuid_info *cpi)
6534 {
6535         dev_info_t *cpu_devi;
6536         int create;
6537 
6538         cpu_devi = (dev_info_t *)dip;
6539 
6540         /* device_type */
6541         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6542             "device_type", "cpu");
6543 
6544         /* reg */
6545         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6546             "reg", cpu_id);
6547 
6548         /* cpu-mhz, and clock-frequency */
6549         if (cpu_freq > 0) {
6550                 long long mul;
6551 
6552                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6553                     "cpu-mhz", cpu_freq);
6554                 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6555                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6556                             "clock-frequency", (int)mul);
6557         }
6558 
6559         if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6560                 return;
6561         }
6562 
6563         /* vendor-id */
6564         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6565             "vendor-id", cpi->cpi_vendorstr);
6566 
6567         if (cpi->cpi_maxeax == 0) {
6568                 return;
6569         }
6570 
6571         /*
6572          * family, model, and step
6573          */
6574         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6575             "family", CPI_FAMILY(cpi));
6576         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6577             "cpu-model", CPI_MODEL(cpi));
6578         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6579             "stepping-id", CPI_STEP(cpi));
6580 
6581         /* type */
6582         switch (cpi->cpi_vendor) {
6583         case X86_VENDOR_Intel:
6584                 create = 1;
6585                 break;
6586         default:
6587                 create = 0;
6588                 break;
6589         }
6590         if (create)
6591                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6592                     "type", CPI_TYPE(cpi));
6593 
6594         /* ext-family */
6595         switch (cpi->cpi_vendor) {
6596         case X86_VENDOR_Intel:
6597         case X86_VENDOR_AMD:
6598                 create = cpi->cpi_family >= 0xf;
6599                 break;
6600         default:
6601                 create = 0;
6602                 break;
6603         }
6604         if (create)
6605                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6606                     "ext-family", CPI_FAMILY_XTD(cpi));
6607 
6608         /* ext-model */
6609         switch (cpi->cpi_vendor) {
6610         case X86_VENDOR_Intel:
6611                 create = IS_EXTENDED_MODEL_INTEL(cpi);
6612                 break;
6613         case X86_VENDOR_AMD:
6614                 create = CPI_FAMILY(cpi) == 0xf;
6615                 break;
6616         default:
6617                 create = 0;
6618                 break;
6619         }
6620         if (create)
6621                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6622                     "ext-model", CPI_MODEL_XTD(cpi));
6623 
6624         /* generation */
6625         switch (cpi->cpi_vendor) {
6626         case X86_VENDOR_AMD:
6627                 /*
6628                  * AMD K5 model 1 was the first part to support this
6629                  */
6630                 create = cpi->cpi_xmaxeax >= 0x80000001;
6631                 break;
6632         default:
6633                 create = 0;
6634                 break;
6635         }
6636         if (create)
6637                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6638                     "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6639 
6640         /* brand-id */
6641         switch (cpi->cpi_vendor) {
6642         case X86_VENDOR_Intel:
6643                 /*
6644                  * brand id first appeared on Pentium III Xeon model 8 and
6645                  * Celeron model 8 processors; AMD followed with the Opteron
6646                  */
6647                 create = cpi->cpi_family > 6 ||
6648                     (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6649                 break;
6650         case X86_VENDOR_AMD:
6651                 create = cpi->cpi_family >= 0xf;
6652                 break;
6653         default:
6654                 create = 0;
6655                 break;
6656         }
6657         if (create && cpi->cpi_brandid != 0) {
6658                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6659                     "brand-id", cpi->cpi_brandid);
6660         }
6661 
6662         /* chunks, and apic-id */
6663         switch (cpi->cpi_vendor) {
6664                 /*
6665                  * first available on Pentium IV and Opteron (K8)
6666                  */
6667         case X86_VENDOR_Intel:
6668                 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6669                 break;
6670         case X86_VENDOR_AMD:
6671                 create = cpi->cpi_family >= 0xf;
6672                 break;
6673         default:
6674                 create = 0;
6675                 break;
6676         }
6677         if (create) {
6678                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6679                     "chunks", CPI_CHUNKS(cpi));
6680                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6681                     "apic-id", cpi->cpi_apicid);
6682                 if (cpi->cpi_chipid >= 0) {
6683                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6684                             "chip#", cpi->cpi_chipid);
6685                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6686                             "clog#", cpi->cpi_clogid);
6687                 }
6688         }
6689 
6690         /* cpuid-features */
6691         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6692             "cpuid-features", CPI_FEATURES_EDX(cpi));
6693 
6694 
6695         /* cpuid-features-ecx */
6696         switch (cpi->cpi_vendor) {
6697         case X86_VENDOR_Intel:
6698                 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6699                 break;
6700         case X86_VENDOR_AMD:
6701                 create = cpi->cpi_family >= 0xf;
6702                 break;
6703         default:
6704                 create = 0;
6705                 break;
6706         }
6707         if (create)
6708                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6709                     "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6710 
6711         /* ext-cpuid-features */
6712         switch (cpi->cpi_vendor) {
6713         case X86_VENDOR_Intel:
6714         case X86_VENDOR_AMD:
6715         case X86_VENDOR_Cyrix:
6716         case X86_VENDOR_TM:
6717         case X86_VENDOR_Centaur:
6718                 create = cpi->cpi_xmaxeax >= 0x80000001;
6719                 break;
6720         default:
6721                 create = 0;
6722                 break;
6723         }
6724         if (create) {
6725                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6726                     "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6727                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6728                     "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6729         }
6730 
6731         /*
6732          * Brand String first appeared in Intel Pentium IV, AMD K5
6733  * model 1, and Cyrix GXm.  On earlier models we try to
6734  * simulate something similar .. so this string should always
6735  * say -something- about the processor, however lame.
6736          */
6737         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6738             "brand-string", cpi->cpi_brandstr);
6739 
6740         /*
6741          * Finally, cache and tlb information
6742          */
6743         switch (x86_which_cacheinfo(cpi)) {
6744         case X86_VENDOR_Intel:
6745                 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6746                 break;
6747         case X86_VENDOR_Cyrix:
6748                 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6749                 break;
6750         case X86_VENDOR_AMD:
6751                 amd_cache_info(cpi, cpu_devi);
6752                 break;
6753         default:
6754                 break;
6755         }
6756 }
6757 
6758 struct l2info {
6759         int *l2i_csz;
6760         int *l2i_lsz;
6761         int *l2i_assoc;
6762         int l2i_ret;
6763 };
6764 
6765 /*
6766  * A cacheinfo walker that fetches the size, line-size and associativity
6767  * of the L2 cache
6768  */
6769 static int
6770 intel_l2cinfo(void *arg, const struct cachetab *ct)
6771 {
6772         struct l2info *l2i = arg;
6773         int *ip;
6774 
6775         if (ct->ct_label != l2_cache_str &&
6776             ct->ct_label != sl2_cache_str)
6777                 return (0);     /* not an L2 -- keep walking */
6778 
6779         if ((ip = l2i->l2i_csz) != NULL)
6780                 *ip = ct->ct_size;
6781         if ((ip = l2i->l2i_lsz) != NULL)
6782                 *ip = ct->ct_line_size;
6783         if ((ip = l2i->l2i_assoc) != NULL)
6784                 *ip = ct->ct_assoc;
6785         l2i->l2i_ret = ct->ct_size;
6786         return (1);             /* was an L2 -- terminate walk */
6787 }
6788 
6789 /*
6790  * AMD L2/L3 Cache and TLB Associativity Field Definition:
6791  *
6792  *      Unlike the associativity for the L1 cache and tlb where the 8 bit
6793  *      value is the associativity, the associativity for the L2 cache and
6794  *      tlb is encoded in the following table. The 4 bit L2 value serves as
6795  *      an index into the amd_afd[] array to determine the associativity.
6796  *      -1 is undefined. 0 is fully associative.
6797  */
6798 
6799 static int amd_afd[] =
6800         {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
6801 
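/*
 * For example, an associativity field of 0x6 indexes amd_afd[6] == 8
 * (8-way set associative), while 0xf indexes amd_afd[15] == 0, the
 * fully associative encoding noted above.
 */
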
6802 static void
6803 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6804 {
6805         struct cpuid_regs *cp;
6806         uint_t size, assoc;
6807         int i;
6808         int *ip;
6809 
6810         if (cpi->cpi_xmaxeax < 0x80000006)
6811                 return;
6812         cp = &cpi->cpi_extd[6];
6813 
6814         if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6815             (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6816                 uint_t cachesz = size * 1024;
6817                 assoc = amd_afd[i];
6818 
6819                 ASSERT(assoc != -1);
6820 
6821                 if ((ip = l2i->l2i_csz) != NULL)
6822                         *ip = cachesz;
6823                 if ((ip = l2i->l2i_lsz) != NULL)
6824                         *ip = BITX(cp->cp_ecx, 7, 0);
6825                 if ((ip = l2i->l2i_assoc) != NULL)
6826                         *ip = assoc;
6827                 l2i->l2i_ret = cachesz;
6828         }
6829 }
6830 
6831 int
6832 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6833 {
6834         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6835         struct l2info __l2info, *l2i = &__l2info;
6836 
6837         l2i->l2i_csz = csz;
6838         l2i->l2i_lsz = lsz;
6839         l2i->l2i_assoc = assoc;
6840         l2i->l2i_ret = -1;
6841 
6842         switch (x86_which_cacheinfo(cpi)) {
6843         case X86_VENDOR_Intel:
6844                 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6845                 break;
6846         case X86_VENDOR_Cyrix:
6847                 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6848                 break;
6849         case X86_VENDOR_AMD:
6850                 amd_l2cacheinfo(cpi, l2i);
6851                 break;
6852         default:
6853                 break;
6854         }
6855         return (l2i->l2i_ret);
6856 }
6857 
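/*
 * A hypothetical caller's view of getl2cacheinfo(): any out-pointer may
 * be passed as NULL if that datum is not wanted, and a return of -1
 * means no recognizable L2 description was found.
 *
 *	int csz, lsz, assoc;
 *	if (getl2cacheinfo(CPU, &csz, &lsz, &assoc) > 0)
 *		... use the size (bytes), line size (bytes) and ways ...
 */
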
6858 #if !defined(__xpv)
6859 
6860 uint32_t *
6861 cpuid_mwait_alloc(cpu_t *cpu)
6862 {
6863         uint32_t        *ret;
6864         size_t          mwait_size;
6865 
6866         ASSERT(cpuid_checkpass(CPU, 2));
6867 
6868         mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6869         if (mwait_size == 0)
6870                 return (NULL);
6871 
6872         /*
6873          * kmem_alloc() returns cache line size aligned data for mwait_size
6874          * allocations.  mwait_size is currently cache line sized.  Neither
6875          * of these implementation details are guaranteed to be true in the
6876          * future.
6877          *
6878          * First try allocating mwait_size, as kmem_alloc() currently returns
6879          * correctly aligned memory.  If kmem_alloc() does not return
6880          * mwait_size-aligned memory, allocate twice mwait_size and P2ROUNDUP.
6881          *
6882          * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6883          * decide to free this memory.
6884          */
6885         ret = kmem_zalloc(mwait_size, KM_SLEEP);
6886         if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6887                 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6888                 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6889                 *ret = MWAIT_RUNNING;
6890                 return (ret);
6891         } else {
6892                 kmem_free(ret, mwait_size);
6893                 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6894                 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6895                 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6896                 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6897                 *ret = MWAIT_RUNNING;
6898                 return (ret);
6899         }
6900 }
6901 
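/*
 * Alignment arithmetic for the fallback path above, with illustrative
 * numbers: if mwait_size were 64 and kmem_zalloc() returned a pointer
 * ending in ...e8, the P2ROUNDUP() identity check fails; we then free
 * it, allocate 128 bytes, and round the new pointer up to the next
 * 64-byte boundary.  Doubling the allocation guarantees that a full,
 * properly aligned mwait_size region fits inside the buffer.
 */
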
6902 void
6903 cpuid_mwait_free(cpu_t *cpu)
6904 {
6905         if (cpu->cpu_m.mcpu_cpi == NULL) {
6906                 return;
6907         }
6908 
6909         if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6910             cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6911                 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6912                     cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6913         }
6914 
6915         cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6916         cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6917 }
6918 
6919 void
6920 patch_tsc_read(int flag)
6921 {
6922         size_t cnt;
6923 
6924         switch (flag) {
6925         case TSC_NONE:
6926                 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6927                 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6928                 break;
6929         case TSC_RDTSC_MFENCE:
6930                 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6931                 (void) memcpy((void *)tsc_read,
6932                     (void *)&_tsc_mfence_start, cnt);
6933                 break;
6934         case TSC_RDTSC_LFENCE:
6935                 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6936                 (void) memcpy((void *)tsc_read,
6937                     (void *)&_tsc_lfence_start, cnt);
6938                 break;
6939         case TSC_TSCP:
6940                 cnt = &_tscp_end - &_tscp_start;
6941                 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6942                 break;
6943         default:
6944                 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6945                 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
6946                 break;
6947         }
6948         tsc_type = flag;
6949 }
6950 
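/*
 * For example, a (hypothetical) platform that had verified RDTSCP
 * support would arrange for every later tsc_read() call to execute the
 * rdtscp-based routine with:
 *
 *	patch_tsc_read(TSC_TSCP);
 *
 * Since the text of tsc_read() is patched in place, this is done during
 * early, single-threaded boot, before tsc_read() is in general use.
 */
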
6951 int
6952 cpuid_deep_cstates_supported(void)
6953 {
6954         struct cpuid_info *cpi;
6955         struct cpuid_regs regs;
6956 
6957         ASSERT(cpuid_checkpass(CPU, 1));
6958 
6959         cpi = CPU->cpu_m.mcpu_cpi;
6960 
6961         if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6962                 return (0);
6963 
6964         switch (cpi->cpi_vendor) {
6965         case X86_VENDOR_Intel:
6966                 if (cpi->cpi_xmaxeax < 0x80000007)
6967                         return (0);
6968 
6969                 /*
6970                  * Does the TSC run at a constant rate in all ACPI C-states?
6971                  */
6972                 regs.cp_eax = 0x80000007;
6973                 (void) __cpuid_insn(&regs);
6974                 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6975 
6976         default:
6977                 return (0);
6978         }
6979 }
6980 
6981 #endif  /* !__xpv */
6982 
6983 void
6984 post_startup_cpu_fixups(void)
6985 {
6986 #ifndef __xpv
6987         /*
6988          * Some AMD processors support C1E state. Entering this state will
6989          * cause the local APIC timer to stop, which we can't deal with at
6990          * this time.
6991          */
6992         if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6993                 on_trap_data_t otd;
6994                 uint64_t reg;
6995 
6996                 if (!on_trap(&otd, OT_DATA_ACCESS)) {
6997                         reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6998                         /* Disable C1E state if it is enabled by BIOS */
6999                         if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7000                             AMD_ACTONCMPHALT_MASK) {
7001                                 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7002                                     AMD_ACTONCMPHALT_SHIFT);
7003                                 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7004                         }
7005                 }
7006                 no_trap();
7007         }
7008 #endif  /* !__xpv */
7009 }
7010 
7011 void
7012 enable_pcid(void)
7013 {
7014         if (x86_use_pcid == -1)
7015                 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7016 
7017         if (x86_use_invpcid == -1) {
7018                 x86_use_invpcid = is_x86_feature(x86_featureset,
7019                     X86FSET_INVPCID);
7020         }
7021 
7022         if (!x86_use_pcid)
7023                 return;
7024 
7025         /*
7026          * Intel says that on setting PCIDE, the CPU immediately starts using
7027          * the PCID bits; better make sure there's nothing there.
7028          */
7029         ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7030 
7031         setcr4(getcr4() | CR4_PCIDE);
7032 }
7033 
7034 /*
7035  * Setup necessary registers to enable XSAVE feature on this processor.
7036  * This function needs to be called early enough, so that no xsave/xrstor
7037  * ops will execute on the processor before the MSRs are properly set up.
7038  *
7039  * The current implementation makes the following assumptions:
7040  * - cpuid_pass1() is done, so that X86 features are known.
7041  * - fpu_probe() is done, so that fp_save_mech is chosen.
7042  */
7043 void
7044 xsave_setup_msr(cpu_t *cpu)
7045 {
7046         ASSERT(fp_save_mech == FP_XSAVE);
7047         ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7048 
7049         /* Enable OSXSAVE in CR4. */
7050         setcr4(getcr4() | CR4_OSXSAVE);
7051         /*
7052          * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7053          * correct value.
7054          */
7055         cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7056         setup_xfem();
7057 }
7058 
7059 /*
7060  * Starting with the Westmere processor, the local
7061  * APIC timer will continue running in all C-states,
7062  * including the deepest C-states.
7063  */
7064 int
7065 cpuid_arat_supported(void)
7066 {
7067         struct cpuid_info *cpi;
7068         struct cpuid_regs regs;
7069 
7070         ASSERT(cpuid_checkpass(CPU, 1));
7071         ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7072 
7073         cpi = CPU->cpu_m.mcpu_cpi;
7074 
7075         switch (cpi->cpi_vendor) {
7076         case X86_VENDOR_Intel:
7077                 /*
7078                  * Always-running Local APIC Timer is
7079                  * indicated by CPUID.6.EAX[2].
7080                  */
7081                 if (cpi->cpi_maxeax >= 6) {
7082                         regs.cp_eax = 6;
7083                         (void) cpuid_insn(NULL, &regs);
7084                         return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7085                 } else {
7086                         return (0);
7087                 }
7088         default:
7089                 return (0);
7090         }
7091 }
7092 
7093 /*
7094  * Check support for Intel ENERGY_PERF_BIAS feature
7095  */
7096 int
7097 cpuid_iepb_supported(struct cpu *cp)
7098 {
7099         struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7100         struct cpuid_regs regs;
7101 
7102         ASSERT(cpuid_checkpass(cp, 1));
7103 
7104         if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7105             !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7106                 return (0);
7107         }
7108 
7109         /*
7110          * Intel ENERGY_PERF_BIAS MSR is indicated by
7111          * capability bit CPUID.6.ECX.3
7112          */
7113         if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7114                 return (0);
7115 
7116         regs.cp_eax = 0x6;
7117         (void) cpuid_insn(NULL, &regs);
7118         return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7119 }
7120 
7121 /*
7122  * Check support for TSC deadline timer
7123  *
7124  * The TSC deadline timer provides a superior software programming
7125  * model over the local APIC timer that eliminates "time drifts".
7126  * Instead of specifying a relative time, software specifies an
7127  * absolute time as the target at which the processor should
7128  * generate a timer event.
7129  */
7130 int
7131 cpuid_deadline_tsc_supported(void)
7132 {
7133         struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7134         struct cpuid_regs regs;
7135 
7136         ASSERT(cpuid_checkpass(CPU, 1));
7137         ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7138 
7139         switch (cpi->cpi_vendor) {
7140         case X86_VENDOR_Intel:
7141                 if (cpi->cpi_maxeax >= 1) {
7142                         regs.cp_eax = 1;
7143                         (void) cpuid_insn(NULL, &regs);
7144                         return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7145                 } else {
7146                         return (0);
7147                 }
7148         default:
7149                 return (0);
7150         }
7151 }
7152 
7153 #if defined(__amd64) && !defined(__xpv)
7154 /*
7155  * Patch in versions of bcopy for high performance Intel Nehalem (Nhm)
7156  * processors and later...
7157  */
7158 void
7159 patch_memops(uint_t vendor)
7160 {
7161         size_t cnt, i;
7162         caddr_t to, from;
7163 
7164         if ((vendor == X86_VENDOR_Intel) &&
7165             is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7166                 cnt = &bcopy_patch_end - &bcopy_patch_start;
7167                 to = &bcopy_ck_size;
7168                 from = &bcopy_patch_start;
7169                 for (i = 0; i < cnt; i++) {
7170                         *to++ = *from++;
7171                 }
7172         }
7173 }
7174 #endif  /* __amd64 && !__xpv */
7175 
7176 /*
7177  * We're being asked to tell the system how many bits are required to represent
7178  * the various thread and strand IDs. While it's tempting to derive this based
7179  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7180  * correct. Instead, this needs to be based on the number of bits that the APIC
7181  * allows for these different configurations. We only update these to a larger
7182  * value if we find one.
7183  */
7184 void
7185 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7186 {
7187         struct cpuid_info *cpi;
7188 
7189         VERIFY(cpuid_checkpass(CPU, 1));
7190         cpi = cpu->cpu_m.mcpu_cpi;
7191 
7192         if (cpi->cpi_ncore_bits > *core_nbits) {
7193                 *core_nbits = cpi->cpi_ncore_bits;
7194         }
7195 
7196         if (cpi->cpi_nthread_bits > *strand_nbits) {
7197                 *strand_nbits = cpi->cpi_nthread_bits;
7198         }
7199 }
7200 
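/*
 * Example with a hypothetical topology: a package with 12 cores must
 * report cpi_ncore_bits == 4, because the APIC ID reserves a
 * power-of-two field (here 16 slots) for the core number; with two
 * threads per core, cpi_nthread_bits == 1.  Because we only ever widen
 * *core_nbits and *strand_nbits, mixed systems end up sized for their
 * widest CPU.
 */
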
7201 void
7202 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7203 {
7204         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7205         struct cpuid_regs cp;
7206 
7207         /*
7208          * Reread the CPUID portions that we need for various security
7209          * information.
7210          */
7211         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7212                 /*
7213                  * Check if we now have leaf 7 available to us.
7214                  */
7215                 if (cpi->cpi_maxeax < 7) {
7216                         bzero(&cp, sizeof (cp));
7217                         cp.cp_eax = 0;
7218                         cpi->cpi_maxeax = __cpuid_insn(&cp);
7219                         if (cpi->cpi_maxeax < 7)
7220                                 return;
7221                 }
7222 
7223                 bzero(&cp, sizeof (cp));
7224                 cp.cp_eax = 7;
7225                 cp.cp_ecx = 0;
7226                 (void) __cpuid_insn(&cp);
7227                 cpi->cpi_std[7] = cp;
7228         } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7229                 /* No xcpuid support */
7230                 if (cpi->cpi_family < 5 ||
7231                     (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7232                         return;
7233 
7234                 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7235                         bzero(&cp, sizeof (cp));
7236                         cp.cp_eax = CPUID_LEAF_EXT_0;
7237                         cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7238                         if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7239                                 return;
7240                         }
7241                 }
7242 
7243                 bzero(&cp, sizeof (cp));
7244                 cp.cp_eax = CPUID_LEAF_EXT_8;
7245                 (void) __cpuid_insn(&cp);
7246                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7247                 cpi->cpi_extd[8] = cp;
7248         } else {
7249                 /*
7250                  * Nothing to do here. Return an empty set which has already
7251                  * been zeroed for us.
7252                  */
7253                 return;
7254         }
7255         cpuid_scan_security(cpu, fset);
7256 }
7257 
7258 /* ARGSUSED */
7259 static int
7260 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7261 {
7262         uchar_t *fset;
7263         boolean_t first_pass = (boolean_t)arg1;
7264 
7265         fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7266         if (first_pass && CPU->cpu_id != 0)
7267                 return (0);
7268         if (!first_pass && CPU->cpu_id == 0)
7269                 return (0);
7270         cpuid_pass_ucode(CPU, fset);
7271 
7272         return (0);
7273 }
7274 
7275 /*
7276  * After a microcode update where the version has changed, we need to
7277  * rescan CPUID. To do this we check every CPU to make sure that they have the
7278  * same microcode. Then we perform a cross call to all such CPUs. It's the
7279  * caller's job to make sure that no one else can end up doing an update while
7280  * this is going on.
7281  *
7282  * We assume that the system is microcode capable if we're called.
7283  */
7284 void
7285 cpuid_post_ucodeadm(void)
7286 {
7287         uint32_t rev;
7288         int i;
7289         struct cpu *cpu;
7290         cpuset_t cpuset;
7291         void *argdata;
7292         uchar_t *f0;
7293 
7294         argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7295 
7296         mutex_enter(&cpu_lock);
7297         cpu = cpu_get(0);
7298         rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7299         CPUSET_ONLY(cpuset, 0);
7300         for (i = 1; i < max_ncpus; i++) {
7301                 if ((cpu = cpu_get(i)) == NULL)
7302                         continue;
7303 
7304                 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7305                         panic("post microcode update CPU %d has differing "
7306                             "microcode revision (%u) from CPU 0 (%u)",
7307                             i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7308                 }
7309                 CPUSET_ADD(cpuset, i);
7310         }
7311 
7312         /*
7313          * We do the cross calls in two passes. The first pass is only for the
7314          * boot CPU. The second pass is for all of the other CPUs. This allows
7315          * the boot CPU to go through and change behavior related to patching or
7316          * whether or not Enhanced IBRS needs to be enabled and then allow all
7317          * other CPUs to follow suit.
7318          */
7319         kpreempt_disable();
7320         xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7321             cpuid_post_ucodeadm_xc);
7322         xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7323             cpuid_post_ucodeadm_xc);
7324         kpreempt_enable();
7325 
7326         /*
7327          * OK, now look at each CPU and see if their feature sets are equal.
7328          */
7329         f0 = argdata;
7330         for (i = 1; i < max_ncpus; i++) {
7331                 uchar_t *fset;
7332                 if (!CPU_IN_SET(cpuset, i))
7333                         continue;
7334 
7335                 fset = (uchar_t *)((uintptr_t)argdata +
7336                     sizeof (x86_featureset) * i);
7337 
7338                 if (!compare_x86_featureset(f0, fset)) {
7339                         panic("Post microcode update CPU %d has "
7340                             "differing security feature set (%p) from CPU 0 "
7341                             "(%p), not appending to feature set", i,
7342                             (void *)fset, (void *)f0);
7343                 }
7344         }
7345 
7346         mutex_exit(&cpu_lock);
7347 
7348         for (i = 0; i < NUM_X86_FEATURES; i++) {
7349                 if (!is_x86_feature(f0, i))
7350                         continue;
7351                 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7352                     x86_feature_names[i]);
7353                 add_x86_feature(x86_featureset, i);
7354         }
7355         kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7356 }