2 New usr/src/uts/i86pc/os/cpuid.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
  26  * Copyright 2020 Joyent, Inc.
  27  */
  28 /*
  29  * Copyright (c) 2010, Intel Corporation.
  30  * All rights reserved.
  31  */
  32 /*
  33  * Portions Copyright 2009 Advanced Micro Devices, Inc.
  34  */
  35 
  36 /*
  37  * CPU Identification logic
  38  *
  39  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
  40  * with the identification of CPUs, their features, and their topologies. More
  41  * specifically, this file helps drive the following:
  42  *
  43  * 1. Enumeration of features of the processor which are used by the kernel to
  44  *    determine what features to enable or disable. These may be instruction set
  45  *    enhancements or features that we use.
  46  *
  47  * 2. Enumeration of instruction set architecture (ISA) additions that userland
  48  *    will be told about through the auxiliary vector.
  49  *
  50  * 3. Understanding the physical topology of the CPU such as the number of
  51  *    caches, how many cores it has, whether or not it supports simultaneous
  52  *    multi-threading (SMT), etc.
  53  *
  54  * ------------------------
  55  * CPUID History and Basics
  56  * ------------------------
  57  *
  58  * The cpuid instruction was added by Intel roughly around the time that the
  59  * original Pentium was introduced. The purpose of cpuid was to tell in a
  60  * programmatic fashion information about the CPU that previously was guessed
  61  * at. For example, an important part of cpuid is that we can know what
  62  * extensions to the ISA exist. If you use an invalid opcode you would get a
  63  * #UD, so this method allows a program (whether a user program or the kernel)
  64  * to determine what exists without crashing or getting a SIGILL. Of course,
  65  * this was also during the era of the clones and the AMD Am5x86. The vendor
  66  * name shows up first in cpuid for a reason.
  67  *
  68  * cpuid information is broken down into units called 'leaves'. Each leaf puts
  69  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
  70  * its own meaning. The different leaves are broken down into different regions:
  71  *
  72  *      [ 0, 7fffffff ]                 This region is called the 'basic'
  73  *                                      region. This region is generally defined
  74  *                                      by Intel, though some of the original
  75  *                                      portions have different meanings based
  76  *                                      on the manufacturer. These days, Intel
  77  *                                      adds most new features to this region.
  78  *                                      AMD adds non-Intel compatible
  79  *                                      information in the third, extended
  80  *                                      region. Intel uses this for everything
  81  *                                      including ISA extensions, CPU
  82  *                                      features, cache information, topology,
  83  *                                      and more.
  84  *
  85  *                                      There is a hole carved out of this
  86  *                                      region which is reserved for
  87  *                                      hypervisors.
  88  *
  89  *      [ 40000000, 4fffffff ]          This region, which is found in the
  90  *                                      middle of the previous region, is
  91  *                                      explicitly promised to never be used by
  92  *                                      CPUs. Instead, it is used by hypervisors
  93  *                                      to communicate information about
  94  *                                      themselves to the operating system. The
  95  *                                      values and details are unique for each
  96  *                                      hypervisor.
  97  *
  98  *      [ 80000000, ffffffff ]          This region is called the 'extended'
  99  *                                      region. Some of the low leaves mirror
 100  *                                      parts of the basic leaves. This region
 101  *                                      has generally been used by AMD for
 102  *                                      various extensions. For example, AMD-
 103  *                                      specific information about caches,
 104  *                                      features, and topology are found in this
 105  *                                      region.
 106  *
 107  * To request a leaf, you place the desired leaf into %eax, zero %ebx, %ecx,
 108  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
 109  * the ranges, one of the primary things returned is the maximum valid leaf in
 110  * that range. This allows for discovery of what range of CPUID is valid.
 111  *
 112  * The CPUs have potentially surprising behavior when using an invalid leaf or
 113  * unimplemented leaf. If the requested leaf is within the valid basic or
 114  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
 115  * set to zero. However, if you specify a leaf that is outside of a valid range,
 116  * then instead it will be filled with the last valid _basic_ leaf. For example,
 117  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
 118  * an invalid extended leaf will return the information for leaf 3.
 119  *
 120  * Some leaves are broken down into sub-leaves. This means that the value
 121  * depends on both the leaf asked for in %eax and a secondary register. For
 122  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
 123  * additional information. Or when getting topology information in leaf 0xb, the
 124  * initial value in %ecx changes which level of the topology that you are
 125  * getting information about.
 126  *
 127  * cpuid values are always kept to 32 bits regardless of whether or not the
 128  * program is in 64-bit mode. When executing in 64-bit mode, the upper
 129  * 32 bits of the register are always set to zero so that way the values are the
 130  * same regardless of execution mode.
 131  *
 132  * ----------------------
 133  * Identifying Processors
 134  * ----------------------
 135  *
 136  * We can identify a processor in two steps. The first step looks at cpuid leaf
 137  * 0. Leaf 0 contains the processor's vendor information. This is done by
 138  * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
 139  * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
 140  *
 141  * From there, a processor is identified by a combination of three different
 142  * values:
 143  *
 144  *  1. Family
 145  *  2. Model
 146  *  3. Stepping
 147  *
 148  * Each vendor uses the family and model to uniquely identify a processor. The
 149  * way that family and model are changed depends on the vendor. For example,
 150  * Intel has been using family 0x6 for almost all of their processors since the
 151  * Pentium Pro/Pentium II era, often called the P6. The model is used to
 152  * identify the exact processor. Different models are often used for the client
 153  * (consumer) and server parts. Even though each processor often has major
 154  * architectural differences, they still are considered the same family by
 155  * Intel.
 156  *
 157  * On the other hand, each major AMD architecture generally has its own family.
 158  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
 159  * the model number is used to help identify specific processors.
 160  *
 161  * The stepping is used to refer to a revision of a specific microprocessor. The
 162  * term comes from equipment used to produce masks that are used to create
 163  * integrated circuits.
 164  *
 165  * The information is present in leaf 1, %eax. In technical documentation you
 166  * will see the terms extended model and extended family. The original family,
 167  * model, and stepping fields were each 4 bits wide. If the values in either
 168  * are 0xf, then one is to consult the extended model and extended family, which
 169  * take previously reserved bits and allow for a larger number of models and add
 170  * 0xf to them.
 171  *
 172  * When we process this information, we store the full family, model, and
 173  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
 174  * cpi_step, respectively. Whenever you are performing comparisons with the
 175  * family, model, and stepping, you should use these members and not the raw
 176  * values from cpuid. If you must use the raw values from cpuid directly, you
 177  * must make sure that you add the extended model and family to the base model
 178  * and family.
 179  *
 180  * In general, we do not use information about the family, model, and stepping
 181  * to determine whether or not a feature is present; that is generally driven by
 182  * specific leaves. However, when something we care about on the processor is
 183  * not considered 'architectural' meaning that it is specific to a set of
 184  * processors and not promised in the architecture model to be consistent from
 185  * generation to generation, then we will fall back on this information. The
 186  * most common cases where this comes up are when we have to work around errata
 187  * in the processor, are dealing with processor-specific features such as CPU
 188  * performance counters, or we want to provide additional information for things
 189  * such as fault management.
 190  *
 191  * While processors also do have a brand string, which is the name that people
 192  * are familiar with when buying the processor, it is not meant for
 193  * programmatic consumption. That is what the family, model, and stepping are
 194  * for.
 195  *
 196  * ------------
 197  * CPUID Passes
 198  * ------------
 199  *
 200  * As part of performing feature detection, we break this into several different
 201  * passes. The passes are as follows:
 202  *
 203  *      Pass 0          This is a primordial pass done in locore.s to deal with
 204  *                      Cyrix CPUs that don't support cpuid. The reality is that
 205  *                      we likely don't run on them any more, but there is still
 206  *                      logic for handling them.
 207  *
 208  *      Pass 1          This is the primary pass and is responsible for doing a
 209  *                      large number of different things:
 210  *
 211  *                      1. Determining which vendor manufactured the CPU and
 212  *                      determining the family, model, and stepping information.
 213  *
 214  *                      2. Gathering a large number of feature flags to
 215  *                      determine which features the CPU supports and which
 216  *                      indicate things that we need to do other work in the OS
 217  *                      to enable. Features detected this way are added to the
 218  *                      x86_featureset which can be queried to
 219  *                      determine what we should do. This includes processing
 220  *                      all of the basic and extended CPU features that we care
 221  *                      about.
 222  *
 223  *                      3. Determining the CPU's topology. This includes
 224  *                      information about how many cores and threads are present
 225  *                      in the package. It also is responsible for figuring out
 226  *                      which logical CPUs are potentially part of the same core
 227  *                      and what other resources they might share. For more
 228  *                      information see the 'Topology' section.
 229  *
 230  *                      4. Determining the set of CPU security-specific features
 231  *                      that we need to worry about and determine the
 232  *                      appropriate set of workarounds.
 233  *
 234  *                      Pass 1 on the boot CPU occurs before KMDB is started.
 235  *
 236  *      Pass 2          The second pass is done after startup(). Here, we check
 237  *                      other miscellaneous features. Most of this is gathering
 238  *                      additional basic and extended features that we'll use in
 239  *                      later passes or for debugging support.
 240  *
 241  *      Pass 3          The third pass occurs after the kernel memory allocator
 242  *                      has been fully initialized. This gathers information
 243  *                      where we might need dynamic memory available for our
 244  *                      uses. This includes several varying width leaves that
 245  *                      have cache information and the processor's brand string.
 246  *
 247  *      Pass 4          The fourth and final normal pass is performed after the
 248  *                      kernel has brought most everything online. This is
 249  *                      invoked from post_startup(). In this pass, we go through
 250  *                      the set of features that we have enabled and turn that
 251  *                      into the hardware auxiliary vector features that
 252  *                      userland receives. This is used by userland, primarily
 253  *                      by the run-time link-editor (RTLD), though userland
 254  *                      software could also refer to it directly.
 255  *
 256  *      Microcode       After a microcode update, we do a selective rescan of
 257  *                      the cpuid leaves to determine what features have
 258  *                      changed. Microcode updates can provide more details
 259  *                      about security related features to deal with issues like
 260  *                      Spectre and L1TF. On occasion, vendors have violated
 261  *                      their contract and removed bits. However, we don't try
 262  *                      to detect that because that puts us in a situation that
 263  *                      we really can't deal with. As such, the only thing we
 264  *                      rescan are security related features today. See
 265  *                      cpuid_pass_ucode().
 266  *
 267  * All of the passes (except pass 0) are run on all CPUs. However, for the most
 268  * part we only care about what the boot CPU says about this information and use
 269  * the other CPUs as a rough guide to sanity check that we have the same feature
 270  * set.
 271  *
 272  * We do not support running multiple logical CPUs with different, let alone
 273  * disjoint, feature sets.
 274  *
 275  * ------------------
 276  * Processor Topology
 277  * ------------------
 278  *
 279  * One of the important things that we need to do is to understand the topology
 280  * of the underlying processor. When we say topology in this case, we're trying
 281  * to understand the relationship between the logical CPUs that the operating
 282  * system sees and the underlying physical layout. Different logical CPUs may
 283  * share different resources which can have important consequences for the
 284  * performance of the system. For example, they may share caches, execution
 285  * units, and more.
 286  *
 287  * The topology of the processor changes from generation to generation and
 288  * vendor to vendor.  Along with that, different vendors use different
 289  * terminology, and the operating system itself uses occasionally overlapping
 290  * terminology. It's important to understand what this topology looks like so
 291  * one can understand the different things that we try to calculate and
 292  * determine.
 293  *
 294  * To get started, let's talk about a little bit of terminology that we've used
 295  * so far, is used throughout this file, and is fairly generic across multiple
 296  * vendors:
 297  *
 298  * CPU
 299  *      A central processing unit (CPU) refers to a logical and/or virtual
 300  *      entity that the operating system can execute instructions on. The
 301  *      underlying resources for this CPU may be shared between multiple
 302  *      entities; however, to the operating system it is a discrete unit.
 303  *
 304  * PROCESSOR and PACKAGE
 305  *
 306  *      Generally, when we use the term 'processor' on its own, we are referring
 307  *      to the physical entity that one buys and plugs into a board. However,
 308  *      because processor has been overloaded and one might see it used to mean
 309  *      multiple different levels, we will instead use the term 'package' for
 310  *      the rest of this file. The term package comes from the electrical
 311  *      engineering side and refers to the physical entity that encloses the
 312  *      electronics inside. Strictly speaking the package can contain more than
 313  *      just the CPU, for example, on many processors it may also have what's
 314  *      called an 'integrated graphics processing unit (GPU)'. Because the
 315  *      package can encapsulate multiple units, it is the largest physical unit
 316  *      that we refer to.
 317  *
 318  * SOCKET
 319  *
 320  *      A socket refers to a unit on a system board (generally the motherboard)
 321  *      that can receive a package. A single package, or processor, is plugged
 322  *      into a single socket. A system may have multiple sockets. Oftentimes,
 323  *      the term socket is used interchangeably with package and refers to the
 324  *      electrical component that is plugged in, and not the receptacle itself.
 325  *
 326  * CORE
 327  *
 328  *      A core refers to the physical instantiation of a CPU, generally, with a
 329  *      full set of hardware resources available to it. A package may contain
 330  *      multiple cores inside of it or it may just have a single one. A
 331  *      processor with more than one core is often referred to as 'multi-core'.
 332  *      In illumos, we will use the feature X86FSET_CMP to refer to a system
 333  *      that has 'multi-core' processors.
 334  *
 335  *      A core may expose a single logical CPU to the operating system, or it
 336  *      may expose multiple CPUs, which we call threads, defined below.
 337  *
 338  *      Some resources may still be shared by cores in the same package. For
 339  *      example, many processors will share the level 3 cache between cores.
 340  *      Some AMD generations share hardware resources between cores. For more
 341  *      information on that see the section 'AMD Topology'.
 342  *
 343  * THREAD and STRAND
 344  *
 345  *      In this file, generally a thread refers to a hardware resource and not
 346  *      the operating system's logical abstraction. A thread is always exposed
 347  *      as an independent logical CPU to the operating system. A thread belongs
 348  *      to a specific core. A core may have more than one thread. When that is
 349  *      the case, the threads that are part of the same core are often referred
 350  *      to as 'siblings'.
 351  *
 352  *      When multiple threads exist, this is generally referred to as
 353  *      simultaneous multi-threading (SMT). When Intel introduced this in their
 354  *      processors they called it hyper-threading (HT). When multiple threads
 355  *      are active in a core, they split the resources of the core. For example,
 356  *      two threads may share the same set of hardware execution units.
 357  *
 358  *      The operating system often uses the term 'strand' to refer to a thread.
 359  *      This helps disambiguate it from the software concept.
 360  *
 361  * CHIP
 362  *
 363  *      Unfortunately, the term 'chip' is dramatically overloaded. At its most
 364  *      base meaning, it is used to refer to a single integrated circuit, which
 365  *      may or may not be the only thing in the package. In illumos, when you
 366  *      see the term 'chip' it is almost always referring to the same thing as
 367  *      the 'package'. However, many vendors may use chip to refer to one of
 368  *      many integrated circuits that have been placed in the package. As an
 369  *      example, see the subsequent definition.
 370  *
 371  *      To try and keep things consistent, we will only use chip when referring
 372  *      to the entire integrated circuit package, with the exception of the
 373  *      definition of multi-chip module (because it is in the name) and use the
 374  *      term 'die' when we want the more general, potential sub-component
 375  *      definition.
 376  *
 377  * DIE
 378  *
 379  *      A die refers to an integrated circuit. Inside of the package there may
 380  *      be a single die or multiple dies. This is sometimes called a 'chip' in
 381  *      vendor's parlance, but in this file, we use the term die to refer to a
 382  *      subcomponent.
 383  *
 384  * MULTI-CHIP MODULE
 385  *
 386  *      A multi-chip module (MCM) refers to putting multiple distinct chips that
 387  *      are connected together in the same package. When a multi-chip design is
 388  *      used, generally each chip is manufactured independently and then joined
 389  *      together in the package. For example, on AMD's Zen microarchitecture
 390  *      (family 0x17), the package contains several dies (the second meaning of
 391  *      chip from above) that are connected together.
 392  *
 393  * CACHE
 394  *
 395  *      A cache is a part of the processor that maintains copies of recently
 396  *      accessed memory. Caches are split into levels and then into types.
 397  *      Commonly there are one to three levels, called level one, two, and
 398  *      three. The lower the level, the smaller it is, the closer it is to the
 399  *      execution units of the CPU, and the faster it is to access. The layout
 400  *      and design of the cache come in many different flavors, consult other
 401  *      resources for a discussion of those.
 402  *
 403  *      Caches are generally split into two types, the instruction and data
 404  *      cache. The caches contain what their names suggest, the instruction
 405  *      cache has executable program text, while the data cache has all other
 406  *      memory that the processor accesses. As of this writing, data is kept
 407  *      coherent between all of the caches on x86, so if one modifies program
 408  *      text before it is executed, that will be in the data cache, and the
 409  *      instruction cache will be synchronized with that change when the
 410  *      processor actually executes those instructions. This coherency also
 411  *      covers the fact that data could show up in multiple caches.
 412  *
 413  *      Generally, the lowest level caches are specific to a core. However, the
 414  *      last level cache is shared between some number of cores. The number of
 415  *      CPUs sharing this last level cache is important. This has implications
 416  *      for the choices that the scheduler makes, as accessing memory that might
 417  *      be in a remote cache after thread migration can be quite expensive.
 418  *
 419  *      Sometimes, the word cache is abbreviated with a '$', because in US
 420  *      English the word cache is pronounced the same as cash. So L1D$ refers to
 421  *      the L1 data cache, and L2$ would be the L2 cache. This will not be used
 422  *      in the rest of this theory statement for clarity.
 423  *
 424  * MEMORY CONTROLLER
 425  *
 426  *      The memory controller is a component that provides access to DRAM. Each
 427  *      memory controller can access a set number of DRAM channels. Each channel
 428  *      can have a number of DIMMs (sticks of memory) associated with it. A
 429  *      given package may have more than one memory controller. The association
 430  *      of the memory controller to a group of cores is important as it is
 431  *      cheaper to access memory on the controller that you are associated with.
 432  *
 433  * NUMA
 434  *
 435  *      NUMA or non-uniform memory access, describes a way that systems are
 436  *      built. On x86, any processor core can address all of the memory in the
 437  *      system. However, when using multiple sockets or possibly within a
 438  *      multi-chip module, some of that memory is physically closer and some of
 439  *      it is further. Memory that is further away is more expensive to access.
 440  *      Consider the following image of multiple sockets with memory:
 441  *
 442  *      +--------+                                                +--------+
 443  *      | DIMM A |         +----------+      +----------+         | DIMM D |
 444  *      +--------+-+       |          |      |          |       +-+------+-+
 445  *        | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
 446  *        +--------+-+     |          |      |          |     +-+------+-+
 447  *          | DIMM C |     +----------+      +----------+     | DIMM F |
 448  *          +--------+                                        +--------+
 449  *
 450  *      In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
 451  *      closer to DIMMs D-F. This means that it is cheaper for socket 0 to
 452  *      access DIMMs A-C and more expensive to access D-F as it has to go
 453  *      through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
 454  *      D-F are cheaper than A-C. While the socket form is the most common, when
 455  *      using multi-chip modules, this can also sometimes occur. For another
 456  *      example of this that's more involved, see the AMD topology section.
 457  *
 458  *
 459  * Intel Topology
 460  * --------------
 461  *
 462  * Most Intel processors since Nehalem (as of this writing, the current gen
 463  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
 464  * the package is a single monolithic die. MCMs currently aren't used. Most
 465  * parts have three levels of caches, with the L3 cache being shared between
 466  * all of the cores on the package. The L1/L2 cache is generally specific to
 467  * an individual core. The following image shows at a simplified level what
 468  * this looks like. The memory controller is commonly part of something called
 469  * the 'Uncore', that used to be separate physical chips that were not a part of
 470  * the package, but are now part of the same chip.
 471  *
 472  *  +-----------------------------------------------------------------------+
 473  *  | Package                                                               |
 474  *  |  +-------------------+  +-------------------+  +-------------------+  |
 475  *  |  | Core              |  | Core              |  | Core              |  |
 476  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 477  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
 478  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
 479  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
 480  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
 481  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 482  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 483  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
 484  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 485  *  |  +-------------------+  +-------------------+  +-------------------+  |
 486  *  | +-------------------------------------------------------------------+ |
 487  *  | |                         Shared L3 Cache                           | |
 488  *  | +-------------------------------------------------------------------+ |
 489  *  | +-------------------------------------------------------------------+ |
 490  *  | |                        Memory Controller                          | |
 491  *  | +-------------------------------------------------------------------+ |
 492  *  +-----------------------------------------------------------------------+
 493  *
 494  * A side effect of this current architecture is that what we care about from a
 495  * scheduling and topology perspective is simplified. In general we care about
 496  * understanding which logical CPUs are part of the same core and socket.
 497  *
 498  * To determine the relationship between threads and cores, Intel initially used
 499  * the identifier in the advanced programmable interrupt controller (APIC). They
 500  * also added cpuid leaf 4 to give additional information about the number of
 501  * threads and CPUs in the processor. With the addition of x2apic (which
 502  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
 503  * additional cpuid topology leaf 0xB was added.
 504  *
 505  * AMD Topology
 506  * ------------
 507  *
 508  * When discussing AMD topology, we want to break this into three distinct
 509  * generations of topology. There's the basic topology that has been used in
 510  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
 511  * with family 0x15 (Bulldozer), and there's the topology that was introduced
 512  * with family 0x17 (Zen). AMD also has some additional terminology that's worth
 513  * talking about.
 514  *
 515  * Until the introduction of family 0x17 (Zen), AMD did not implement something
 516  * that they considered SMT. Whether or not the AMD processors have SMT
 517  * influences many things including scheduling and reliability, availability,
 518  * and serviceability (RAS) features.
 519  *
 520  * NODE
 521  *
 522  *      AMD uses the term node to refer to a die that contains a number of cores
 523  *      and I/O resources. Depending on the processor family and model, more
 524  *      than one node can be present in the package. When there is more than one
 525  *      node this indicates a multi-chip module. Usually each node has its own
 526  *      access to memory and I/O devices. This is important and generally
 527  *      different from the corresponding Intel Nehalem-Skylake+ processors. As a
 528  *      result, we track this relationship in the operating system.
 529  *
 530  *      In processors with an L3 cache, the L3 cache is generally shared across
 531  *      the entire node, though the way this is carved up varies from generation
 532  *      to generation.
 533  *
 534  * BULLDOZER
 535  *
 536  *      Starting with the Bulldozer family (0x15) and continuing until the
 537  *      introduction of the Zen microarchitecture, AMD introduced the idea of a
 538  *      compute unit. In a compute unit, two traditional cores share a number of
 539  *      hardware resources. Critically, they share the FPU, L1 instruction
 540  *      cache, and the L2 cache. Several compute units were then combined inside
 541  *      of a single node.  Because the integer execution units, L1 data cache,
 542  *      and some other resources were not shared between the cores, AMD never
 543  *      considered this to be SMT.
 544  *
 545  * ZEN
 546  *
 547  *      The Zen family (0x17) uses a multi-chip module (MCM) design; the module
 548  *      is called Zeppelin. These modules are similar to the idea of nodes used
 549  *      previously. Each of these nodes has two DRAM channels which all of the
 550  *      cores in the node can access uniformly. These nodes are linked together
 551  *      in the package, creating a NUMA environment.
 552  *
 553  *      The Zeppelin die itself contains two different 'core complexes'. Each
 554  *      core complex consists of four cores which each have two threads, for a
 555  *      total of 8 logical CPUs per complex. Unlike other generations,
 556  *      where all the logical CPUs in a given node share the L3 cache, here each
 557  *      core complex has its own shared L3 cache.
 558  *
 559  *      A further thing that we need to consider is that in some configurations,
 560  *      particularly with the Threadripper line of processors, not every die
 561  *      actually has its memory controllers wired up to actual memory channels.
 562  *      This means that some cores have memory attached to them and others
 563  *      don't.
 564  *
 565  *      To put Zen in perspective, consider the following images:
 566  *
 567  *      +--------------------------------------------------------+
 568  *      | Core Complex                                           |
 569  *      | +-------------------+    +-------------------+  +---+  |
 570  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
 571  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
 572  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
 573  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
 574  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
 575  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 576  *      | +-------------------+    +-------------------+  | C |  |
 577  *      | +-------------------+    +-------------------+  | a |  |
 578  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
 579  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
 580  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
 581  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
 582  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
 583  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 584  *      | +-------------------+    +-------------------+  +---+  |
 585  *      |                                                        |
 586  *      +--------------------------------------------------------+
 587  *
 588  *  This first image represents a single Zen core complex that consists of four
 589  *  cores.
 590  *
 591  *
 592  *      +--------------------------------------------------------+
 593  *      | Zeppelin Die                                           |
 594  *      |  +--------------------------------------------------+  |
 595  *      |  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
 596  *      |  +--------------------------------------------------+  |
 597  *      |                           HH                           |
 598  *      |          +-----------+    HH    +-----------+          |
 599  *      |          |           |    HH    |           |          |
 600  *      |          |    Core   |==========|    Core   |          |
 601  *      |          |  Complex  |==========|  Complex  |          |
 602  *      |          |           |    HH    |           |          |
 603  *      |          +-----------+    HH    +-----------+          |
 604  *      |                           HH                           |
 605  *      |  +--------------------------------------------------+  |
 606  *      |  |                Memory Controller                 |  |
 607  *      |  +--------------------------------------------------+  |
 608  *      |                                                        |
 609  *      +--------------------------------------------------------+
 610  *
 611  *  This image represents a single Zeppelin Die. Note how both core complexes
 612  *  are connected to the same memory controller and I/O units. While each core
 613  *  complex has its own L3 cache as seen in the first image, they both have
 614  *  uniform access to memory.
 615  *
 616  *
 617  *                      PP                     PP
 618  *                      PP                     PP
 619  *           +----------PP---------------------PP---------+
 620  *           |          PP                     PP         |
 621  *           |    +-----------+          +-----------+    |
 622  *           |    |           |          |           |    |
 623  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 624  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 625  *           |    |           |          |           |    |
 626  *           |    +-----------+ooo    ...+-----------+    |
 627  *           |          HH      ooo  ...       HH         |
 628  *           |          HH        oo..         HH         |
 629  *           |          HH        ..oo         HH         |
 630  *           |          HH      ...  ooo       HH         |
 631  *           |    +-----------+...    ooo+-----------+    |
 632  *           |    |           |          |           |    |
 633  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 634  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 635  *           |    |           |          |           |    |
 636  *           |    +-----------+          +-----------+    |
 637  *           |          PP                     PP         |
 638  *           +----------PP---------------------PP---------+
 639  *                      PP                     PP
 640  *                      PP                     PP
 641  *
 642  *  This image represents a single Zen package. In this example, it has four
 643  *  Zeppelin dies, though some configurations only have a single one. In this
 644  *  example, each die is directly connected to the next. Also, each die is
 645  *  represented as being connected to memory by the 'M' character and connected
 646  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
 647  *  die is made up of two core complexes, we have multiple different NUMA
 648  *  domains that we care about for these systems.
 649  *
 650  * CPUID LEAVES
 651  *
 652  * There are a few different CPUID leaves that we can use to try and understand
 653  * the actual state of the world. As part of the introduction of family 0xf, AMD
 654  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
 655  * processors that are in the system. Because families before Zen didn't have
 656  * SMT, this was always the number of cores that were in the system. However, it
 657  * should always be thought of as the number of logical threads to be consistent
 658  * between generations. In addition we also get the size of the APIC ID that is
 659  * used to represent the number of logical processors. This is important for
 660  * deriving topology information.
 661  *
 662  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
 663  * bit between Bulldozer and later families, but it is quite useful in
 664  * determining the topology information. Because this information has changed
 665  * across family generations, it's worth calling out what these mean
 666  * explicitly. The registers have the following meanings:
 667  *
 668  *      %eax    The APIC ID. The entire register is defined to have a 32-bit
 669  *              APIC ID, even though on systems without x2apic support, it will
 670  *              be limited to 8 bits.
 671  *
 672  *      %ebx    On Bulldozer-era systems this contains information about the
 673  *              number of cores that are in a compute unit (cores that share
 674  *              resources). It also contains a per-package compute unit ID that
 675  *              identifies which compute unit the logical CPU is a part of.
 676  *
 677  *              On Zen-era systems this instead contains the number of threads
 678  *              per core and the ID of the core that the logical CPU is a part
 679  *              of. Note, this ID is unique only to the package, it is not
 680  *              globally unique across the entire system.
 681  *
 682  *      %ecx    This contains the number of nodes that exist in the package. It
 683  *              also contains an ID that identifies which node the logical CPU
 684  *              is a part of.
 685  *
 686  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
 687  * cache layout to determine which logical CPUs are sharing which caches.
 688  *
 689  * illumos Topology
 690  * ----------------
 691  *
 692  * Based on the above we synthesize the information into several different
 693  * variables that we store in the 'struct cpuid_info'. We'll go into the details
 694  * of what each member is supposed to represent and their uniqueness. In
 695  * general, there are two levels of uniqueness that we care about. We care about
 696  * an ID that is globally unique. That means that it will be unique across all
 697  * entities in the system. For example, the default logical CPU ID is globally
 698  * unique. On the other hand, there is some information that we only care about
 699  * being unique within the context of a single package / socket. Here are the
 700  * variables that we keep track of and their meaning.
 701  *
 702  * Several of the values that are asking for an identifier, with the exception
 703  * of cpi_apicid, are allowed to be synthetic.
 704  *
 705  *
 706  * cpi_apicid
 707  *
 708  *      This is the value of the CPU's APIC id. This should be the full 32-bit
 709  *      ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
 710  *      APIC ID. This value is globally unique between all logical CPUs across
 711  *      all packages. This is usually required by the APIC.
 712  *
 713  * cpi_chipid
 714  *
 715  *      This value indicates the ID of the package that the logical CPU is a
 716  *      part of. This value is allowed to be synthetic. It is usually derived by
 717  *      taking the CPU's APIC ID and determining how many bits are used to
 718  *      represent CPU cores in the package. All logical CPUs that are part of
 719  *      the same package must have the same value.
 720  *
 721  * cpi_coreid
 722  *
 723  *      This represents the ID of a CPU core. Two logical CPUs should only have
 724  *      the same cpi_coreid value if they are part of the same core. These
 725  *      values may be synthetic. On systems that support SMT, this value is
 726  *      usually derived from the APIC ID, otherwise it is often synthetic and
 727  *      just set to the value of the cpu_id in the cpu_t.
 728  *
 729  * cpi_pkgcoreid
 730  *
 731  *      This is similar to the cpi_coreid in that logical CPUs that are part of
 732  *      the same core should have the same ID. The main difference is that these
 733  *      values are only required to be unique to a given socket.
 734  *
 735  * cpi_clogid
 736  *
 737  *      This represents the logical ID of a logical CPU. This value should be
 738  *      unique within a given socket for each logical CPU. This is allowed to be
 739  *      synthetic, though it is usually based off of the CPU's apic ID. The
 740  *      broader system expects that logical CPUs that are part of the same
 741  *      core have contiguous numbers. For example, if there were two threads per
 742  *      core, the IDs divided by two should be equal and the IDs modulo two
 743  *      should be zero and one, respectively. Thus, IDs 4 and 5 indicate two
 744  *      logical CPUs that are part of the same core, but IDs 5 and 6 indicate
 745  *      two logical CPUs that are part of different cores.
 746  *
 747  *      While it is common for the cpi_coreid and the cpi_clogid to be derived
 748  *      from the same source, strictly speaking, they don't have to be and the
 749  *      two values should be considered logically independent. One should not
 750  *      try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
 751  *      some kind of relationship. While this is tempting, we've seen cases on
 752  *      AMD family 0xf where the system's cpu id is not related to its APIC ID.
 753  *
 754  * cpi_ncpu_per_chip
 755  *
 756  *      This value indicates the total number of logical CPUs that exist in the
 757  *      physical package. Critically, this is not the number of logical CPUs
 758  *      that exist for just the single core.
 759  *
 760  *      This value should be the same for all logical CPUs in the same package.
 761  *
 762  * cpi_ncore_per_chip
 763  *
 764  *      This value indicates the total number of physical CPU cores that exist
 765  *      in the package. The system compares this value with cpi_ncpu_per_chip to
 766  *      determine if simultaneous multi-threading (SMT) is enabled. When
 767  *      cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
 768  *      the X86FSET_HTT feature is not set. If this value is greater than one,
 769  *      then we consider the processor to have the feature X86FSET_CMP, to
 770  *      indicate that there is support for more than one core.
 771  *
 772  *      This value should be the same for all logical CPUs in the same package.
 773  *
 774  * cpi_procnodes_per_pkg
 775  *
 776  *      This value indicates the number of 'nodes' that exist in the package.
 777  *      When processors are actually a multi-chip module, this represents the
 778  *      number of such modules that exist in the package. Currently, on Intel
 779  *      based systems this member is always set to 1.
 780  *
 781  *      This value should be the same for all logical CPUs in the same package.
 782  *
 783  * cpi_procnodeid
 784  *
 785  *      This value indicates the ID of the node that the logical CPU is a part
 786  *      of. All logical CPUs that are in the same node must have the same value
 787  *      here. This value must be unique across all of the packages in the
 788  *      system.  On Intel based systems, this is currently set to the value in
 789  *      cpi_chipid because there is only one node.
 790  *
 791  * cpi_cores_per_compunit
 792  *
 793  *      This value indicates the number of cores that are part of a compute
 794  *      unit. See the AMD topology section for this. This member only has real
 795  *      meaning currently for AMD Bulldozer family processors. For all other
 796  *      processors, this should currently be set to 1.
 797  *
 798  * cpi_compunitid
 799  *
 800  *      This indicates the compute unit that the logical CPU belongs to. For
 801  *      processors without AMD Bulldozer-style compute units this should be set
 802  *      to the value of cpi_coreid.
 803  *
 804  * cpi_ncpu_shr_last_cache
 805  *
 806  *      This indicates the number of logical CPUs that are sharing the same last
 807  *      level cache. This value should be the same for all CPUs that are sharing
 808  *      that cache. The last cache refers to the cache that is closest to memory
 809  *      and furthest away from the CPU.
 810  *
 811  * cpi_last_lvl_cacheid
 812  *
 813  *      This indicates the ID of the last cache that the logical CPU uses. This
 814  *      cache is often shared between multiple logical CPUs and is the cache
 815  *      that is closest to memory and furthest away from the CPU. This value
 816  *      should be the same for a group of logical CPUs only if they actually
 817  *      share the same last level cache. IDs should not overlap between
 818  *      packages.
 819  *
 820  * cpi_ncore_bits
 821  *
 822  *      This indicates the number of bits that are required to represent all of
 823  *      the cores in the system. As cores are derived based on their APIC IDs,
 824  *      we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
 825  *      this value to be larger than the actual number of IDs that are present
 826  *      in the system. This is used to size tables by the CMI framework. It is
 827  *      only filled in for Intel and AMD CPUs.
 828  *
 829  * cpi_nthread_bits
 830  *
 831  *      This indicates the number of bits required to represent all of the IDs
 832  *      that cover the logical CPUs that exist on a given core. It's OK for this
 833  *      value to be larger than the actual number of IDs that are present in the
 834  *      system.  This is used to size tables by the CMI framework. It is
 835  *      only filled in for Intel and AMD CPUs.
 836  *
 837  * -----------
 838  * Hypervisors
 839  * -----------
 840  *
 841  * If trying to manage the differences between vendors wasn't bad enough, it can
 842  * get worse thanks to our friend hardware virtualization. Hypervisors are given
 843  * the ability to interpose on all cpuid instructions and change them to suit
 844  * their purposes. In general, this is necessary as the hypervisor wants to be
 845  * able to present a more uniform set of features or not necessarily give the
 846  * guest operating system kernel knowledge of all features so it can be
 847  * more easily migrated between systems.
 848  *
 849  * When it comes to trying to determine topology information, this can be a
 850  * double edged sword. When a hypervisor doesn't actually implement a cpuid
 851  * leaf, it'll often return all zeros. Because of that, you'll often see various
 852  * checks scattered about fields being non-zero before we assume we can use
 853  * them.
 854  *
 855  * When it comes to topology information, the hypervisor is often incentivized
 856  * to lie to you about topology. This is because it doesn't always actually
 857  * guarantee that topology at all. The topology path we take in the system
 858  * depends on how the CPU advertises itself. If it advertises itself as an Intel
 859  * or AMD CPU, then we basically do our normal path. However, when they don't
 860  * use an actual vendor, then that usually turns into multiple one-core CPUs
 861  * that we enumerate that are often on different sockets. The actual behavior
 862  * depends greatly on what the hypervisor actually exposes to us.
 863  *
 864  * --------------------
 865  * Exposing Information
 866  * --------------------
 867  *
 868  * We expose CPUID information in three different forms in the system.
 869  *
 870  * The first is through the x86_featureset variable. This is used in conjunction
 871  * with the is_x86_feature() function. This is queried by x86-specific functions
 872  * to determine which features are or aren't present in the system and to make
 873  * decisions based upon them. For example, users of this include everything from
 874  * parts of the system dedicated to reliability, availability, and
 875  * serviceability (RAS), to making decisions about how to handle security
 876  * mitigations, to various x86-specific drivers. General purpose or
 877  * architecture independent drivers should never be calling this function.
 878  *
 879  * The second means is through the auxiliary vector. The auxiliary vector is a
 880  * series of tagged data that the kernel passes down to a user program when it
 881  * begins executing. This information is used to indicate to programs what
 882  * instruction set extensions are present. For example, information about the
 883  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
 884  * since user programs cannot make use of it. However, things like the AVX
 885  * instruction sets are. Programs use this information to make run-time
 886  * decisions about what features they should use. As an example, the run-time
 887  * link-editor (rtld) can relocate different functions depending on the hardware
 888  * support available.
 889  *
 890  * The final form is through a series of accessor functions that all have the
 891  * form cpuid_get*. This is used by a number of different subsystems in the
 892  * kernel to determine more detailed information about what we're running on,
 893  * topology information, etc. Some of these subsystems include processor groups
 894  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
 895  * microcode, and performance monitoring. These functions all ASSERT that the
 896  * CPU they're being called on has reached a certain cpuid pass. If the passes
 897  * are rearranged, then this needs to be adjusted.
 898  *
 899  * -----------------------------------------------
 900  * Speculative Execution CPU Side Channel Security
 901  * -----------------------------------------------
 902  *
 903  * With the advent of the Spectre and Meltdown attacks which exploit speculative
 904  * execution in the CPU to create side channels there have been a number of
 905  * different attacks and corresponding issues that the operating system needs to
 906  * mitigate against. The following is a non-exhaustive list of common issues
 907  * that we know about and have either done some work or need to do more work
 908  * in the system to mitigate against:
 909  *
 910  *   - Spectre v1
 911  *   - swapgs (Spectre v1 variant)
 912  *   - Spectre v2
 913  *   - Meltdown (Spectre v3)
 914  *   - Rogue Register Read (Spectre v3a)
 915  *   - Speculative Store Bypass (Spectre v4)
 916  *   - ret2spec, SpectreRSB
 917  *   - L1 Terminal Fault (L1TF)
 918  *   - Microarchitectural Data Sampling (MDS)
 919  *
 920  * Each of these requires different sets of mitigations and has different attack
 921  * surfaces. For the most part, this discussion is about protecting the kernel
 922  * from non-kernel executing environments such as user processes and hardware
 923  * virtual machines. Unfortunately, there are a number of user vs. user
 924  * scenarios that exist with these. The rest of this section will describe the
 925  * overall approach that the system has taken to address these as well as their
 926  * shortcomings. Unfortunately, not all of the above have been handled today.
 927  *
 928  * SPECTRE v2, ret2spec, SpectreRSB
 929  *
 930  * The second variant of the spectre attack focuses on performing branch target
 931  * injection. This generally impacts indirect call instructions in the system.
 932  * There are three different ways to mitigate this issue that are commonly
 933  * described today:
 934  *
 935  *  1. Using Indirect Branch Restricted Speculation (IBRS).
 936  *  2. Using Retpolines and RSB Stuffing
 937  *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
 938  *
 939  * IBRS uses a feature added to microcode to restrict speculation, among other
 940  * things. This form of mitigation has not been used as it has been generally
 941  * seen as too expensive and requires reactivation upon various transitions in
 942  * the system.
 943  *
 944  * As a less impactful alternative to IBRS, retpolines were developed by
 945  * Google. These basically require one to replace indirect calls with a specific
 946  * trampoline that will cause speculation to fail and break the attack.
 947  * Retpolines require compiler support. We always build with retpolines in the
 948  * external thunk mode. This means that a traditional indirect call is replaced
 949  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
 950  * of this is that all indirect function calls are performed through a register.
 951  *
 952  * We have to use a common external location of the thunk and not inline it into
 953  * the callsite so that way we can have a single place to patch these functions.
 954  * As it turns out, we actually have three different forms of retpolines that
 955  * exist in the system:
 956  *
 957  *  1. A full retpoline
 958  *  2. An AMD-specific optimized retpoline
 959  *  3. A no-op version
 960  *
 961  * The first one is used in the general case. The second one is used if we can
 962  * determine that we're on an AMD system and we can successfully toggle the
 963  * lfence serializing MSR that exists on the platform. Basically with this
 964  * present, an lfence is sufficient and we don't need to do anywhere near as
 965  * complicated a dance to successfully use retpolines.
 966  *
 967  * The third form described above is the most curious. It turns out that the way
 968  * that retpolines are implemented is that they rely on how speculation is
 969  * performed on a 'ret' instruction. Intel has continued to optimize this
 970  * process (which is partly why we need to have return stack buffer stuffing,
 971  * but more on that in a bit) and in processors starting with Cascade Lake
 972  * on the server side, it's dangerous to rely on retpolines. Instead, a new
 973  * mechanism has been introduced called Enhanced IBRS (EIBRS).
 974  *
 975  * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
 976  * physical core. However, if this is the case, we don't want to use retpolines
 977  * any more. Therefore if EIBRS is present, we end up turning each retpoline
 978  * function (called a thunk) into a jmp instruction. This means that we're still
 979  * paying the cost of an extra jump to the external thunk, but it gives us
 980  * flexibility and the ability to have a single kernel image that works across a
 981  * wide variety of systems and hardware features.
 982  *
 983  * Unfortunately, this alone is insufficient. First, Skylake systems have
 984  * additional speculation for the Return Stack Buffer (RSB) which is used to
 985  * return from call instructions which retpolines take advantage of. However,
 986  * this problem is not just limited to Skylake and is actually more pernicious.
 987  * The SpectreRSB paper introduces several more problems that can arise with
 988  * dealing with this. The RSB can be poisoned just like the indirect branch
 989  * predictor. This means that one needs to clear the RSB when transitioning
 990  * between two different privilege domains. Some examples include:
 991  *
 992  *  - Switching between two different user processes
 993  *  - Going between user land and the kernel
 994  *  - Returning to the kernel from a hardware virtual machine
 995  *
 996  * Mitigating this involves combining a couple of different things. The first is
 997  * SMEP (supervisor mode execution protection) which was introduced in Ivy
 998  * Bridge. When an RSB entry refers to a user address and we're executing in the
 999  * kernel, speculation through it will be stopped when SMEP is enabled. This
1000  * protects against a number of the different cases that we would normally be
1001  * worried about such as when we enter the kernel from user land.
1002  *
1003  * To protect against additional manipulation of the RSB from other contexts
1004  * such as a non-root VMX context attacking the kernel we first look to enhanced
1005  * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1006  * need to do to protect the kernel at this time.
1007  *
1008  * On CPUs without EIBRS we need to manually overwrite the contents of the
1009  * return stack buffer. We do this through the x86_rsb_stuff() function.
1010  * Currently this is employed on context switch. The x86_rsb_stuff() function is
1011  * disabled when enhanced IBRS is present because Intel claims on such systems
1012  * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1013  * to user attacks via the RSB.
1014  *
1015  * If SMEP is not present, then we would have to stuff the RSB every time we
1016  * transitioned from user mode to the kernel, which isn't very practical right
1017  * now.
1018  *
1019  * To fully prevent user to user and vmx to vmx attacks from these classes of
1020  * issues, we would also need to allow them to opt into performing an Indirect
1021  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1022  *
1023  * By default, the system will enable RSB stuffing and the required variant of
1024  * retpolines and store that information in the x86_spectrev2_mitigation value.
1025  * This will be evaluated after a microcode update as well, though it is
1026  * expected that microcode updates will not take away features. This may mean
1027  * that a late loaded microcode may not end up in the optimal configuration
1028  * (though this should be rare).
1029  *
1030  * Currently we do not build kmdb with retpolines or perform any additional side
1031  * channel security mitigations for it. One complication with kmdb is that it
1032  * requires its own retpoline thunks and it would need to adjust itself based on
1033  * what the kernel does. The threat model of kmdb is more limited and therefore
1034  * it may make more sense to investigate using prediction barriers as the whole
1035  * system is only executing a single instruction at a time while in kmdb.
1036  *
1037  * SPECTRE v1, v4
1038  *
1039  * The v1 and v4 variants of spectre are not currently mitigated in the
1040  * system and require other classes of changes to occur in the code.
1041  *
1042  * SPECTRE v1 (SWAPGS VARIANT)
1043  *
1044  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1045  * can generally affect any branch-dependent code. The swapgs issue is one
1046  * variant of this. If we are coming in from userspace, we can have code like
1047  * this:
1048  *
1049  *      cmpw    $KCS_SEL, REGOFF_CS(%rsp)
1050  *      je      1f
1051  *      movq    $0, REGOFF_SAVFP(%rsp)
1052  *      swapgs
1053  *      1:
1054  *      movq    %gs:CPU_THREAD, %rax
1055  *
1056  * If an attacker can cause a mis-speculation of the branch here, we could skip
1057  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1058  * load. If subsequent code can act as the usual Spectre cache gadget, this
1059  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1060  * any use of the %gs override.
1061  *
1062  * The other case is also an issue: if we're coming into a trap from kernel
1063  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1064  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1065  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1066  * case, and the fix is the same in both cases (an lfence at the branch target
1067  * 1: in this example), we'll just do it unconditionally.
1068  *
1069  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1070  * harder for user-space to actually set a useful %gsbase value: although it's
1071  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1072  * mitigate anyway.
1073  *
1074  * MELTDOWN
1075  *
1076  * Meltdown, or spectre v3, allowed a user process to read any data in their
1077  * address space regardless of whether or not the page tables in question
1078  * allowed the user to have the ability to read them. The solution to meltdown
1079  * is kernel page table isolation. In this world, there are two page tables that
1080  * are used for a process, one in user land and one in the kernel. To implement
1081  * this we use per-CPU page tables and switch between the user and kernel
1082  * variants when entering and exiting the kernel.  For more information about
1083  * this process and how the trampolines work, please see the big theory
1084  * statements and additional comments in:
1085  *
1086  *  - uts/i86pc/ml/kpti_trampolines.s
1087  *  - uts/i86pc/vm/hat_i86.c
1088  *
1089  * While Meltdown only impacted Intel systems and there are also Intel systems
1090  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1091  * kernel page table isolation enabled. While this may at first seem weird, an
1092  * important thing to remember is that you can't speculatively read an address
1093  * if it's never in your page table at all. Having user processes without kernel
1094  * pages present provides us with an important layer of defense in the kernel
1095  * against any other side channel attacks that exist and have yet to be
1096  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1097  * default, no matter the x86 system.
1098  *
1099  * L1 TERMINAL FAULT
1100  *
1101  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1102  * execution uses page table entries. Effectively, it is two different problems.
1103  * The first is that it ignores the not present bit in the page table entries
1104  * when performing speculative execution. This means that something can
1105  * speculatively read the listed physical address if it's present in the L1
1106  * cache under certain conditions (see Intel's documentation for the full set of
1107  * conditions). Secondly, this can be used to bypass hardware virtualization
1108  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1109  * instructions.
1110  *
1111  * For the non-hardware virtualized case, this is relatively easy to deal with.
1112  * We must make sure that all unmapped pages have an address of zero. This means
1113  * that they could read the first 4k of physical memory; however, we never use
1114  * that first page in the operating system and always skip putting it in our
1115  * memory map, even if firmware tells us we can use it in our memory map. While
1116  * other systems try to put extra metadata in the address and reserved bits,
1117  * which led to this being problematic in those cases, we do not.
1118  *
1119  * For hardware virtual machines things are more complicated. Because they can
1120  * construct their own page tables, it isn't hard for them to perform this
1121  * attack against any physical address. The one wrinkle is that this physical
1122  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1123  * to flush the L1 data cache. We wrap this up in the function
1124  * spec_uarch_flush(). This function is also used in the mitigation of
1125  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1126  * hypervisors such as KVM or bhyve are responsible for performing this before
1127  * entering the guest.
1128  *
1129  * Because this attack takes place in the L1 cache, there's another wrinkle
1130  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1131  * designs. This means that when a thread enters a hardware virtualized context
1132  * and flushes the L1 data cache, the other thread on the processor may then go
1133  * ahead and put new data in it that can be potentially attacked. While one
1134  * solution is to disable SMT on the system, another option that is available is
1135  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1136  * goes through and makes sure that if a HVM is being scheduled on one thread,
1137  * then the thing on the other thread is from the same hardware virtual machine.
1138  * If an interrupt comes in or the guest exits to the broader system, then the
1139  * other SMT thread will be kicked out.
1140  *
1141  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1142  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1143  * perform L1TF related mitigations.
1144  *
1145  * MICROARCHITECTURAL DATA SAMPLING
1146  *
1147  * Microarchitectural data sampling (MDS) is a combination of four discrete
1148  * vulnerabilities that are similar issues affecting various parts of the CPU's
1149  * microarchitectural implementation around load, store, and fill buffers.
1150  * Specifically it is made up of the following subcomponents:
1151  *
1152  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1153  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1154  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1155  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1156  *
1157  * To begin addressing these, Intel has introduced another feature in microcode
1158  * called MD_CLEAR. This changes the verw instruction to operate in a different
1159  * way. This allows us to execute the verw instruction in a particular way to
1160  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1161  * updated when this microcode is present to flush this state.
1162  *
1163  * Primarily we need to flush this state whenever we transition from the kernel
1164  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1165  * little bit different. Here the structures are statically sized when a logical
1166  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1167  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1168  * mwait, or another ACPI method. To perform these flushes, we call
1169  * x86_md_clear() at all of these transition points.
1170  *
1171  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1172  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1173  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1174  * a no-op.
1175  *
1176  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1177  * particular, everything we've discussed above is only valid for a single
1178  * thread executing on a core. In the case where you have hyper-threading
1179  * present, this attack can be performed between threads. The theoretical fix
1180  * for this is to ensure that both threads are always in the same security
1181  * domain. This means that they are executing in the same ring and mutually
1182  * trust each other. Practically speaking, this would mean that a system call
1183  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1184  * Rather than implement this, we recommend that one disables hyper-threading
1185  * through the use of psradm -aS.
1186  *
1187  * TSX ASYNCHRONOUS ABORT
1188  *
1189  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1190  * behaves like MDS, but leverages Intel's transactional instructions as another
1191  * vector. Effectively, when a transaction hits one of these cases (unmapped
1192  * page, various cache snoop activity, etc.) then the same data can be exposed
1193  * as in the case of MDS. This means that you can attack your twin.
1194  *
1195  * Intel has described that there are two different ways that we can mitigate
1196  * this problem on affected processors:
1197  *
1198  *   1) We can use the same techniques used to deal with MDS. Flushing the
1199  *      microarchitectural buffers and disabling hyperthreading will mitigate
1200  *      this in the same way.
1201  *
1202  *   2) Using microcode to disable TSX.
1203  *
1204  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1205  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1206  * That's OK as we're already doing all such mitigations. On the other hand,
1207  * processors with MDS_NO are all supposed to receive microcode updates that
1208  * enumerate support for disabling TSX. In general, we'd rather use this method
1209  * when available as it doesn't require disabling hyperthreading to be
1210  * effective. Currently we basically are relying on microcode for processors
1211  * that enumerate MDS_NO.
1212  *
1213  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1214  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1215  * different powers. The first allows us to cause all transactions to
1216  * immediately abort. The second gives us a means of disabling TSX completely,
1217  * which includes removing it from cpuid. If we have support for this in
1218  * microcode during the first cpuid pass, then we'll disable TSX completely such
1219  * that user land never has a chance to observe the bit. However, if we are late
1220  * loading the microcode, then we must use the functionality to cause
1221  * transactions to automatically abort. This is necessary for user land's sake.
1222  * Once a program sees a cpuid bit, it must not be taken away.
1223  *
1224  * We track whether or not we should do this based on what cpuid pass we're in.
1225  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1226  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1227  * should happen twice. Once in the normal cpuid_pass1() code and then a second
1228  * time after we do the initial microcode update.  As a result we need to be
1229  * careful in cpuid_apply_tsx() to only use the MSR if we've loaded a suitable
1230  * microcode on the current CPU (which happens prior to cpuid_pass_ucode()).
1231  *
1232  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1233  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1234  * unfortunate feature in a number of ways, and taking the opportunity to
1235  * finally be able to turn it off is likely to be of benefit in the future.
1236  *
1237  * SUMMARY
1238  *
1239  * The following table attempts to summarize the mitigations for various issues
1240  * and what's done in various places:
1241  *
1242  *  - Spectre v1: Not currently mitigated
1243  *  - swapgs: lfences after swapgs paths
1244  *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1245  *  - Meltdown: Kernel Page Table Isolation
1246  *  - Spectre v3a: Updated CPU microcode
1247  *  - Spectre v4: Not currently mitigated
1248  *  - SpectreRSB: SMEP and RSB Stuffing
1249  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1250  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1251  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1252  *
1253  * The following table indicates the x86 feature set bits that indicate that a
1254  * given problem has been solved or a notable feature is present:
1255  *
1256  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1257  *  - MDS_NO: All forms of MDS
1258  *  - TAA_NO: TAA
1259  */
1260 
1261 #include <sys/types.h>
1262 #include <sys/archsystm.h>
1263 #include <sys/x86_archext.h>
1264 #include <sys/kmem.h>
1265 #include <sys/systm.h>
1266 #include <sys/cmn_err.h>
1267 #include <sys/sunddi.h>
1268 #include <sys/sunndi.h>
1269 #include <sys/cpuvar.h>
1270 #include <sys/processor.h>
1271 #include <sys/sysmacros.h>
1272 #include <sys/pg.h>
1273 #include <sys/fp.h>
1274 #include <sys/controlregs.h>
1275 #include <sys/bitmap.h>
1276 #include <sys/auxv_386.h>
1277 #include <sys/memnode.h>
1278 #include <sys/pci_cfgspace.h>
1279 #include <sys/comm_page.h>
1280 #include <sys/mach_mmu.h>
1281 #include <sys/ucode.h>
1282 #include <sys/tsc.h>
1283 #include <sys/kobj.h>
1284 #include <sys/asm_misc.h>
1285 
1286 #ifdef __xpv
1287 #include <sys/hypervisor.h>
1288 #else
1289 #include <sys/ontrap.h>
1290 #endif
1291 
1292 uint_t x86_vendor = X86_VENDOR_IntelClone;
1293 uint_t x86_type = X86_TYPE_OTHER;
1294 uint_t x86_clflush_size = 0;
1295 
1296 #if defined(__xpv)
1297 int x86_use_pcid = 0;
1298 int x86_use_invpcid = 0;
1299 #else
1300 int x86_use_pcid = -1;
1301 int x86_use_invpcid = -1;
1302 #endif
1303 
/*
 * Spectre v2 mitigation strategies.  The strategy currently selected is
 * recorded in x86_spectrev2_mitigation.
 */
typedef enum {
        X86_SPECTREV2_RETPOLINE = 0,
        X86_SPECTREV2_RETPOLINE_AMD,
        X86_SPECTREV2_ENHANCED_IBRS,
        X86_SPECTREV2_DISABLED
} x86_spectrev2_mitigation_t;
1310 
1311 uint_t x86_disable_spectrev2 = 0;
1312 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1313     X86_SPECTREV2_RETPOLINE;
1314 
/*
 * Which mitigation, if any, is in effect for TSX Asynchronous Abort (TAA).
 */
typedef enum {
        /* No mitigation is available for TAA side-channels. */
        X86_TAA_NOTHING = 0,
        /* Mitigation has been disabled via x86_disable_taa. */
        X86_TAA_DISABLED,
        /* The MDS mitigation also suffices for TAA. */
        X86_TAA_MD_CLEAR,
        /* Transactions are forced to abort. */
        X86_TAA_TSX_FORCE_ABORT,
        /* Transactions are forced to abort and TSX is hidden from CPUID. */
        X86_TAA_TSX_DISABLE,
        /* TSX is potentially active, but the hardware is not TAA-vulnerable. */
        X86_TAA_HW_MITIGATED
} x86_taa_mitigation_t;
1332 
1333 uint_t x86_disable_taa = 0;
1334 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1335 
1336 uint_t pentiumpro_bug4046376;
1337 
1338 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1339 
1340 static char *x86_feature_names[NUM_X86_FEATURES] = {
1341         "lgpg",
1342         "tsc",
1343         "msr",
1344         "mtrr",
1345         "pge",
1346         "de",
1347         "cmov",
1348         "mmx",
1349         "mca",
1350         "pae",
1351         "cv8",
1352         "pat",
1353         "sep",
1354         "sse",
1355         "sse2",
1356         "htt",
1357         "asysc",
1358         "nx",
1359         "sse3",
1360         "cx16",
1361         "cmp",
1362         "tscp",
1363         "mwait",
1364         "sse4a",
1365         "cpuid",
1366         "ssse3",
1367         "sse4_1",
1368         "sse4_2",
1369         "1gpg",
1370         "clfsh",
1371         "64",
1372         "aes",
1373         "pclmulqdq",
1374         "xsave",
1375         "avx",
1376         "vmx",
1377         "svm",
1378         "topoext",
1379         "f16c",
1380         "rdrand",
1381         "x2apic",
1382         "avx2",
1383         "bmi1",
1384         "bmi2",
1385         "fma",
1386         "smep",
1387         "smap",
1388         "adx",
1389         "rdseed",
1390         "mpx",
1391         "avx512f",
1392         "avx512dq",
1393         "avx512pf",
1394         "avx512er",
1395         "avx512cd",
1396         "avx512bw",
1397         "avx512vl",
1398         "avx512fma",
1399         "avx512vbmi",
1400         "avx512_vpopcntdq",
1401         "avx512_4vnniw",
1402         "avx512_4fmaps",
1403         "xsaveopt",
1404         "xsavec",
1405         "xsaves",
1406         "sha",
1407         "umip",
1408         "pku",
1409         "ospke",
1410         "pcid",
1411         "invpcid",
1412         "ibrs",
1413         "ibpb",
1414         "stibp",
1415         "ssbd",
1416         "ssbd_virt",
1417         "rdcl_no",
1418         "ibrs_all",
1419         "rsba",
1420         "ssb_no",
1421         "stibp_all",
1422         "flush_cmd",
1423         "l1d_vmentry_no",
1424         "fsgsbase",
1425         "clflushopt",
1426         "clwb",
1427         "monitorx",
1428         "clzero",
1429         "xop",
1430         "fma4",
1431         "tbm",
1432         "avx512_vnni",
1433         "amd_pcec",
1434         "mb_clear",
1435         "mds_no",
1436         "core_thermal",
1437         "pkg_thermal",
1438         "tsx_ctrl",
1439         "taa_no"
1440 };
1441 
1442 boolean_t
1443 is_x86_feature(void *featureset, uint_t feature)
1444 {
1445         ASSERT(feature < NUM_X86_FEATURES);
1446         return (BT_TEST((ulong_t *)featureset, feature));
1447 }
1448 
1449 void
1450 add_x86_feature(void *featureset, uint_t feature)
1451 {
1452         ASSERT(feature < NUM_X86_FEATURES);
1453         BT_SET((ulong_t *)featureset, feature);
1454 }
1455 
1456 void
1457 remove_x86_feature(void *featureset, uint_t feature)
1458 {
1459         ASSERT(feature < NUM_X86_FEATURES);
1460         BT_CLEAR((ulong_t *)featureset, feature);
1461 }
1462 
1463 boolean_t
1464 compare_x86_featureset(void *setA, void *setB)
1465 {
1466         /*
1467          * We assume that the unused bits of the bitmap are always zero.
1468          */
1469         if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1470                 return (B_TRUE);
1471         } else {
1472                 return (B_FALSE);
1473         }
1474 }
1475 
1476 void
1477 print_x86_featureset(void *featureset)
1478 {
1479         uint_t i;
1480 
1481         for (i = 0; i < NUM_X86_FEATURES; i++) {
1482                 if (is_x86_feature(featureset, i)) {
1483                         cmn_err(CE_CONT, "?x86_feature: %s\n",
1484                             x86_feature_names[i]);
1485                 }
1486         }
1487 }
1488 
1489 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1490 static size_t xsave_state_size = 0;
1491 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1492 boolean_t xsave_force_disable = B_FALSE;
1493 extern int disable_smap;
1494 
1495 /*
1496  * This is set to platform type we are running on.
1497  */
1498 static int platform_type = -1;
1499 
1500 #if !defined(__xpv)
1501 /*
1502  * Variable to patch if hypervisor platform detection needs to be
1503  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1504  */
1505 int enable_platform_detection = 1;
1506 #endif
1507 
1508 /*
1509  * monitor/mwait info.
1510  *
1511  * size_actual and buf_actual are the real address and size allocated to get
1512  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1513  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1514  * processor cache-line alignment, but this is not guaranteed in the future.
1515  */
1516 struct mwait_info {
1517         size_t          mon_min;        /* min size to avoid missed wakeups */
1518         size_t          mon_max;        /* size to avoid false wakeups */
1519         size_t          size_actual;    /* size actually allocated */
1520         void            *buf_actual;    /* memory actually allocated */
1521         uint32_t        support;        /* processor support of monitor/mwait */
1522 };
1523 
1524 /*
1525  * xsave/xrestor info.
1526  *
1527  * This structure contains HW feature bits and the size of the xsave save area.
1528  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1529  * (xsave_state) to describe the xsave layout. However, at runtime the
1530  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1531  * xsave_state structure simply represents the legacy layout of the beginning
1532  * of the xsave area.
1533  */
1534 struct xsave_info {
1535         uint32_t        xsav_hw_features_low;   /* Supported HW features */
1536         uint32_t        xsav_hw_features_high;  /* Supported HW features */
1537         size_t          xsav_max_size;  /* max size save area for HW features */
1538         size_t          ymm_size;       /* AVX: size of ymm save area */
1539         size_t          ymm_offset;     /* AVX: offset for ymm save area */
1540         size_t          bndregs_size;   /* MPX: size of bndregs save area */
1541         size_t          bndregs_offset; /* MPX: offset for bndregs save area */
1542         size_t          bndcsr_size;    /* MPX: size of bndcsr save area */
1543         size_t          bndcsr_offset;  /* MPX: offset for bndcsr save area */
1544         size_t          opmask_size;    /* AVX512: size of opmask save */
1545         size_t          opmask_offset;  /* AVX512: offset for opmask save */
1546         size_t          zmmlo_size;     /* AVX512: size of zmm 256 save */
1547         size_t          zmmlo_offset;   /* AVX512: offset for zmm 256 save */
1548         size_t          zmmhi_size;     /* AVX512: size of zmm hi reg save */
1549         size_t          zmmhi_offset;   /* AVX512: offset for zmm hi reg save */
1550 };
1551 
1552 
1553 /*
1554  * These constants determine how many of the elements of the
1555  * cpuid we cache in the cpuid_info data structure; the
1556  * remaining elements are accessible via the cpuid instruction.
1557  */
1558 
1559 #define NMAX_CPI_STD    8               /* eax = 0 .. 7 */
1560 #define NMAX_CPI_EXTD   0x1f            /* eax = 0x80000000 .. 0x8000001e */
1561 
1562 /*
1563  * See the big theory statement for a more detailed explanation of what some of
1564  * these members mean.
1565  */
1566 struct cpuid_info {
1567         uint_t cpi_pass;                /* last pass completed */
1568         /*
1569          * standard function information
1570          */
1571         uint_t cpi_maxeax;              /* fn 0: %eax */
1572         char cpi_vendorstr[13];         /* fn 0: %ebx:%ecx:%edx */
1573         uint_t cpi_vendor;              /* enum of cpi_vendorstr */
1574 
1575         uint_t cpi_family;              /* fn 1: extended family */
1576         uint_t cpi_model;               /* fn 1: extended model */
1577         uint_t cpi_step;                /* fn 1: stepping */
1578         chipid_t cpi_chipid;            /* fn 1: %ebx:  Intel: chip # */
1579                                         /*              AMD: package/socket # */
1580         uint_t cpi_brandid;             /* fn 1: %ebx: brand ID */
1581         int cpi_clogid;                 /* fn 1: %ebx: thread # */
1582         uint_t cpi_ncpu_per_chip;       /* fn 1: %ebx: logical cpu count */
1583         uint8_t cpi_cacheinfo[16];      /* fn 2: intel-style cache desc */
1584         uint_t cpi_ncache;              /* fn 2: number of elements */
1585         uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1586         id_t cpi_last_lvl_cacheid;      /* fn 4: %eax: derived cache id */
1587         uint_t cpi_cache_leaf_size;     /* Number of cache elements */
1588                                         /* Intel fn: 4, AMD fn: 8000001d */
1589         struct cpuid_regs **cpi_cache_leaves;   /* Actual leaves from above */
1590         struct cpuid_regs cpi_std[NMAX_CPI_STD];        /* 0 .. 7 */
1591         /*
1592          * extended function information
1593          */
1594         uint_t cpi_xmaxeax;             /* fn 0x80000000: %eax */
1595         char cpi_brandstr[49];          /* fn 0x8000000[234] */
1596         uint8_t cpi_pabits;             /* fn 0x80000006: %eax */
1597         uint8_t cpi_vabits;             /* fn 0x80000006: %eax */
1598         uint8_t cpi_fp_amd_save;        /* AMD: FP error pointer save rqd. */
1599         struct  cpuid_regs cpi_extd[NMAX_CPI_EXTD];     /* 0x800000XX */
1600 
1601         id_t cpi_coreid;                /* same coreid => strands share core */
1602         int cpi_pkgcoreid;              /* core number within single package */
1603         uint_t cpi_ncore_per_chip;      /* AMD: fn 0x80000008: %ecx[7-0] */
1604                                         /* Intel: fn 4: %eax[31-26] */
1605 
1606         /*
1607          * These values represent the number of bits that are required to store
1608          * information about the number of cores and threads.
1609          */
1610         uint_t cpi_ncore_bits;
1611         uint_t cpi_nthread_bits;
1612         /*
1613          * supported feature information
1614          */
1615         uint32_t cpi_support[6];
1616 #define STD_EDX_FEATURES        0
1617 #define AMD_EDX_FEATURES        1
1618 #define TM_EDX_FEATURES         2
1619 #define STD_ECX_FEATURES        3
1620 #define AMD_ECX_FEATURES        4
1621 #define STD_EBX_FEATURES        5
1622         /*
1623          * Synthesized information, where known.
1624          */
1625         uint32_t cpi_chiprev;           /* See X86_CHIPREV_* in x86_archext.h */
1626         const char *cpi_chiprevstr;     /* May be NULL if chiprev unknown */
1627         uint32_t cpi_socket;            /* Chip package/socket type */
1628 
1629         struct mwait_info cpi_mwait;    /* fn 5: monitor/mwait info */
1630         uint32_t cpi_apicid;
1631         uint_t cpi_procnodeid;          /* AMD: nodeID on HT, Intel: chipid */
1632         uint_t cpi_procnodes_per_pkg;   /* AMD: # of nodes in the package */
1633                                         /* Intel: 1 */
1634         uint_t cpi_compunitid;          /* AMD: ComputeUnit ID, Intel: coreid */
1635         uint_t cpi_cores_per_compunit;  /* AMD: # of cores in the ComputeUnit */
1636 
1637         struct xsave_info cpi_xsave;    /* fn D: xsave/xrestor info */
1638 };
1639 
1640 
1641 static struct cpuid_info cpuid_info0;
1642 
1643 /*
1644  * These bit fields are defined by the Intel Application Note AP-485
1645  * "Intel Processor Identification and the CPUID Instruction"
1646  */
1647 #define CPI_FAMILY_XTD(cpi)     BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1648 #define CPI_MODEL_XTD(cpi)      BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1649 #define CPI_TYPE(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1650 #define CPI_FAMILY(cpi)         BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1651 #define CPI_STEP(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1652 #define CPI_MODEL(cpi)          BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1653 
1654 #define CPI_FEATURES_EDX(cpi)           ((cpi)->cpi_std[1].cp_edx)
1655 #define CPI_FEATURES_ECX(cpi)           ((cpi)->cpi_std[1].cp_ecx)
1656 #define CPI_FEATURES_XTD_EDX(cpi)       ((cpi)->cpi_extd[1].cp_edx)
1657 #define CPI_FEATURES_XTD_ECX(cpi)       ((cpi)->cpi_extd[1].cp_ecx)
1658 #define CPI_FEATURES_7_0_EBX(cpi)       ((cpi)->cpi_std[7].cp_ebx)
1659 #define CPI_FEATURES_7_0_ECX(cpi)       ((cpi)->cpi_std[7].cp_ecx)
1660 #define CPI_FEATURES_7_0_EDX(cpi)       ((cpi)->cpi_std[7].cp_edx)
1661 
/*
 * Fields of cpuid leaf 1 %ebx.  Each field occupies its own byte; the
 * CLFLUSH line-size ("chunks") field is bits 15:8 and must not overlap
 * bit 7, which belongs to the brand ID.
 */
#define CPI_BRANDID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
#define CPI_CHUNKS(cpi)         BITX((cpi)->cpi_std[1].cp_ebx, 15, 8)
#define CPI_CPU_COUNT(cpi)      BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
#define CPI_APIC_ID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1666 
1667 #define CPI_MAXEAX_MAX          0x100           /* sanity control */
1668 #define CPI_XMAXEAX_MAX         0x80000100
1669 #define CPI_FN4_ECX_MAX         0x20            /* sanity: max fn 4 levels */
1670 #define CPI_FNB_ECX_MAX         0x20            /* sanity: max fn B levels */
1671 
1672 /*
1673  * Function 4 (Deterministic Cache Parameters) macros
1674  * Defined by Intel Application Note AP-485
1675  */
1676 #define CPI_NUM_CORES(regs)             BITX((regs)->cp_eax, 31, 26)
1677 #define CPI_NTHR_SHR_CACHE(regs)        BITX((regs)->cp_eax, 25, 14)
1678 #define CPI_FULL_ASSOC_CACHE(regs)      BITX((regs)->cp_eax, 9, 9)
1679 #define CPI_SELF_INIT_CACHE(regs)       BITX((regs)->cp_eax, 8, 8)
1680 #define CPI_CACHE_LVL(regs)             BITX((regs)->cp_eax, 7, 5)
1681 #define CPI_CACHE_TYPE(regs)            BITX((regs)->cp_eax, 4, 0)
1682 #define CPI_CPU_LEVEL_TYPE(regs)        BITX((regs)->cp_ecx, 15, 8)
1683 
1684 #define CPI_CACHE_WAYS(regs)            BITX((regs)->cp_ebx, 31, 22)
1685 #define CPI_CACHE_PARTS(regs)           BITX((regs)->cp_ebx, 21, 12)
1686 #define CPI_CACHE_COH_LN_SZ(regs)       BITX((regs)->cp_ebx, 11, 0)
1687 
1688 #define CPI_CACHE_SETS(regs)            BITX((regs)->cp_ecx, 31, 0)
1689 
1690 #define CPI_PREFCH_STRIDE(regs)         BITX((regs)->cp_edx, 9, 0)
1691 
1692 
1693 /*
1694  * A couple of shorthand macros to identify "later" P6-family chips
1695  * like the Pentium M and Core.  First, the "older" P6-based stuff
1696  * (loosely defined as "pre-Pentium-4"):
1697  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1698  */
/*
 * Note: the cpi argument is parenthesized at every use so that any pointer
 * expression (e.g. &info) can be passed, not just a plain identifier.
 */
#define IS_LEGACY_P6(cpi) (                     \
        (cpi)->cpi_family == 6 &&               \
                ((cpi)->cpi_model == 1 ||       \
                (cpi)->cpi_model == 3 ||        \
                (cpi)->cpi_model == 5 ||        \
                (cpi)->cpi_model == 6 ||        \
                (cpi)->cpi_model == 7 ||        \
                (cpi)->cpi_model == 8 ||        \
                (cpi)->cpi_model == 0xA ||      \
                (cpi)->cpi_model == 0xB)        \
)

/* A "new F6" is everything with family 6 that's not the above */
#define IS_NEW_F6(cpi) (((cpi)->cpi_family == 6) && !IS_LEGACY_P6(cpi))

/* Extended family/model support */
#define IS_EXTENDED_MODEL_INTEL(cpi) ((cpi)->cpi_family == 0x6 || \
        (cpi)->cpi_family >= 0xf)
1717 
1718 /*
1719  * Info for monitor/mwait idle loop.
1720  *
1721  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1722  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1723  * 2006.
1724  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1725  * Documentation Updates" #33633, Rev 2.05, December 2006.
1726  */
#define MWAIT_SUPPORT           (0x00000001)    /* mwait supported */
#define MWAIT_EXTENSIONS        (0x00000002)    /* extension supported */
#define MWAIT_ECX_INT_ENABLE    (0x00000004)    /* ecx 1 extension supported */
#define MWAIT_SUPPORTED(cpi)    ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
#define MWAIT_INT_ENABLE(cpi)   ((cpi)->cpi_std[5].cp_ecx & 0x2)
#define MWAIT_EXTENSION(cpi)    ((cpi)->cpi_std[5].cp_ecx & 0x1)
#define MWAIT_SIZE_MIN(cpi)     BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
#define MWAIT_SIZE_MAX(cpi)     BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
/*
 * Number of sub-cstates for a given c-state.  Extracts the 4-bit field of
 * leaf 5 %edx whose lowest bit is c_state.  The argument is parenthesized so
 * that an arbitrary expression selects the intended bit range.
 */
#define MWAIT_NUM_SUBC_STATES(cpi, c_state)                     \
        BITX((cpi)->cpi_std[5].cp_edx, (c_state) + 3, (c_state))
1740 
1741 /*
1742  * XSAVE leaf 0xD enumeration
1743  */
1744 #define CPUID_LEAFD_2_YMM_OFFSET        576
1745 #define CPUID_LEAFD_2_YMM_SIZE          256
1746 
1747 /*
1748  * Common extended leaf names to cut down on typos.
1749  */
1750 #define CPUID_LEAF_EXT_0                0x80000000
1751 #define CPUID_LEAF_EXT_8                0x80000008
1752 #define CPUID_LEAF_EXT_1d               0x8000001d
1753 #define CPUID_LEAF_EXT_1e               0x8000001e
1754 
1755 /*
1756  * Functions we consume from cpuid_subr.c;  don't publish these in a header
1757  * file to try and keep people using the expected cpuid_* interfaces.
1758  */
1759 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1760 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1761 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1762 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1763 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1764 
1765 /*
1766  * Apply various platform-dependent restrictions where the
1767  * underlying platform restrictions mean the CPU can be marked
1768  * as less capable than its cpuid instruction would imply.
1769  */
1770 #if defined(__xpv)
1771 static void
1772 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1773 {
1774         switch (eax) {
1775         case 1: {
1776                 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1777                     0 : CPUID_INTC_EDX_MCA;
1778                 cp->cp_edx &=
1779                     ~(mcamask |
1780                     CPUID_INTC_EDX_PSE |
1781                     CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1782                     CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1783                     CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1784                     CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1785                     CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1786                 break;
1787         }
1788 
1789         case 0x80000001:
1790                 cp->cp_edx &=
1791                     ~(CPUID_AMD_EDX_PSE |
1792                     CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1793                     CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1794                     CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1795                     CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1796                     CPUID_AMD_EDX_TSCP);
1797                 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1798                 break;
1799         default:
1800                 break;
1801         }
1802 
1803         switch (vendor) {
1804         case X86_VENDOR_Intel:
1805                 switch (eax) {
1806                 case 4:
1807                         /*
1808                          * Zero out the (ncores-per-chip - 1) field
1809                          */
1810                         cp->cp_eax &= 0x03fffffff;
1811                         break;
1812                 default:
1813                         break;
1814                 }
1815                 break;
1816         case X86_VENDOR_AMD:
1817                 switch (eax) {
1818 
1819                 case 0x80000001:
1820                         cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1821                         break;
1822 
1823                 case CPUID_LEAF_EXT_8:
1824                         /*
1825                          * Zero out the (ncores-per-chip - 1) field
1826                          */
1827                         cp->cp_ecx &= 0xffffff00;
1828                         break;
1829                 default:
1830                         break;
1831                 }
1832                 break;
1833         default:
1834                 break;
1835         }
1836 }
1837 #else
1838 #define platform_cpuid_mangle(vendor, eax, cp)  /* nothing */
1839 #endif
1840 
1841 /*
1842  * Some undocumented ways of patching the results of the cpuid
1843  * instruction to permit running Solaris 10 on future cpus that
1844  * we don't currently support.  These default to zero and could be set
1845  * to non-zero values via settings in eeprom.
1846  */
1847 
1848 uint32_t cpuid_feature_ecx_include;     /* NOTE(review): consumers not shown here */
1849 uint32_t cpuid_feature_ecx_exclude;
1850 uint32_t cpuid_feature_edx_include;
1851 uint32_t cpuid_feature_edx_exclude;
1852 
1853 /*
1854  * Allocate zeroed space for mcpu_cpi in the machcpu structure of a non-boot CPU.
1855  */
1856 void
1857 cpuid_alloc_space(cpu_t *cpu)
1858 {
1859         /*
1860          * By convention, cpu0 is the boot cpu, which is set up
1861          * before memory allocation is available.  All other cpus get
1862          * their cpuid_info struct allocated here.
1863          */
1864         ASSERT(cpu->cpu_id != 0);
1865         ASSERT(cpu->cpu_m.mcpu_cpi == NULL);    /* must not already be allocated */
1866         cpu->cpu_m.mcpu_cpi =
1867             kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1868 }
1869 
1870 void            /* frees cpu's cpuid_info and clears the mcpu_cpi pointer */
1871 cpuid_free_space(cpu_t *cpu)
1872 {
1873         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1874         int i;
1875 
1876         ASSERT(cpi != NULL);
1877         ASSERT(cpi != &cpuid_info0);    /* cpuid_info0 must never be freed here */
1878 
1879         /*
1880          * Free up any cache leaf related dynamic storage. The first entry was
1881          * cached from the standard cpuid storage, so we should not free it.
1882          */
1883         for (i = 1; i < cpi->cpi_cache_leaf_size; i++)  /* starts at 1: skip entry 0 */
1884                 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1885         if (cpi->cpi_cache_leaf_size > 0)       /* then the pointer array itself */
1886                 kmem_free(cpi->cpi_cache_leaves,
1887                     cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1888 
1889         kmem_free(cpi, sizeof (*cpi));
1890         cpu->cpu_m.mcpu_cpi = NULL;
1891 }
1892 
1893 #if !defined(__xpv)
1894 /*
1895  * Determine the type of the underlying platform. This is used to customize
1896  * initialization of various subsystems (e.g. TSC). determine_platform() must
1897  * only ever be called once to prevent two processors from seeing different
1898  * values of platform_type. Must be called before cpuid_pass1(), the earliest
1899  * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1900  */
1901 void
1902 determine_platform(void)
1903 {
1904         struct cpuid_regs cp;
1905         uint32_t base;
1906         uint32_t regs[4];
1907         char *hvstr = (char *)regs;     /* signature registers viewed as a string */
1908 
1909         ASSERT(platform_type == -1);    /* -1: not yet determined */
1910 
1911         platform_type = HW_NATIVE;      /* default unless a hypervisor is found */
1912 
1913         if (!enable_platform_detection)
1914                 return;
1915 
1916         /*
1917          * If Hypervisor CPUID bit is set, try to determine hypervisor
1918          * vendor signature, and set platform type accordingly.
1919          *
1920          * References:
1921          * http://lkml.org/lkml/2008/10/1/246
1922          * http://kb.vmware.com/kb/1009458
1923          */
1924         cp.cp_eax = 0x1;
1925         (void) __cpuid_insn(&cp);
1926         if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1927                 cp.cp_eax = 0x40000000;
1928                 (void) __cpuid_insn(&cp);
1929                 regs[0] = cp.cp_ebx;
1930                 regs[1] = cp.cp_ecx;
1931                 regs[2] = cp.cp_edx;
1932                 regs[3] = 0;    /* NUL-terminates the 12-byte signature */
1933                 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1934                         platform_type = HW_XEN_HVM;
1935                         return;
1936                 }
1937                 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1938                         platform_type = HW_VMWARE;
1939                         return;
1940                 }
1941                 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1942                         platform_type = HW_KVM;
1943                         return;
1944                 }
1945                 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1946                         platform_type = HW_BHYVE;
1947                         return;
1948                 }
1949                 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1950                         platform_type = HW_MICROSOFT;   /* no return: Xen probe below still runs */
1951         } else {
1952                 /*
1953                  * Check older VMware hardware versions. VMware hypervisor is
1954                  * detected by performing an IN operation to VMware hypervisor
1955                  * port and checking that value returned in %ebx is VMware
1956                  * hypervisor magic value.
1957                  *
1958                  * References: http://kb.vmware.com/kb/1009458
1959                  */
1960                 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1961                 if (regs[1] == VMWARE_HVMAGIC) {
1962                         platform_type = HW_VMWARE;
1963                         return;
1964                 }
1965         }
1966 
1967         /*
1968          * Check Xen hypervisor. In a fully virtualized domain,
1969          * Xen's pseudo-cpuid function returns a string representing the
1970          * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1971          * supported cpuid function. We need at least a (base + 2) leaf value
1972          * to do what we want to do. Try different base values, since the
1973          * hypervisor might use a different one depending on whether Hyper-V
1974          * emulation is switched on by default or not.
1975          */
1976         for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1977                 cp.cp_eax = base;
1978                 (void) __cpuid_insn(&cp);
1979                 regs[0] = cp.cp_ebx;
1980                 regs[1] = cp.cp_ecx;
1981                 regs[2] = cp.cp_edx;
1982                 regs[3] = 0;
1983                 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1984                     cp.cp_eax >= (base + 2)) {
1985                         platform_type &= ~HW_NATIVE;    /* drop the native default */
1986                         platform_type |= HW_XEN_HVM;
1987                         return;
1988                 }
1989         }
1990 }
1991 
1992 int             /* returns platform_type as set by determine_platform() */
1993 get_hwenv(void)
1994 {
1995         ASSERT(platform_type != -1);    /* determine_platform() must have run */
1996         return (platform_type);
1997 }
1998 
1999 int             /* always 0 in the !__xpv build */
2000 is_controldom(void)
2001 {
2002         return (0);
2003 }
2004 
2005 #else
2006 
2007 int             /* __xpv build: the platform is always reported as Xen PV */
2008 get_hwenv(void)
2009 {
2010         return (HW_XEN_PV);
2011 }
2012 
2013 int             /* __xpv build: non-zero when running as the initial domain */
2014 is_controldom(void)
2015 {
2016         return (DOMAIN_IS_INITDOMAIN(xen_info));
2017 }
2018 
2019 #endif  /* __xpv */
2020 
2021 /*
2022  * Make sure that we have gathered all of the CPUID leaves that we might need to
2023  * determine topology. We assume that the standard leaf 1 has already been done
2024  * and that xmaxeax has already been calculated.
2025  */
2026 static void
2027 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2028 {
2029         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2030 
2031         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2032                 struct cpuid_regs *cp;
2033 
2034                 cp = &cpi->cpi_extd[8];         /* results are cached in cpi_extd[] */
2035                 cp->cp_eax = CPUID_LEAF_EXT_8;
2036                 (void) __cpuid_insn(cp);
2037                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2038         }
2039 
2040         if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2041             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2042                 struct cpuid_regs *cp;
2043 
2044                 cp = &cpi->cpi_extd[0x1e];      /* stored unmangled, unlike leaf 8 */
2045                 cp->cp_eax = CPUID_LEAF_EXT_1e;
2046                 (void) __cpuid_insn(cp);
2047         }
2048 }
2049 
2050 /*
2051  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2052  * it to everything else. If not, and we're on an AMD system where 8000001e is
2053  * valid, then we use that. Otherwise, we fall back to the default value for the
2054  * APIC ID in leaf 1.
2055  */
2056 static uint32_t
2057 cpuid_gather_apicid(struct cpuid_info *cpi)
2058 {
2059         /*
2060          * Leaf B changes based on the arguments to it. Because we don't cache
2061          * it, we need to gather it again.
2062          */
2063         if (cpi->cpi_maxeax >= 0xB) {
2064                 struct cpuid_regs regs;
2065                 struct cpuid_regs *cp;
2066 
2067                 cp = &regs;
2068                 cp->cp_eax = 0xB;
2069                 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;       /* %ecx = 0: sub-leaf 0 */
2070                 (void) __cpuid_insn(cp);
2071 
2072                 if (cp->cp_ebx != 0) {          /* non-zero %ebx: leaf B is valid */
2073                         return (cp->cp_edx);
2074                 }
2075         }
2076 
2077         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2078             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2079             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2080                 return (cpi->cpi_extd[0x1e].cp_eax);    /* leaf must already be cached */
2081         }
2082 
2083         return (CPI_APIC_ID(cpi));
2084 }
2085 
2086 /*
2087  * For AMD processors, attempt to calculate the number of chips and cores that
2088  * exist. The way that we do this varies based on the generation, because the
2089  * generations themselves have changed dramatically.
2090  *
2091  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2092  * However, with the advent of family 17h (Zen) it actually tells us the number
2093  * of threads, so we need to look at leaf 0x8000001e if available to determine
2094  * its value. Otherwise, for all prior families, the number of enabled cores is
2095  * the same as threads.
2096  *
2097  * If we do not have leaf 0x80000008, the code falls back to CPI_CPU_COUNT()
2098  * when the leaf 1 HTT bit is set, and to one otherwise. (AMD's older CPUID
2099  * specification says there's no reason to fall back to leaf 1.)
2100  *
2101  * In some virtualization cases we will not have leaf 8000001e or it will be
2102  * zero. When that happens we assume the number of threads per core is one.
2103  */
2104 static void
2105 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2106 {
2107         uint_t nthreads, nthread_per_core;
2108 
2109         nthreads = nthread_per_core = 1;
2110 
2111         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2112                 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;     /* field is count - 1 */
2113         } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2114                 nthreads = CPI_CPU_COUNT(cpi);
2115         }
2116 
2117         /*
2118          * For us to have threads, and know about it, we have to be at least at
2119          * family 17h and have the cpuid bit that says we have extended
2120          * topology.
2121          */
2122         if (cpi->cpi_family >= 0x17 &&
2123             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2124             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2125                 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2126         }
2127 
2128         *ncpus = nthreads;                      /* out: logical CPUs */
2129         *ncores = nthreads / nthread_per_core;  /* out: cores; divisor is >= 1 */
2130 }
2131 
2132 /*
2133  * Seed the initial values for the cores and threads for an Intel based
2134  * processor. These values will be overwritten if we detect that the processor
2135  * supports CPUID leaf 0xb.
2136  */
2137 static void
2138 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2139 {
2140         /*
2141          * Only seed the number of physical cores from the first level leaf 4
2142          * information. The number of threads there indicate how many share the
2143          * L1 cache, which may or may not have anything to do with the number of
2144          * logical CPUs per core.
2145          */
2146         if (cpi->cpi_maxeax >= 4) {
2147                 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;     /* field is count - 1 */
2148         } else {
2149                 *ncores = 1;
2150         }
2151 
2152         if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2153                 *ncpus = CPI_CPU_COUNT(cpi);
2154         } else {
2155                 *ncpus = *ncores;       /* no HTT: one logical CPU per core */
2156         }
2157 }
2158 
2159 static boolean_t        /* B_TRUE if leaf 0xB was valid and the topology ids were set */
2160 cpuid_leafB_getids(cpu_t *cpu)
2161 {
2162         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2163         struct cpuid_regs regs;
2164         struct cpuid_regs *cp;
2165 
2166         if (cpi->cpi_maxeax < 0xB)
2167                 return (B_FALSE);
2168 
2169         cp = &regs;
2170         cp->cp_eax = 0xB;
2171         cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2172 
2173         (void) __cpuid_insn(cp);
2174 
2175         /*
2176          * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2177          * indicates that the extended topology enumeration leaf is
2178          * available.
2179          */
2180         if (cp->cp_ebx != 0) {
2181                 uint32_t x2apic_id = 0;
2182                 uint_t coreid_shift = 0;
2183                 uint_t ncpu_per_core = 1;
2184                 uint_t chipid_shift = 0;
2185                 uint_t ncpu_per_chip = 1;
2186                 uint_t i;
2187                 uint_t level;
2188 
2189                 for (i = 0; i < CPI_FNB_ECX_MAX; i++) { /* walk each sub-leaf */
2190                         cp->cp_eax = 0xB;
2191                         cp->cp_ecx = i;
2192 
2193                         (void) __cpuid_insn(cp);
2194                         level = CPI_CPU_LEVEL_TYPE(cp);
2195 
2196                         if (level == 1) {               /* thread (SMT) level */
2197                                 x2apic_id = cp->cp_edx;
2198                                 coreid_shift = BITX(cp->cp_eax, 4, 0);
2199                                 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2200                         } else if (level == 2) {        /* core level */
2201                                 x2apic_id = cp->cp_edx;
2202                                 chipid_shift = BITX(cp->cp_eax, 4, 0);
2203                                 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2204                         }
2205                 }
2206 
2207                 /*
2208                  * cpi_apicid is taken care of in cpuid_gather_apicid. A zero thread count is treated as one so we never divide by zero.
2209                  */
2210                 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2211                 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2212                     (ncpu_per_core != 0 ? ncpu_per_core : 1);
2213                 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2214                 cpi->cpi_clogid = x2apic_id & ((1U << chipid_shift) - 1);       /* 1U: shift may be 31 */
2215                 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2216                 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2217                 cpi->cpi_procnodeid = cpi->cpi_chipid;
2218                 cpi->cpi_compunitid = cpi->cpi_coreid;
2219 
2220                 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2221                         cpi->cpi_nthread_bits = coreid_shift;
2222                         cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2223                 }
2224 
2225                 return (B_TRUE);
2226         } else {
2227                 return (B_FALSE);
2228         }
2229 }
2230 
2231 static void             /* fills in the chip/core/clog ids in cpu's cpuid_info (Intel) */
2232 cpuid_intel_getids(cpu_t *cpu, void *feature)
2233 {
2234         uint_t i;
2235         uint_t chipid_shift = 0;
2236         uint_t coreid_shift = 0;
2237         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2238 
2239         /*
2240          * There are no compute units or processor nodes currently on Intel.
2241          * Always set these to one.
2242          */
2243         cpi->cpi_procnodes_per_pkg = 1;
2244         cpi->cpi_cores_per_compunit = 1;
2245 
2246         /*
2247          * If cpuid Leaf B is present, use that to try and get this information.
2248          * It will be the most accurate for Intel CPUs.
2249          */
2250         if (cpuid_leafB_getids(cpu))
2251                 return;
2252 
2253         /*
2254          * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2255          * and ncore_per_chip. These represent the largest power of two values
2256          * that we need to cover all of the IDs in the system. Therefore, we use
2257          * those values to seed the number of bits needed to cover information
2258          * in the case when leaf B is not available. These values will probably
2259          * be larger than required, but that's OK.
2260          */
2261         cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2262         cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2263 
2264         for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)        /* ceil(log2(ncpu_per_chip)) */
2265                 chipid_shift++;
2266 
2267         cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2268         cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2269 
2270         if (is_x86_feature(feature, X86FSET_CMP)) {
2271                 /*
2272                  * Multi-core (and possibly multi-threaded)
2273                  * processors.
2274                  */
2275                 uint_t ncpu_per_core = 1;       /* defined even if ncore_per_chip is 0 */
2276                 if (cpi->cpi_ncore_per_chip == 1)
2277                         ncpu_per_core = cpi->cpi_ncpu_per_chip;
2278                 else if (cpi->cpi_ncore_per_chip > 1)
2279                         ncpu_per_core = cpi->cpi_ncpu_per_chip /
2280                             cpi->cpi_ncore_per_chip;
2281                 /*
2282                  * 8bit APIC IDs on dual core Pentiums
2283                  * look like this:
2284                  *
2285                  * +-----------------------+------+------+
2286                  * | Physical Package ID   |  MC  |  HT  |
2287                  * +-----------------------+------+------+
2288                  * <------- chipid -------->
2289                  * <------- coreid --------------->
2290                  *                         <--- clogid -->
2291                  *                         <------>
2292                  *                         pkgcoreid
2293                  *
2294                  * Where the number of bits necessary to
2295                  * represent MC and HT fields together equals
2296                  * to the minimum number of bits necessary to
2297                  * store the value of cpi->cpi_ncpu_per_chip.
2298                  * Of those bits, the MC part uses the number
2299                  * of bits necessary to store the value of
2300                  * cpi->cpi_ncore_per_chip.
2301                  */
2302                 for (i = 1; i < ncpu_per_core; i <<= 1) /* width of the HT field */
2303                         coreid_shift++;
2304                 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2305                 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2306         } else if (is_x86_feature(feature, X86FSET_HTT)) {
2307                 /*
2308                  * Single-core multi-threaded processors.
2309                  */
2310                 cpi->cpi_coreid = cpi->cpi_chipid;
2311                 cpi->cpi_pkgcoreid = 0;
2312         } else {
2313                 /*
2314                  * Single-core single-thread processors.
2315                  */
2316                 cpi->cpi_coreid = cpu->cpu_id;
2317                 cpi->cpi_pkgcoreid = 0;
2318         }
2319         cpi->cpi_procnodeid = cpi->cpi_chipid;
2320         cpi->cpi_compunitid = cpi->cpi_coreid;
2321 }
2322 
2323 /*
2324  * Historically, AMD has had CMP chips with only a single thread per core.
2325  * However, starting in family 17h (Zen), this has changed and they now have
2326  * multiple threads. Our internal core id needs to be a unique value.
2327  *
2328  * To determine the core id of an AMD system, if we're from a family before 17h,
2329  * then we just use the cpu id, as that gives us a good value that will be
2330  * unique for each core. If instead, we're on family 17h or later, then we need
2331  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2332  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2333  * We can't use the normal core id in that leaf as it's only unique within the
2334  * socket, which is perfect for cpi_pkgcoreid, but not us.
2335  */
2336 static id_t
2337 cpuid_amd_get_coreid(cpu_t *cpu)
2338 {
2339         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2340 
2341         if (cpi->cpi_family >= 0x17 &&
2342             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2343             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2344                 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2345                 if (nthreads > 1) {
2346                         VERIFY3U(nthreads, ==, 2);      /* the 1-bit shift below assumes 2 */
2347                         return (cpi->cpi_apicid >> 1);
2348                 }
2349         }
2350 
2351         return (cpu->cpu_id);   /* one thread per core, or no usable leaf */
2352 }
2353 
2354 /*
2355  * IDs on AMD is a more challenging task. This is notable because of the
2356  * following two facts:
2357  *
2358  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2359  *     also no way to get an actual unique core id from the system. As such, we
2360  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2361  *     however, guarantee that sibling cores of a chip will have sequential
2362  *     coreids starting at a multiple of the number of cores per chip - that is
2363  *     usually the case, but if the ACPI MADT table is presented in a different
2364  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2365  *
2366  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2367  *     called compute units. These compute units share the L1I cache, L2 cache,
2368  *     and the FPU. To deal with this, a new topology leaf was added in
2369  *     0x8000001e. However, parts of this leaf have different meanings
2370  *     once we get to family 0x17.
2371  */
2372 
2373 static void
2374 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2375 {
2376         int i, first_half, coreidsz;
2377         uint32_t nb_caps_reg;
2378         uint_t node2_1;
2379         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2380         struct cpuid_regs *cp;
2381 
2382         /*
2383          * Calculate the core id (this comes from hardware in family 0x17 if it
2384          * hasn't been stripped by virtualization). We always set the compute
2385          * unit id to the same value. Also, initialize the default number of
2386          * cores per compute unit and nodes per package. This will be
2387          * overwritten when we know information about a particular family.
2388          */
2389         cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2390         cpi->cpi_compunitid = cpi->cpi_coreid;
2391         cpi->cpi_cores_per_compunit = 1;
2392         cpi->cpi_procnodes_per_pkg = 1;
2393 
2394         /*
2395          * To construct the logical ID, we need to determine how many APIC IDs
2396          * are dedicated to the cores and threads. This is provided for us in
2397          * 0x80000008. However, if it's not present (say due to virtualization),
2398          * then we assume it's one. This should be present on all 64-bit AMD
2399          * processors.  It was added in family 0xf (Hammer).
2400          */
2401         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2402                 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2403 
2404                 /*
2405                  * In AMD parlance chip is really a node while illumos
2406                  * uses chip as equivalent to socket/package.
2407                  */
2408                 if (coreidsz == 0) {
2409                         /* Use legacy method */
2410                         for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2411                                 coreidsz++;
2412                         if (coreidsz == 0)
2413                                 coreidsz = 1;
2414                 }
2415         } else {
2416                 /* Assume single-core part */
2417                 coreidsz = 1;
2418         }
2419         cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);      /* low coreidsz bits */
2420 
2421         /*
2422          * The package core ID varies depending on the family. While it may be
2423          * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2424          * this value is the core id in the given node. For non-virtualized
2425          * family 17h, we need to take the logical core id and shift off the
2426          * threads like we do when getting the core id.  Otherwise, we can use
2427          * the clogid as is. When family 17h is virtualized, the clogid should
2428          * be sufficient as if we don't have valid data in the leaf, then we
2429          * won't think we have SMT, in which case the cpi_clogid should be
2430          * sufficient.
2431          */
2432         if (cpi->cpi_family >= 0x17 &&
2433             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2434             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2435             cpi->cpi_extd[0x1e].cp_ebx != 0) {
2436                 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2437                 if (nthreads > 1) {
2438                         VERIFY3U(nthreads, ==, 2);      /* the 1-bit shift below assumes 2 */
2439                         cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2440                 } else {
2441                         cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2442                 }
2443         } else {
2444                 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2445         }
2446 
2447         /*
2448          * Obtain the node ID and compute unit IDs. If we're on family 0x15
2449          * (bulldozer) or newer, then we can derive all of this from leaf
2450          * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2451          */
2452         if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2453             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2454                 cp = &cpi->cpi_extd[0x1e];
2455 
2456                 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2457                 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2458 
2459                 /*
2460                  * For Bulldozer-era CPUs, recalculate the compute unit
2461                  * information.
2462                  */
2463                 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2464                         cpi->cpi_cores_per_compunit =
2465                             BITX(cp->cp_ebx, 15, 8) + 1;
2466                         cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2467                             (cpi->cpi_ncore_per_chip /
2468                             cpi->cpi_cores_per_compunit) *
2469                             (cpi->cpi_procnodeid /
2470                             cpi->cpi_procnodes_per_pkg);
2471                 }
2472         } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2473                 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2474         } else if (cpi->cpi_family == 0x10) {
2475                 /*
2476                  * See if we are a multi-node processor.
2477                  * All processors in the system have the same number of nodes
2478                  */
2479                 nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2480                 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2481                         /* Single-node */
2482                         cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2483                             coreidsz);
2484                 } else {
2485 
2486                         /*
2487                          * Multi-node revision D (2 nodes per package
2488                          * are supported)
2489                          */
2490                         cpi->cpi_procnodes_per_pkg = 2;
2491 
2492                         first_half = (cpi->cpi_pkgcoreid <=
2493                             (cpi->cpi_ncore_per_chip/2 - 1));
2494 
2495                         if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2496                                 /* We are BSP */
2497                                 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2498                         } else {
2499 
2500                                 /* We are AP */
2501                                 /* NodeId[2:1] bits to use for reading F3xe8 */
2502                                 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2503 
2504                                 nb_caps_reg =
2505                                     pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2506 
2507                                 /*
2508                                  * Check IntNodeNum bit (31:30, but bit 31 is
2509                                  * always 0 on dual-node processors)
2510                                  */
2511                                 if (BITX(nb_caps_reg, 30, 30) == 0)
2512                                         cpi->cpi_procnodeid = node2_1 +
2513                                             !first_half;
2514                                 else
2515                                         cpi->cpi_procnodeid = node2_1 +
2516                                             first_half;
2517                         }
2518                 }
2519         } else {
2520                 cpi->cpi_procnodeid = 0;
2521         }
2522 
2523         cpi->cpi_chipid =
2524             cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2525 
2526         cpi->cpi_ncore_bits = coreidsz;
2527         cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2528             cpi->cpi_ncore_per_chip);   /* NOTE(review): assumes ncore_per_chip != 0 */
2529 }
2530 
2531 static void
2532 spec_uarch_flush_noop(void)
2533 {       /* intentionally empty: initial target of spec_uarch_flush */
2534 }
2535 
2536 /*
2537  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2538  * MDS-related micro-architectural state that would normally happen by calling
2539  * x86_md_clear().
2540  */
2541 static void
2542 spec_uarch_flush_msr(void)
2543 {
2544         wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);  /* request an L1D flush */
2545 }
2546 
2547 /*
2548  * This function pointer refers to a function that will flush certain
2549  * micro-architectural state on the processor. This flush is used to mitigate
2550  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2551  * pointer can refer to one of three functions:
2552  *
2553  * - A noop which is done because we either are vulnerable, but do not have
2554  *   microcode available to help deal with a fix, or because we aren't
2555  *   vulnerable.
2556  *
2557  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2558  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2559  *   however, it only flushes the MDS related micro-architectural state on the
2560  *   current hyperthread, it does not do anything for the twin.
2561  *
2562  * - x86_md_clear which will flush the MDS related state. This is done when we
2563  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2564  *   (RDCL_NO is set).
2565  */
2566 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2567 
2568 static void
2569 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2570 {
2571         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2572 
2573         /*
2574          * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2575          * has been fixed in hardware, it doesn't cover everything related to
2576          * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2577          * need to mitigate this.
2578          */
2579         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2580             is_x86_feature(featureset, X86FSET_MDS_NO)) {
2581                 return;
2582         }
2583 
2584         if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2585                 const uint8_t nop = NOP_INSTR;
2586                 uint8_t *md = (uint8_t *)x86_md_clear;
2587 
2588                 *md = nop;
2589         }
2590 
2591         membar_producer();
2592 }
2593 
2594 static void
2595 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2596 {
2597         boolean_t need_l1d, need_mds;
2598         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2599 
2600         /*
2601          * If we're not on Intel or we've mitigated both RDCL and MDS in
2602          * hardware, then there's nothing left for us to do for enabling the
2603          * flush. We can also go ahead and say that SMT exclusion is
2604          * unnecessary.
2605          */
2606         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2607             (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2608             is_x86_feature(featureset, X86FSET_MDS_NO))) {
2609                 extern int smt_exclusion;
2610                 smt_exclusion = 0;
2611                 spec_uarch_flush = spec_uarch_flush_noop;
2612                 membar_producer();
2613                 return;
2614         }
2615 
2616         /*
2617          * The locations where we need to perform an L1D flush are required both
2618          * for mitigating L1TF and MDS. When verw support is present in
2619          * microcode, then the L1D flush will take care of doing that as well.
2620          * However, if we have a system where RDCL_NO is present, but we don't
2621          * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2622          * L1D flush.
2623          */
2624         if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2625             is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2626             !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2627                 need_l1d = B_TRUE;
2628         } else {
2629                 need_l1d = B_FALSE;
2630         }
2631 
2632         if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2633             is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2634                 need_mds = B_TRUE;
2635         } else {
2636                 need_mds = B_FALSE;
2637         }
2638 
2639         if (need_l1d) {
2640                 spec_uarch_flush = spec_uarch_flush_msr;
2641         } else if (need_mds) {
2642                 spec_uarch_flush = x86_md_clear;
2643         } else {
2644                 /*
2645                  * We have no hardware mitigations available to us.
2646                  */
2647                 spec_uarch_flush = spec_uarch_flush_noop;
2648         }
2649         membar_producer();
2650 }
2651 
2652 /*
2653  * We default to enabling RSB mitigations.
2654  */
2655 static void
2656 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2657 {
2658         const uint8_t ret = RET_INSTR;
2659         uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2660 
2661         switch (mit) {
2662         case X86_SPECTREV2_ENHANCED_IBRS:
2663         case X86_SPECTREV2_DISABLED:
2664                 *stuff = ret;
2665                 break;
2666         default:
2667                 break;
2668         }
2669 }
2670 
2671 static void
2672 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2673 {
2674         const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2675             "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2676             "_r14", "_r15" };
2677         const uint_t nthunks = ARRAY_SIZE(thunks);
2678         const char *type;
2679         uint_t i;
2680 
2681         if (mit == x86_spectrev2_mitigation)
2682                 return;
2683 
2684         switch (mit) {
2685         case X86_SPECTREV2_RETPOLINE:
2686                 type = "gen";
2687                 break;
2688         case X86_SPECTREV2_RETPOLINE_AMD:
2689                 type = "amd";
2690                 break;
2691         case X86_SPECTREV2_ENHANCED_IBRS:
2692         case X86_SPECTREV2_DISABLED:
2693                 type = "jmp";
2694                 break;
2695         default:
2696                 panic("asked to updated retpoline state with unknown state!");
2697         }
2698 
2699         for (i = 0; i < nthunks; i++) {
2700                 uintptr_t source, dest;
2701                 int ssize, dsize;
2702                 char sourcebuf[64], destbuf[64];
2703                 size_t len;
2704 
2705                 (void) snprintf(destbuf, sizeof (destbuf),
2706                     "__x86_indirect_thunk%s", thunks[i]);
2707                 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2708                     "__x86_indirect_thunk_%s%s", type, thunks[i]);
2709 
2710                 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2711                 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2712                 VERIFY3U(source, !=, 0);
2713                 VERIFY3U(dest, !=, 0);
2714                 VERIFY3S(dsize, >=, ssize);
2715                 bcopy((void *)source, (void *)dest, ssize);
2716         }
2717 }
2718 
2719 static void
2720 cpuid_enable_enhanced_ibrs(void)
2721 {
2722         uint64_t val;
2723 
2724         val = rdmsr(MSR_IA32_SPEC_CTRL);
2725         val |= IA32_SPEC_CTRL_IBRS;
2726         wrmsr(MSR_IA32_SPEC_CTRL, val);
2727 }
2728 
2729 #ifndef __xpv
2730 /*
2731  * Determine whether or not we can use the AMD optimized retpoline
2732  * functionality. We use this when we know we're on an AMD system and we can
2733  * successfully verify that lfence is dispatch serializing.
2734  */
2735 static boolean_t
2736 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2737 {
2738         uint64_t val;
2739         on_trap_data_t otd;
2740 
2741         if (cpi->cpi_vendor != X86_VENDOR_AMD)
2742                 return (B_FALSE);
2743 
2744         /*
2745          * We need to determine whether or not lfence is serializing. It always
2746          * is on families 0xf and 0x11. On others, it's controlled by
2747          * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2748          * crazy old family, don't try and do anything.
2749          */
2750         if (cpi->cpi_family < 0xf)
2751                 return (B_FALSE);
2752         if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2753                 return (B_TRUE);
2754 
2755         /*
2756          * While it may be tempting to use get_hwenv(), there are no promises
2757          * that a hypervisor will actually declare themselves to be so in a
2758          * friendly way. As such, try to read and set the MSR. If we can then
2759          * read back the value we set (it wasn't just set to zero), then we go
2760          * for it.
2761          */
2762         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2763                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2764                 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2765                 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2766                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2767         } else {
2768                 val = 0;
2769         }
2770         no_trap();
2771 
2772         if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2773                 return (B_TRUE);
2774         return (B_FALSE);
2775 }
2776 #endif  /* !__xpv */
2777 
2778 /*
2779  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2780  * we can disable TSX, we do so.
2781  *
2782  * This determination is done only on the boot CPU, potentially after loading
2783  * updated microcode.
2784  */
2785 static void
2786 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2787 {
2788         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2789 
2790         VERIFY(cpu->cpu_id == 0);
2791 
2792         if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2793                 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2794                 return;
2795         }
2796 
2797         if (x86_disable_taa) {
2798                 x86_taa_mitigation = X86_TAA_DISABLED;
2799                 return;
2800         }
2801 
2802         /*
2803          * If we do not have the ability to disable TSX, then our only
2804          * mitigation options are in hardware (TAA_NO), or by using our existing
2805          * MDS mitigation as described above.  The latter relies upon us having
2806          * configured MDS mitigations correctly! This includes disabling SMT if
2807          * we want to cross-CPU-thread protection.
2808          */
2809         if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2810                 /*
2811                  * It's not clear whether any parts will enumerate TAA_NO
2812                  * *without* TSX_CTRL, but let's mark it as such if we see this.
2813                  */
2814                 if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2815                         x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2816                         return;
2817                 }
2818 
2819                 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2820                     !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2821                         x86_taa_mitigation = X86_TAA_MD_CLEAR;
2822                 } else {
2823                         x86_taa_mitigation = X86_TAA_NOTHING;
2824                 }
2825                 return;
2826         }
2827 
2828         /*
2829          * We have TSX_CTRL, but we can only fully disable TSX if we're early
2830          * enough in boot.
2831          *
2832          * Otherwise, we'll fall back to causing transactions to abort as our
2833          * mitigation. TSX-using code will always take the fallback path.
2834          */
2835         if (cpi->cpi_pass < 4) {
2836                 x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2837         } else {
2838                 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2839         }
2840 }
2841 
2842 /*
2843  * As mentioned, we should only touch the MSR when we've got a suitable
2844  * microcode loaded on this CPU.
2845  */
2846 static void
2847 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
2848 {
2849         uint64_t val;
2850 
2851         switch (taa) {
2852         case X86_TAA_TSX_DISABLE:
2853                 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2854                         return;
2855                 val = rdmsr(MSR_IA32_TSX_CTRL);
2856                 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2857                 wrmsr(MSR_IA32_TSX_CTRL, val);
2858                 break;
2859         case X86_TAA_TSX_FORCE_ABORT:
2860                 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2861                         return;
2862                 val = rdmsr(MSR_IA32_TSX_CTRL);
2863                 val |= IA32_TSX_CTRL_RTM_DISABLE;
2864                 wrmsr(MSR_IA32_TSX_CTRL, val);
2865                 break;
2866         case X86_TAA_HW_MITIGATED:
2867         case X86_TAA_MD_CLEAR:
2868         case X86_TAA_DISABLED:
2869         case X86_TAA_NOTHING:
2870                 break;
2871         }
2872 }
2873 
2874 static void
2875 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2876 {
2877         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2878         x86_spectrev2_mitigation_t v2mit;
2879 
2880         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2881             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2882                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2883                         add_x86_feature(featureset, X86FSET_IBPB);
2884                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2885                         add_x86_feature(featureset, X86FSET_IBRS);
2886                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2887                         add_x86_feature(featureset, X86FSET_STIBP);
2888                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2889                         add_x86_feature(featureset, X86FSET_STIBP_ALL);
2890                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2891                         add_x86_feature(featureset, X86FSET_SSBD);
2892                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2893                         add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2894                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2895                         add_x86_feature(featureset, X86FSET_SSB_NO);
2896                 /*
2897                  * Don't enable enhanced IBRS unless we're told that we should
2898                  * prefer it and it has the same semantics as Intel. This is
2899                  * split into two bits rather than a single one.
2900                  */
2901                 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2902                     (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2903                         add_x86_feature(featureset, X86FSET_IBRS_ALL);
2904                 }
2905 
2906         } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2907             cpi->cpi_maxeax >= 7) {
2908                 struct cpuid_regs *ecp;
2909                 ecp = &cpi->cpi_std[7];
2910 
2911                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2912                         add_x86_feature(featureset, X86FSET_MD_CLEAR);
2913                 }
2914 
2915                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2916                         add_x86_feature(featureset, X86FSET_IBRS);
2917                         add_x86_feature(featureset, X86FSET_IBPB);
2918                 }
2919 
2920                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2921                         add_x86_feature(featureset, X86FSET_STIBP);
2922                 }
2923 
2924                 /*
2925                  * Don't read the arch caps MSR on xpv where we lack the
2926                  * on_trap().
2927                  */
2928 #ifndef __xpv
2929                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2930                         on_trap_data_t otd;
2931 
2932                         /*
2933                          * Be paranoid and assume we'll get a #GP.
2934                          */
2935                         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2936                                 uint64_t reg;
2937 
2938                                 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2939                                 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2940                                         add_x86_feature(featureset,
2941                                             X86FSET_RDCL_NO);
2942                                 }
2943                                 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2944                                         add_x86_feature(featureset,
2945                                             X86FSET_IBRS_ALL);
2946                                 }
2947                                 if (reg & IA32_ARCH_CAP_RSBA) {
2948                                         add_x86_feature(featureset,
2949                                             X86FSET_RSBA);
2950                                 }
2951                                 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2952                                         add_x86_feature(featureset,
2953                                             X86FSET_L1D_VM_NO);
2954                                 }
2955                                 if (reg & IA32_ARCH_CAP_SSB_NO) {
2956                                         add_x86_feature(featureset,
2957                                             X86FSET_SSB_NO);
2958                                 }
2959                                 if (reg & IA32_ARCH_CAP_MDS_NO) {
2960                                         add_x86_feature(featureset,
2961                                             X86FSET_MDS_NO);
2962                                 }
2963                                 if (reg & IA32_ARCH_CAP_TSX_CTRL) {
2964                                         add_x86_feature(featureset,
2965                                             X86FSET_TSX_CTRL);
2966                                 }
2967                                 if (reg & IA32_ARCH_CAP_TAA_NO) {
2968                                         add_x86_feature(featureset,
2969                                             X86FSET_TAA_NO);
2970                                 }
2971                         }
2972                         no_trap();
2973                 }
2974 #endif  /* !__xpv */
2975 
2976                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2977                         add_x86_feature(featureset, X86FSET_SSBD);
2978 
2979                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2980                         add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2981         }
2982 
2983         /*
2984          * Take care of certain mitigations on the non-boot CPU. The boot CPU
2985          * will have already run this function and determined what we need to
2986          * do. This gives us a hook for per-HW thread mitigations such as
2987          * enhanced IBRS, or disabling TSX.
2988          */
2989         if (cpu->cpu_id != 0) {
2990                 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2991                         cpuid_enable_enhanced_ibrs();
2992                 }
2993 
2994                 cpuid_apply_tsx(x86_taa_mitigation, featureset);
2995                 return;
2996         }
2997 
2998         /*
2999          * Go through and initialize various security mechanisms that we should
3000          * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3001          * TAA.
3002          */
3003 
3004         /*
3005          * By default we've come in with retpolines enabled. Check whether we
3006          * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3007          * by default, but disabled if we are using enhanced IBRS.
3008          */
3009         if (x86_disable_spectrev2 != 0) {
3010                 v2mit = X86_SPECTREV2_DISABLED;
3011         } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3012                 cpuid_enable_enhanced_ibrs();
3013                 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3014 #ifndef __xpv
3015         } else if (cpuid_use_amd_retpoline(cpi)) {
3016                 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
3017 #endif  /* !__xpv */
3018         } else {
3019                 v2mit = X86_SPECTREV2_RETPOLINE;
3020         }
3021 
3022         cpuid_patch_retpolines(v2mit);
3023         cpuid_patch_rsb(v2mit);
3024         x86_spectrev2_mitigation = v2mit;
3025         membar_producer();
3026 
3027         /*
3028          * We need to determine what changes are required for mitigating L1TF
3029          * and MDS. If the CPU suffers from either of them, then SMT exclusion
3030          * is required.
3031          *
3032          * If any of these are present, then we need to flush u-arch state at
3033          * various points. For MDS, we need to do so whenever we change to a
3034          * lesser privilege level or we are halting the CPU. For L1TF we need to
3035          * flush the L1D cache at VM entry. When we have microcode that handles
3036          * MDS, the L1D flush also clears the other u-arch state that the
3037          * md_clear does.
3038          */
3039 
3040         /*
3041          * Update whether or not we need to be taking explicit action against
3042          * MDS.
3043          */
3044         cpuid_update_md_clear(cpu, featureset);
3045 
3046         /*
3047          * Determine whether SMT exclusion is required and whether or not we
3048          * need to perform an l1d flush.
3049          */
3050         cpuid_update_l1d_flush(cpu, featureset);
3051 
3052         /*
3053          * Determine what our mitigation strategy should be for TAA and then
3054          * also apply TAA mitigations.
3055          */
3056         cpuid_update_tsx(cpu, featureset);
3057         cpuid_apply_tsx(x86_taa_mitigation, featureset);
3058 }
3059 
3060 /*
3061  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3062  */
3063 void
3064 setup_xfem(void)
3065 {
3066         uint64_t flags = XFEATURE_LEGACY_FP;
3067 
3068         ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3069 
3070         if (is_x86_feature(x86_featureset, X86FSET_SSE))
3071                 flags |= XFEATURE_SSE;
3072 
3073         if (is_x86_feature(x86_featureset, X86FSET_AVX))
3074                 flags |= XFEATURE_AVX;
3075 
3076         if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3077                 flags |= XFEATURE_AVX512;
3078 
3079         set_xcr(XFEATURE_ENABLED_MASK, flags);
3080 
3081         xsave_bv_all = flags;
3082 }
3083 
3084 static void
3085 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
3086 {
3087         struct cpuid_info *cpi;
3088 
3089         cpi = cpu->cpu_m.mcpu_cpi;
3090 
3091         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3092                 cpuid_gather_amd_topology_leaves(cpu);
3093         }
3094 
3095         cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3096 
3097         /*
3098          * Before we can calculate the IDs that we should assign to this
3099          * processor, we need to understand how many cores and threads it has.
3100          */
3101         switch (cpi->cpi_vendor) {
3102         case X86_VENDOR_Intel:
3103                 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3104                     &cpi->cpi_ncore_per_chip);
3105                 break;
3106         case X86_VENDOR_AMD:
3107                 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3108                     &cpi->cpi_ncore_per_chip);
3109                 break;
3110         default:
3111                 /*
3112                  * If we have some other x86 compatible chip, it's not clear how
3113                  * they would behave. The most common case is virtualization
3114                  * today, though there are also 64-bit VIA chips. Assume that
3115                  * all we can get is the basic Leaf 1 HTT information.
3116                  */
3117                 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3118                         cpi->cpi_ncore_per_chip = 1;
3119                         cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3120                 }
3121                 break;
3122         }
3123 
3124         /*
3125          * Based on the calculated number of threads and cores, potentially
3126          * assign the HTT and CMT features.
3127          */
3128         if (cpi->cpi_ncore_per_chip > 1) {
3129                 add_x86_feature(featureset, X86FSET_CMP);
3130         }
3131 
3132         if (cpi->cpi_ncpu_per_chip > 1 &&
3133             cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3134                 add_x86_feature(featureset, X86FSET_HTT);
3135         }
3136 
3137         /*
3138          * Now that has been set up, we need to go through and calculate all of
3139          * the rest of the parameters that exist. If we think the CPU doesn't
3140          * have either SMT (HTT) or CMP, then we basically go through and fake
3141          * up information in some way. The most likely case for this is
3142          * virtualization where we have a lot of partial topology information.
3143          */
3144         if (!is_x86_feature(featureset, X86FSET_HTT) &&
3145             !is_x86_feature(featureset, X86FSET_CMP)) {
3146                 /*
3147                  * This is a single core, single-threaded processor.
3148                  */
3149                 cpi->cpi_procnodes_per_pkg = 1;
3150                 cpi->cpi_cores_per_compunit = 1;
3151                 cpi->cpi_compunitid = 0;
3152                 cpi->cpi_chipid = -1;
3153                 cpi->cpi_clogid = 0;
3154                 cpi->cpi_coreid = cpu->cpu_id;
3155                 cpi->cpi_pkgcoreid = 0;
3156                 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3157                         cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3158                 } else {
3159                         cpi->cpi_procnodeid = cpi->cpi_chipid;
3160                 }
3161         } else {
3162                 switch (cpi->cpi_vendor) {
3163                 case X86_VENDOR_Intel:
3164                         cpuid_intel_getids(cpu, featureset);
3165                         break;
3166                 case X86_VENDOR_AMD:
3167                         cpuid_amd_getids(cpu, featureset);
3168                         break;
3169                 default:
3170                         /*
3171                          * In this case, it's hard to say what we should do.
3172                          * We're going to model them to the OS as single core
3173                          * threads. We don't have a good identifier for them, so
3174                          * we're just going to use the cpu id all on a single
3175                          * chip.
3176                          *
3177                          * This case has historically been different from the
3178                          * case above where we don't have HTT or CMP. While they
3179                          * could be combined, we've opted to keep it separate to
3180                          * minimize the risk of topology changes in weird cases.
3181                          */
3182                         cpi->cpi_procnodes_per_pkg = 1;
3183                         cpi->cpi_cores_per_compunit = 1;
3184                         cpi->cpi_chipid = 0;
3185                         cpi->cpi_coreid = cpu->cpu_id;
3186                         cpi->cpi_clogid = cpu->cpu_id;
3187                         cpi->cpi_pkgcoreid = cpu->cpu_id;
3188                         cpi->cpi_procnodeid = cpi->cpi_chipid;
3189                         cpi->cpi_compunitid = cpi->cpi_coreid;
3190                         break;
3191                 }
3192         }
3193 }
3194 
3195 /*
3196  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3197  * always gather leaf 6 if it's supported; however, we only look for features on
3198  * Intel systems as AMD does not currently define any of the features we look
3199  * for below.
3200  */
3201 static void
3202 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3203 {
3204         struct cpuid_regs *cp;
3205         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3206 
3207         if (cpi->cpi_maxeax < 6) {
3208                 return;
3209         }
3210 
3211         cp = &cpi->cpi_std[6];
3212         cp->cp_eax = 6;
3213         cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3214         (void) __cpuid_insn(cp);
3215         platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3216 
3217         if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3218                 return;
3219         }
3220 
3221         if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3222                 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3223         }
3224 
3225         if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3226                 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3227         }
3228 }
3229 
3230 void
3231 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3232 {
3233         uint32_t mask_ecx, mask_edx;
3234         struct cpuid_info *cpi;
3235         struct cpuid_regs *cp;
3236         int xcpuid;
3237 #if !defined(__xpv)
3238         extern int idle_cpu_prefer_mwait;
3239 #endif
3240 
3241         /*
3242          * Space statically allocated for BSP, ensure pointer is set
3243          */
3244         if (cpu->cpu_id == 0) {
3245                 if (cpu->cpu_m.mcpu_cpi == NULL)
3246                         cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3247         }
3248 
3249         add_x86_feature(featureset, X86FSET_CPUID);
3250 
3251         cpi = cpu->cpu_m.mcpu_cpi;
3252         ASSERT(cpi != NULL);
3253         cp = &cpi->cpi_std[0];
3254         cp->cp_eax = 0;
3255         cpi->cpi_maxeax = __cpuid_insn(cp);
3256         {
3257                 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3258                 *iptr++ = cp->cp_ebx;
3259                 *iptr++ = cp->cp_edx;
3260                 *iptr++ = cp->cp_ecx;
3261                 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3262         }
3263 
3264         cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3265         x86_vendor = cpi->cpi_vendor; /* for compatibility */
3266 
3267         /*
3268          * Limit the range in case of weird hardware
3269          */
3270         if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3271                 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3272         if (cpi->cpi_maxeax < 1)
3273                 goto pass1_done;
3274 
3275         cp = &cpi->cpi_std[1];
3276         cp->cp_eax = 1;
3277         (void) __cpuid_insn(cp);
3278 
3279         /*
3280          * Extract identifying constants for easy access.
3281          */
3282         cpi->cpi_model = CPI_MODEL(cpi);
3283         cpi->cpi_family = CPI_FAMILY(cpi);
3284 
3285         if (cpi->cpi_family == 0xf)
3286                 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3287 
3288         /*
3289          * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3290          * Intel, and presumably everyone else, uses model == 0xf, as
3291          * one would expect (max value means possible overflow).  Sigh.
3292          */
3293 
3294         switch (cpi->cpi_vendor) {
3295         case X86_VENDOR_Intel:
3296                 if (IS_EXTENDED_MODEL_INTEL(cpi))
3297                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3298                 break;
3299         case X86_VENDOR_AMD:
3300                 if (CPI_FAMILY(cpi) == 0xf)
3301                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3302                 break;
3303         default:
3304                 if (cpi->cpi_model == 0xf)
3305                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3306                 break;
3307         }
3308 
3309         cpi->cpi_step = CPI_STEP(cpi);
3310         cpi->cpi_brandid = CPI_BRANDID(cpi);
3311 
3312         /*
3313          * *default* assumptions:
3314          * - believe %edx feature word
3315          * - ignore %ecx feature word
3316          * - 32-bit virtual and physical addressing
3317          */
3318         mask_edx = 0xffffffff;
3319         mask_ecx = 0;
3320 
3321         cpi->cpi_pabits = cpi->cpi_vabits = 32;
3322 
3323         switch (cpi->cpi_vendor) {
3324         case X86_VENDOR_Intel:
3325                 if (cpi->cpi_family == 5)
3326                         x86_type = X86_TYPE_P5;
3327                 else if (IS_LEGACY_P6(cpi)) {
3328                         x86_type = X86_TYPE_P6;
3329                         pentiumpro_bug4046376 = 1;
3330                         /*
3331                          * Clear the SEP bit when it was set erroneously
3332                          */
3333                         if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3334                                 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3335                 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3336                         x86_type = X86_TYPE_P4;
3337                         /*
3338                          * We don't currently depend on any of the %ecx
3339                          * features until Prescott, so we'll only check
3340                          * this from P4 onwards.  We might want to revisit
3341                          * that idea later.
3342                          */
3343                         mask_ecx = 0xffffffff;
3344                 } else if (cpi->cpi_family > 0xf)
3345                         mask_ecx = 0xffffffff;
3346                 /*
3347                  * We don't support MONITOR/MWAIT if leaf 5 is not available
3348                  * to obtain the monitor linesize.
3349                  */
3350                 if (cpi->cpi_maxeax < 5)
3351                         mask_ecx &= ~CPUID_INTC_ECX_MON;
3352                 break;
3353         case X86_VENDOR_IntelClone:
3354         default:
3355                 break;
3356         case X86_VENDOR_AMD:
3357 #if defined(OPTERON_ERRATUM_108)
3358                 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3359                         cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3360                         cpi->cpi_model = 0xc;
3361                 } else
3362 #endif
3363                 if (cpi->cpi_family == 5) {
3364                         /*
3365                          * AMD K5 and K6
3366                          *
3367                          * These CPUs have an incomplete implementation
3368                          * of MCA/MCE which we mask away.
3369                          */
3370                         mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3371 
3372                         /*
3373                          * Model 0 uses the wrong (APIC) bit
3374                          * to indicate PGE.  Fix it here.
3375                          */
3376                         if (cpi->cpi_model == 0) {
3377                                 if (cp->cp_edx & 0x200) {
3378                                         cp->cp_edx &= ~0x200;
3379                                         cp->cp_edx |= CPUID_INTC_EDX_PGE;
3380                                 }
3381                         }
3382 
3383                         /*
3384                          * Early models had problems w/ MMX; disable.
3385                          */
3386                         if (cpi->cpi_model < 6)
3387                                 mask_edx &= ~CPUID_INTC_EDX_MMX;
3388                 }
3389 
3390                 /*
3391                  * For newer families, SSE3 and CX16, at least, are valid;
3392                  * enable all
3393                  */
3394                 if (cpi->cpi_family >= 0xf)
3395                         mask_ecx = 0xffffffff;
3396                 /*
3397                  * We don't support MONITOR/MWAIT if leaf 5 is not available
3398                  * to obtain the monitor linesize.
3399                  */
3400                 if (cpi->cpi_maxeax < 5)
3401                         mask_ecx &= ~CPUID_INTC_ECX_MON;
3402 
3403 #if !defined(__xpv)
3404                 /*
3405                  * AMD has not historically used MWAIT in the CPU's idle loop.
3406                  * Pre-family-10h Opterons do not have the MWAIT instruction. We
3407                  * know for certain that in at least family 17h, per AMD, mwait
3408                  * is preferred. Families in-between are less certain.
3409                  */
3410                 if (cpi->cpi_family < 0x17) {
3411                         idle_cpu_prefer_mwait = 0;
3412                 }
3413 #endif
3414 
3415                 break;
3416         case X86_VENDOR_TM:
3417                 /*
3418                  * work around the NT workaround in CMS 4.1
3419                  */
3420                 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3421                     (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3422                         cp->cp_edx |= CPUID_INTC_EDX_CX8;
3423                 break;
3424         case X86_VENDOR_Centaur:
3425                 /*
3426                  * work around the NT workarounds again
3427                  */
3428                 if (cpi->cpi_family == 6)
3429                         cp->cp_edx |= CPUID_INTC_EDX_CX8;
3430                 break;
3431         case X86_VENDOR_Cyrix:
3432                 /*
3433                  * We rely heavily on the probing in locore
3434                  * to actually figure out what parts, if any,
3435                  * of the Cyrix cpuid instruction to believe.
3436                  */
3437                 switch (x86_type) {
3438                 case X86_TYPE_CYRIX_486:
3439                         mask_edx = 0;
3440                         break;
3441                 case X86_TYPE_CYRIX_6x86:
3442                         mask_edx = 0;
3443                         break;
3444                 case X86_TYPE_CYRIX_6x86L:
3445                         mask_edx =
3446                             CPUID_INTC_EDX_DE |
3447                             CPUID_INTC_EDX_CX8;
3448                         break;
3449                 case X86_TYPE_CYRIX_6x86MX:
3450                         mask_edx =
3451                             CPUID_INTC_EDX_DE |
3452                             CPUID_INTC_EDX_MSR |
3453                             CPUID_INTC_EDX_CX8 |
3454                             CPUID_INTC_EDX_PGE |
3455                             CPUID_INTC_EDX_CMOV |
3456                             CPUID_INTC_EDX_MMX;
3457                         break;
3458                 case X86_TYPE_CYRIX_GXm:
3459                         mask_edx =
3460                             CPUID_INTC_EDX_MSR |
3461                             CPUID_INTC_EDX_CX8 |
3462                             CPUID_INTC_EDX_CMOV |
3463                             CPUID_INTC_EDX_MMX;
3464                         break;
3465                 case X86_TYPE_CYRIX_MediaGX:
3466                         break;
3467                 case X86_TYPE_CYRIX_MII:
3468                 case X86_TYPE_VIA_CYRIX_III:
3469                         mask_edx =
3470                             CPUID_INTC_EDX_DE |
3471                             CPUID_INTC_EDX_TSC |
3472                             CPUID_INTC_EDX_MSR |
3473                             CPUID_INTC_EDX_CX8 |
3474                             CPUID_INTC_EDX_PGE |
3475                             CPUID_INTC_EDX_CMOV |
3476                             CPUID_INTC_EDX_MMX;
3477                         break;
3478                 default:
3479                         break;
3480                 }
3481                 break;
3482         }
3483 
3484 #if defined(__xpv)
3485         /*
3486          * Do not support MONITOR/MWAIT under a hypervisor
3487          */
3488         mask_ecx &= ~CPUID_INTC_ECX_MON;
3489         /*
3490          * Do not support XSAVE under a hypervisor for now
3491          */
3492         xsave_force_disable = B_TRUE;
3493 
3494 #endif  /* __xpv */
3495 
3496         if (xsave_force_disable) {
3497                 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3498                 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3499                 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3500                 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3501         }
3502 
3503         /*
3504          * Now we've figured out the masks that determine
3505          * which bits we choose to believe, apply the masks
3506          * to the feature words, then map the kernel's view
3507          * of these feature words into its feature word.
3508          */
3509         cp->cp_edx &= mask_edx;
3510         cp->cp_ecx &= mask_ecx;
3511 
3512         /*
3513          * apply any platform restrictions (we don't call this
3514          * immediately after __cpuid_insn here, because we need the
3515          * workarounds applied above first)
3516          */
3517         platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3518 
3519         /*
3520          * In addition to ecx and edx, Intel and AMD are storing a bunch of
3521          * instruction set extensions in leaf 7's ebx, ecx, and edx.
3522          */
3523         if (cpi->cpi_maxeax >= 7) {
3524                 struct cpuid_regs *ecp;
3525                 ecp = &cpi->cpi_std[7];
3526                 ecp->cp_eax = 7;
3527                 ecp->cp_ecx = 0;
3528                 (void) __cpuid_insn(ecp);
3529 
3530                 /*
3531                  * If XSAVE has been disabled, just ignore all of the
3532                  * extended-save-area dependent flags here.
3533                  */
3534                 if (xsave_force_disable) {
3535                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3536                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3537                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3538                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3539                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3540                         ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3541                         ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3542                 }
3543 
3544                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3545                         add_x86_feature(featureset, X86FSET_SMEP);
3546 
3547                 /*
3548                  * We check disable_smap here in addition to in startup_smap()
3549                  * to ensure CPUs that aren't the boot CPU don't accidentally
3550                  * include it in the feature set and thus generate a mismatched
3551                  * x86 feature set across CPUs.
3552                  */
3553                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3554                     disable_smap == 0)
3555                         add_x86_feature(featureset, X86FSET_SMAP);
3556 
3557                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3558                         add_x86_feature(featureset, X86FSET_RDSEED);
3559 
3560                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3561                         add_x86_feature(featureset, X86FSET_ADX);
3562 
3563                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3564                         add_x86_feature(featureset, X86FSET_FSGSBASE);
3565 
3566                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3567                         add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3568 
3569                 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3570                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3571                                 add_x86_feature(featureset, X86FSET_INVPCID);
3572 
3573                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3574                                 add_x86_feature(featureset, X86FSET_MPX);
3575 
3576                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3577                                 add_x86_feature(featureset, X86FSET_CLWB);
3578                 }
3579         }
3580 
3581         /*
3582          * fold in overrides from the "eeprom" mechanism
3583          */
3584         cp->cp_edx |= cpuid_feature_edx_include;
3585         cp->cp_edx &= ~cpuid_feature_edx_exclude;
3586 
3587         cp->cp_ecx |= cpuid_feature_ecx_include;
3588         cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3589 
3590         if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3591                 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3592         }
3593         if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3594                 add_x86_feature(featureset, X86FSET_TSC);
3595         }
3596         if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3597                 add_x86_feature(featureset, X86FSET_MSR);
3598         }
3599         if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3600                 add_x86_feature(featureset, X86FSET_MTRR);
3601         }
3602         if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3603                 add_x86_feature(featureset, X86FSET_PGE);
3604         }
3605         if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3606                 add_x86_feature(featureset, X86FSET_CMOV);
3607         }
3608         if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3609                 add_x86_feature(featureset, X86FSET_MMX);
3610         }
3611         if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3612             (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3613                 add_x86_feature(featureset, X86FSET_MCA);
3614         }
3615         if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3616                 add_x86_feature(featureset, X86FSET_PAE);
3617         }
3618         if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3619                 add_x86_feature(featureset, X86FSET_CX8);
3620         }
3621         if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3622                 add_x86_feature(featureset, X86FSET_CX16);
3623         }
3624         if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3625                 add_x86_feature(featureset, X86FSET_PAT);
3626         }
3627         if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3628                 add_x86_feature(featureset, X86FSET_SEP);
3629         }
3630         if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3631                 /*
3632                  * In our implementation, fxsave/fxrstor
3633                  * are prerequisites before we'll even
3634                  * try and do SSE things.
3635                  */
3636                 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3637                         add_x86_feature(featureset, X86FSET_SSE);
3638                 }
3639                 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3640                         add_x86_feature(featureset, X86FSET_SSE2);
3641                 }
3642                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3643                         add_x86_feature(featureset, X86FSET_SSE3);
3644                 }
3645                 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3646                         add_x86_feature(featureset, X86FSET_SSSE3);
3647                 }
3648                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3649                         add_x86_feature(featureset, X86FSET_SSE4_1);
3650                 }
3651                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3652                         add_x86_feature(featureset, X86FSET_SSE4_2);
3653                 }
3654                 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3655                         add_x86_feature(featureset, X86FSET_AES);
3656                 }
3657                 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3658                         add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3659                 }
3660 
3661                 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3662                         add_x86_feature(featureset, X86FSET_SHA);
3663 
3664                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3665                         add_x86_feature(featureset, X86FSET_UMIP);
3666                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3667                         add_x86_feature(featureset, X86FSET_PKU);
3668                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3669                         add_x86_feature(featureset, X86FSET_OSPKE);
3670 
3671                 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3672                         add_x86_feature(featureset, X86FSET_XSAVE);
3673 
3674                         /* We only test AVX & AVX512 when there is XSAVE */
3675 
3676                         if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3677                                 add_x86_feature(featureset,
3678                                     X86FSET_AVX);
3679 
3680                                 /*
3681                                  * Intel says we can't check these without also
3682                                  * checking AVX.
3683                                  */
3684                                 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3685                                         add_x86_feature(featureset,
3686                                             X86FSET_F16C);
3687 
3688                                 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3689                                         add_x86_feature(featureset,
3690                                             X86FSET_FMA);
3691 
3692                                 if (cpi->cpi_std[7].cp_ebx &
3693                                     CPUID_INTC_EBX_7_0_BMI1)
3694                                         add_x86_feature(featureset,
3695                                             X86FSET_BMI1);
3696 
3697                                 if (cpi->cpi_std[7].cp_ebx &
3698                                     CPUID_INTC_EBX_7_0_BMI2)
3699                                         add_x86_feature(featureset,
3700                                             X86FSET_BMI2);
3701 
3702                                 if (cpi->cpi_std[7].cp_ebx &
3703                                     CPUID_INTC_EBX_7_0_AVX2)
3704                                         add_x86_feature(featureset,
3705                                             X86FSET_AVX2);
3706                         }
3707 
3708                         if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3709                             (cpi->cpi_std[7].cp_ebx &
3710                             CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3711                                 add_x86_feature(featureset, X86FSET_AVX512F);
3712 
3713                                 if (cpi->cpi_std[7].cp_ebx &
3714                                     CPUID_INTC_EBX_7_0_AVX512DQ)
3715                                         add_x86_feature(featureset,
3716                                             X86FSET_AVX512DQ);
3717                                 if (cpi->cpi_std[7].cp_ebx &
3718                                     CPUID_INTC_EBX_7_0_AVX512IFMA)
3719                                         add_x86_feature(featureset,
3720                                             X86FSET_AVX512FMA);
3721                                 if (cpi->cpi_std[7].cp_ebx &
3722                                     CPUID_INTC_EBX_7_0_AVX512PF)
3723                                         add_x86_feature(featureset,
3724                                             X86FSET_AVX512PF);
3725                                 if (cpi->cpi_std[7].cp_ebx &
3726                                     CPUID_INTC_EBX_7_0_AVX512ER)
3727                                         add_x86_feature(featureset,
3728                                             X86FSET_AVX512ER);
3729                                 if (cpi->cpi_std[7].cp_ebx &
3730                                     CPUID_INTC_EBX_7_0_AVX512CD)
3731                                         add_x86_feature(featureset,
3732                                             X86FSET_AVX512CD);
3733                                 if (cpi->cpi_std[7].cp_ebx &
3734                                     CPUID_INTC_EBX_7_0_AVX512BW)
3735                                         add_x86_feature(featureset,
3736                                             X86FSET_AVX512BW);
3737                                 if (cpi->cpi_std[7].cp_ebx &
3738                                     CPUID_INTC_EBX_7_0_AVX512VL)
3739                                         add_x86_feature(featureset,
3740                                             X86FSET_AVX512VL);
3741 
3742                                 if (cpi->cpi_std[7].cp_ecx &
3743                                     CPUID_INTC_ECX_7_0_AVX512VBMI)
3744                                         add_x86_feature(featureset,
3745                                             X86FSET_AVX512VBMI);
3746                                 if (cpi->cpi_std[7].cp_ecx &
3747                                     CPUID_INTC_ECX_7_0_AVX512VNNI)
3748                                         add_x86_feature(featureset,
3749                                             X86FSET_AVX512VNNI);
3750                                 if (cpi->cpi_std[7].cp_ecx &
3751                                     CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3752                                         add_x86_feature(featureset,
3753                                             X86FSET_AVX512VPOPCDQ);
3754 
3755                                 if (cpi->cpi_std[7].cp_edx &
3756                                     CPUID_INTC_EDX_7_0_AVX5124NNIW)
3757                                         add_x86_feature(featureset,
3758                                             X86FSET_AVX512NNIW);
3759                                 if (cpi->cpi_std[7].cp_edx &
3760                                     CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3761                                         add_x86_feature(featureset,
3762                                             X86FSET_AVX512FMAPS);
3763                         }
3764                 }
3765         }
3766 
3767         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3768                 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3769                         add_x86_feature(featureset, X86FSET_PCID);
3770                 }
3771         }
3772 
3773         if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3774                 add_x86_feature(featureset, X86FSET_X2APIC);
3775         }
3776         if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3777                 add_x86_feature(featureset, X86FSET_DE);
3778         }
3779 #if !defined(__xpv)
3780         if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3781 
3782                 /*
3783                  * We require the CLFLUSH instruction for erratum workaround
3784                  * to use MONITOR/MWAIT.
3785                  */
3786                 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3787                         cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3788                         add_x86_feature(featureset, X86FSET_MWAIT);
3789                 } else {
3790                         extern int idle_cpu_assert_cflush_monitor;
3791 
3792                         /*
3793                          * All processors we are aware of which have
3794                          * MONITOR/MWAIT also have CLFLUSH.
3795                          */
3796                         if (idle_cpu_assert_cflush_monitor) {
3797                                 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3798                                     (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3799                         }
3800                 }
3801         }
3802 #endif  /* __xpv */
3803 
3804         if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3805                 add_x86_feature(featureset, X86FSET_VMX);
3806         }
3807 
3808         if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3809                 add_x86_feature(featureset, X86FSET_RDRAND);
3810 
3811         /*
3812          * Only needed the first time; the rest of the CPUs follow suit.
3813          * We only capture this for the boot CPU.
3814          */
3815         if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3816                 add_x86_feature(featureset, X86FSET_CLFSH);
3817                 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3818         }
3819         if (is_x86_feature(featureset, X86FSET_PAE))
3820                 cpi->cpi_pabits = 36;
3821 
3822         if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3823                 struct cpuid_regs r, *ecp;
3824 
3825                 ecp = &r;
3826                 ecp->cp_eax = 0xD;
3827                 ecp->cp_ecx = 1;
3828                 ecp->cp_edx = ecp->cp_ebx = 0;
3829                 (void) __cpuid_insn(ecp);
3830 
3831                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3832                         add_x86_feature(featureset, X86FSET_XSAVEOPT);
3833                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3834                         add_x86_feature(featureset, X86FSET_XSAVEC);
3835                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3836                         add_x86_feature(featureset, X86FSET_XSAVES);
3837         }
3838 
3839         /*
3840          * Work on the "extended" feature information, doing
3841          * some basic initialization for cpuid_pass2()
3842          */
3843         xcpuid = 0;
3844         switch (cpi->cpi_vendor) {
3845         case X86_VENDOR_Intel:
3846                 /*
3847                  * On KVM we know we will have proper support for extended
3848                  * cpuid.
3849                  */
3850                 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3851                     (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3852                     (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3853                         xcpuid++;
3854                 break;
3855         case X86_VENDOR_AMD:
3856                 if (cpi->cpi_family > 5 ||
3857                     (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3858                         xcpuid++;
3859                 break;
3860         case X86_VENDOR_Cyrix:
3861                 /*
3862                  * Only these Cyrix CPUs are -known- to support
3863                  * extended cpuid operations.
3864                  */
3865                 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3866                     x86_type == X86_TYPE_CYRIX_GXm)
3867                         xcpuid++;
3868                 break;
3869         case X86_VENDOR_Centaur:
3870         case X86_VENDOR_TM:
3871         default:
3872                 xcpuid++;
3873                 break;
3874         }
3875 
3876         if (xcpuid) {
3877                 cp = &cpi->cpi_extd[0];
3878                 cp->cp_eax = CPUID_LEAF_EXT_0;
3879                 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3880         }
3881 
3882         if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3883 
3884                 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3885                         cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3886 
3887                 switch (cpi->cpi_vendor) {
3888                 case X86_VENDOR_Intel:
3889                 case X86_VENDOR_AMD:
3890                         if (cpi->cpi_xmaxeax < 0x80000001)
3891                                 break;
3892                         cp = &cpi->cpi_extd[1];
3893                         cp->cp_eax = 0x80000001;
3894                         (void) __cpuid_insn(cp);
3895 
3896                         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3897                             cpi->cpi_family == 5 &&
3898                             cpi->cpi_model == 6 &&
3899                             cpi->cpi_step == 6) {
3900                                 /*
3901                                  * K6 model 6 uses bit 10 to indicate SYSC.
3902                                  * Later models use bit 11. Fix it here.
3903                                  */
3904                                 if (cp->cp_edx & 0x400) {
3905                                         cp->cp_edx &= ~0x400;
3906                                         cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3907                                 }
3908                         }
3909 
3910                         platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3911 
3912                         /*
3913                          * Compute the additions to the kernel's feature word.
3914                          */
3915                         if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3916                                 add_x86_feature(featureset, X86FSET_NX);
3917                         }
3918 
3919                         /*
3920                          * Regardless whether or not we boot 64-bit,
3921                          * we should have a way to identify whether
3922                          * the CPU is capable of running 64-bit.
3923                          */
3924                         if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3925                                 add_x86_feature(featureset, X86FSET_64);
3926                         }
3927 
3928                         /* 1 GB large page - enable only for 64 bit kernel */
3929                         if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3930                                 add_x86_feature(featureset, X86FSET_1GPG);
3931                         }
3932 
3933                         if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3934                             (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3935                             (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3936                                 add_x86_feature(featureset, X86FSET_SSE4A);
3937                         }
3938 
3939                         /*
3940                          * It's really tricky to support syscall/sysret in
3941                          * the i386 kernel; we rely on sysenter/sysexit
3942                          * instead.  In the amd64 kernel, things are -way-
3943                          * better.
3944                          */
3945                         if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3946                                 add_x86_feature(featureset, X86FSET_ASYSC);
3947                         }
3948 
3949                         /*
3950                          * While we're thinking about system calls, note
3951                          * that AMD processors don't support sysenter
3952                          * in long mode at all, so don't try to program them.
3953                          */
3954                         if (x86_vendor == X86_VENDOR_AMD) {
3955                                 remove_x86_feature(featureset, X86FSET_SEP);
3956                         }
3957 
3958                         if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3959                                 add_x86_feature(featureset, X86FSET_TSCP);
3960                         }
3961 
3962                         if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3963                                 add_x86_feature(featureset, X86FSET_SVM);
3964                         }
3965 
3966                         if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3967                                 add_x86_feature(featureset, X86FSET_TOPOEXT);
3968                         }
3969 
3970                         if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3971                                 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3972                         }
3973 
3974                         if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3975                                 add_x86_feature(featureset, X86FSET_XOP);
3976                         }
3977 
3978                         if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3979                                 add_x86_feature(featureset, X86FSET_FMA4);
3980                         }
3981 
3982                         if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3983                                 add_x86_feature(featureset, X86FSET_TBM);
3984                         }
3985 
3986                         if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3987                                 add_x86_feature(featureset, X86FSET_MONITORX);
3988                         }
3989                         break;
3990                 default:
3991                         break;
3992                 }
3993 
3994                 /*
3995                  * Get CPUID data about processor cores and hyperthreads.
3996                  */
3997                 switch (cpi->cpi_vendor) {
3998                 case X86_VENDOR_Intel:
3999                         if (cpi->cpi_maxeax >= 4) {
4000                                 cp = &cpi->cpi_std[4];
4001                                 cp->cp_eax = 4;
4002                                 cp->cp_ecx = 0;
4003                                 (void) __cpuid_insn(cp);
4004                                 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4005                         }
4006                         /*FALLTHROUGH*/
4007                 case X86_VENDOR_AMD:
4008                         if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4009                                 break;
4010                         cp = &cpi->cpi_extd[8];
4011                         cp->cp_eax = CPUID_LEAF_EXT_8;
4012                         (void) __cpuid_insn(cp);
4013                         platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4014                             cp);
4015 
4016                         /*
4017                          * AMD uses ebx for some extended functions.
4018                          */
4019                         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4020                                 /*
4021                                  * While we're here, check for the AMD "Error
4022                                  * Pointer Zero/Restore" feature. This can be
4023                                  * used to setup the FP save handlers
4024                                  * appropriately.
4025                                  */
4026                                 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4027                                         cpi->cpi_fp_amd_save = 0;
4028                                 } else {
4029                                         cpi->cpi_fp_amd_save = 1;
4030                                 }
4031 
4032                                 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4033                                         add_x86_feature(featureset,
4034                                             X86FSET_CLZERO);
4035                                 }
4036                         }
4037 
4038                         /*
4039                          * Virtual and physical address limits from
4040                          * cpuid override previously guessed values.
4041                          */
4042                         cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4043                         cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4044                         break;
4045                 default:
4046                         break;
4047                 }
4048 
4049                 /*
4050                  * Get CPUID data about TSC Invariance in Deep C-State.
4051                  */
4052                 switch (cpi->cpi_vendor) {
4053                 case X86_VENDOR_Intel:
4054                 case X86_VENDOR_AMD:
4055                         if (cpi->cpi_maxeax >= 7) {
4056                                 cp = &cpi->cpi_extd[7];
4057                                 cp->cp_eax = 0x80000007;
4058                                 cp->cp_ecx = 0;
4059                                 (void) __cpuid_insn(cp);
4060                         }
4061                         break;
4062                 default:
4063                         break;
4064                 }
4065         }
4066 
4067         cpuid_pass1_topology(cpu, featureset);
4068         cpuid_pass1_thermal(cpu, featureset);
4069 
4070         /*
4071          * Synthesize chip "revision" and socket type
4072          */
4073         cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4074             cpi->cpi_model, cpi->cpi_step);
4075         cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4076             cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4077         cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4078             cpi->cpi_model, cpi->cpi_step);
4079 
4080         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4081                 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4082                     cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4083                         /* Special handling for AMD FP not necessary. */
4084                         cpi->cpi_fp_amd_save = 0;
4085                 } else {
4086                         cpi->cpi_fp_amd_save = 1;
4087                 }
4088         }
4089 
4090         /*
4091          * Check the processor leaves that are used for security features.
4092          */
4093         cpuid_scan_security(cpu, featureset);
4094 
4095 pass1_done:
4096         cpi->cpi_pass = 1;
4097 }
4098 
4099 /*
4100  * Make copies of the cpuid table entries we depend on, in
4101  * part for ease of parsing now, in part so that we have only
4102  * one place to correct any of it, in part for ease of
4103  * later export to userland, and in part so we can look at
4104  * this stuff in a crash dump.
4105  */
4106 
4107 /*ARGSUSED*/
4108 void
4109 cpuid_pass2(cpu_t *cpu)
4110 {
4111         uint_t n, nmax;
4112         int i;
4113         struct cpuid_regs *cp;
4114         uint8_t *dp;
4115         uint32_t *iptr;
4116         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4117 
4118         ASSERT(cpi->cpi_pass == 1);
4119 
4120         if (cpi->cpi_maxeax < 1)
4121                 goto pass2_done;
4122 
4123         if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4124                 nmax = NMAX_CPI_STD;
4125         /*
4126          * (We already handled n == 0 and n == 1 in pass 1)
4127          */
4128         for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4129                 /*
4130                  * leaves 6 and 7 were handled in pass 1
4131                  */
4132                 if (n == 6 || n == 7)
4133                         continue;
4134 
4135                 cp->cp_eax = n;
4136 
4137                 /*
4138                  * CPUID function 4 expects %ecx to be initialized
4139                  * with an index which indicates which cache to return
4140                  * information about. The OS is expected to call function 4
4141                  * with %ecx set to 0, 1, 2, ... until it returns with
4142                  * EAX[4:0] set to 0, which indicates there are no more
4143                  * caches.
4144                  *
4145                  * Here, populate cpi_std[4] with the information returned by
4146                  * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
4147                  * when dynamic memory allocation becomes available.
4148                  *
4149                  * Note: we need to explicitly initialize %ecx here, since
4150                  * function 4 may have been previously invoked.
4151                  */
4152                 if (n == 4)
4153                         cp->cp_ecx = 0;
4154 
4155                 (void) __cpuid_insn(cp);
4156                 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4157                 switch (n) {
4158                 case 2:
4159                         /*
4160                          * "the lower 8 bits of the %eax register
4161                          * contain a value that identifies the number
4162                          * of times the cpuid [instruction] has to be
4163                          * executed to obtain a complete image of the
4164                          * processor's caching systems."
4165                          *
4166                          * How *do* they make this stuff up?
4167                          */
4168                         cpi->cpi_ncache = sizeof (*cp) *
4169                             BITX(cp->cp_eax, 7, 0);
4170                         if (cpi->cpi_ncache == 0)
4171                                 break;
4172                         cpi->cpi_ncache--;   /* skip count byte */
4173 
4174                         /*
4175                          * Well, for now, rather than attempt to implement
4176                          * this slightly dubious algorithm, we just look
4177                          * at the first 15 ..
4178                          */
4179                         if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4180                                 cpi->cpi_ncache = sizeof (*cp) - 1;
4181 
4182                         dp = cpi->cpi_cacheinfo;
4183                         if (BITX(cp->cp_eax, 31, 31) == 0) {
4184                                 uint8_t *p = (void *)&cp->cp_eax;
4185                                 for (i = 1; i < 4; i++)
4186                                         if (p[i] != 0)
4187                                                 *dp++ = p[i];
4188                         }
4189                         if (BITX(cp->cp_ebx, 31, 31) == 0) {
4190                                 uint8_t *p = (void *)&cp->cp_ebx;
4191                                 for (i = 0; i < 4; i++)
4192                                         if (p[i] != 0)
4193                                                 *dp++ = p[i];
4194                         }
4195                         if (BITX(cp->cp_ecx, 31, 31) == 0) {
4196                                 uint8_t *p = (void *)&cp->cp_ecx;
4197                                 for (i = 0; i < 4; i++)
4198                                         if (p[i] != 0)
4199                                                 *dp++ = p[i];
4200                         }
4201                         if (BITX(cp->cp_edx, 31, 31) == 0) {
4202                                 uint8_t *p = (void *)&cp->cp_edx;
4203                                 for (i = 0; i < 4; i++)
4204                                         if (p[i] != 0)
4205                                                 *dp++ = p[i];
4206                         }
4207                         break;
4208 
4209                 case 3: /* Processor serial number, if PSN supported */
4210                         break;
4211 
4212                 case 4: /* Deterministic cache parameters */
4213                         break;
4214 
4215                 case 5: /* Monitor/Mwait parameters */
4216                 {
4217                         size_t mwait_size;
4218 
4219                         /*
4220                          * check cpi_mwait.support which was set in cpuid_pass1
4221                          */
4222                         if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4223                                 break;
4224 
4225                         /*
4226                          * Protect ourself from insane mwait line size.
4227                          * Workaround for incomplete hardware emulator(s).
4228                          */
4229                         mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4230                         if (mwait_size < sizeof (uint32_t) ||
4231                             !ISP2(mwait_size)) {
4232 #if DEBUG
4233                                 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4234                                     "size %ld", cpu->cpu_id, (long)mwait_size);
4235 #endif
4236                                 break;
4237                         }
4238 
4239                         cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4240                         cpi->cpi_mwait.mon_max = mwait_size;
4241                         if (MWAIT_EXTENSION(cpi)) {
4242                                 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4243                                 if (MWAIT_INT_ENABLE(cpi))
4244                                         cpi->cpi_mwait.support |=
4245                                             MWAIT_ECX_INT_ENABLE;
4246                         }
4247                         break;
4248                 }
4249                 default:
4250                         break;
4251                 }
4252         }
4253 
4254         /*
4255          * XSAVE enumeration
4256          */
4257         if (cpi->cpi_maxeax >= 0xD) {
4258                 struct cpuid_regs regs;
4259                 boolean_t cpuid_d_valid = B_TRUE;
4260 
4261                 cp = &regs;
4262                 cp->cp_eax = 0xD;
4263                 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4264 
4265                 (void) __cpuid_insn(cp);
4266 
4267                 /*
4268                  * Sanity checks for debug
4269                  */
4270                 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4271                     (cp->cp_eax & XFEATURE_SSE) == 0) {
4272                         cpuid_d_valid = B_FALSE;
4273                 }
4274 
4275                 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4276                 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4277                 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4278 
4279                 /*
4280                  * If the hw supports AVX, get the size and offset in the save
4281                  * area for the ymm state.
4282                  */
4283                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4284                         cp->cp_eax = 0xD;
4285                         cp->cp_ecx = 2;
4286                         cp->cp_edx = cp->cp_ebx = 0;
4287 
4288                         (void) __cpuid_insn(cp);
4289 
4290                         if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4291                             cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4292                                 cpuid_d_valid = B_FALSE;
4293                         }
4294 
4295                         cpi->cpi_xsave.ymm_size = cp->cp_eax;
4296                         cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4297                 }
4298 
4299                 /*
4300                  * If the hw supports MPX, get the size and offset in the
4301                  * save area for BNDREGS and BNDCSR.
4302                  */
4303                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4304                         cp->cp_eax = 0xD;
4305                         cp->cp_ecx = 3;
4306                         cp->cp_edx = cp->cp_ebx = 0;
4307 
4308                         (void) __cpuid_insn(cp);
4309 
4310                         cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4311                         cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4312 
4313                         cp->cp_eax = 0xD;
4314                         cp->cp_ecx = 4;
4315                         cp->cp_edx = cp->cp_ebx = 0;
4316 
4317                         (void) __cpuid_insn(cp);
4318 
4319                         cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4320                         cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4321                 }
4322 
4323                 /*
4324                  * If the hw supports AVX512, get the size and offset in the
4325                  * save area for the opmask registers and zmm state.
4326                  */
4327                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4328                         cp->cp_eax = 0xD;
4329                         cp->cp_ecx = 5;
4330                         cp->cp_edx = cp->cp_ebx = 0;
4331 
4332                         (void) __cpuid_insn(cp);
4333 
4334                         cpi->cpi_xsave.opmask_size = cp->cp_eax;
4335                         cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4336 
4337                         cp->cp_eax = 0xD;
4338                         cp->cp_ecx = 6;
4339                         cp->cp_edx = cp->cp_ebx = 0;
4340 
4341                         (void) __cpuid_insn(cp);
4342 
4343                         cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4344                         cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4345 
4346                         cp->cp_eax = 0xD;
4347                         cp->cp_ecx = 7;
4348                         cp->cp_edx = cp->cp_ebx = 0;
4349 
4350                         (void) __cpuid_insn(cp);
4351 
4352                         cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4353                         cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4354                 }
4355 
4356                 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4357                         xsave_state_size = 0;
4358                 } else if (cpuid_d_valid) {
4359                         xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4360                 } else {
4361                         /* Broken CPUID 0xD, probably in HVM */
4362                         cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4363                             "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4364                             ", ymm_size = %d, ymm_offset = %d\n",
4365                             cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4366                             cpi->cpi_xsave.xsav_hw_features_high,
4367                             (int)cpi->cpi_xsave.xsav_max_size,
4368                             (int)cpi->cpi_xsave.ymm_size,
4369                             (int)cpi->cpi_xsave.ymm_offset);
4370 
4371                         if (xsave_state_size != 0) {
4372                                 /*
4373                                  * This must be a non-boot CPU. We cannot
4374                                  * continue, because boot cpu has already
4375                                  * enabled XSAVE.
4376                                  */
4377                                 ASSERT(cpu->cpu_id != 0);
4378                                 cmn_err(CE_PANIC, "cpu%d: we have already "
4379                                     "enabled XSAVE on boot cpu, cannot "
4380                                     "continue.", cpu->cpu_id);
4381                         } else {
4382                                 /*
4383                                  * If we reached here on the boot CPU, it's also
4384                                  * almost certain that we'll reach here on the
4385                                  * non-boot CPUs. When we're here on a boot CPU
4386                                  * we should disable the feature, on a non-boot
4387                                  * CPU we need to confirm that we have.
4388                                  */
4389                                 if (cpu->cpu_id == 0) {
4390                                         remove_x86_feature(x86_featureset,
4391                                             X86FSET_XSAVE);
4392                                         remove_x86_feature(x86_featureset,
4393                                             X86FSET_AVX);
4394                                         remove_x86_feature(x86_featureset,
4395                                             X86FSET_F16C);
4396                                         remove_x86_feature(x86_featureset,
4397                                             X86FSET_BMI1);
4398                                         remove_x86_feature(x86_featureset,
4399                                             X86FSET_BMI2);
4400                                         remove_x86_feature(x86_featureset,
4401                                             X86FSET_FMA);
4402                                         remove_x86_feature(x86_featureset,
4403                                             X86FSET_AVX2);
4404                                         remove_x86_feature(x86_featureset,
4405                                             X86FSET_MPX);
4406                                         remove_x86_feature(x86_featureset,
4407                                             X86FSET_AVX512F);
4408                                         remove_x86_feature(x86_featureset,
4409                                             X86FSET_AVX512DQ);
4410                                         remove_x86_feature(x86_featureset,
4411                                             X86FSET_AVX512PF);
4412                                         remove_x86_feature(x86_featureset,
4413                                             X86FSET_AVX512ER);
4414                                         remove_x86_feature(x86_featureset,
4415                                             X86FSET_AVX512CD);
4416                                         remove_x86_feature(x86_featureset,
4417                                             X86FSET_AVX512BW);
4418                                         remove_x86_feature(x86_featureset,
4419                                             X86FSET_AVX512VL);
4420                                         remove_x86_feature(x86_featureset,
4421                                             X86FSET_AVX512FMA);
4422                                         remove_x86_feature(x86_featureset,
4423                                             X86FSET_AVX512VBMI);
4424                                         remove_x86_feature(x86_featureset,
4425                                             X86FSET_AVX512VNNI);
4426                                         remove_x86_feature(x86_featureset,
4427                                             X86FSET_AVX512VPOPCDQ);
4428                                         remove_x86_feature(x86_featureset,
4429                                             X86FSET_AVX512NNIW);
4430                                         remove_x86_feature(x86_featureset,
4431                                             X86FSET_AVX512FMAPS);
4432 
4433                                         CPI_FEATURES_ECX(cpi) &=
4434                                             ~CPUID_INTC_ECX_XSAVE;
4435                                         CPI_FEATURES_ECX(cpi) &=
4436                                             ~CPUID_INTC_ECX_AVX;
4437                                         CPI_FEATURES_ECX(cpi) &=
4438                                             ~CPUID_INTC_ECX_F16C;
4439                                         CPI_FEATURES_ECX(cpi) &=
4440                                             ~CPUID_INTC_ECX_FMA;
4441                                         CPI_FEATURES_7_0_EBX(cpi) &=
4442                                             ~CPUID_INTC_EBX_7_0_BMI1;
4443                                         CPI_FEATURES_7_0_EBX(cpi) &=
4444                                             ~CPUID_INTC_EBX_7_0_BMI2;
4445                                         CPI_FEATURES_7_0_EBX(cpi) &=
4446                                             ~CPUID_INTC_EBX_7_0_AVX2;
4447                                         CPI_FEATURES_7_0_EBX(cpi) &=
4448                                             ~CPUID_INTC_EBX_7_0_MPX;
4449                                         CPI_FEATURES_7_0_EBX(cpi) &=
4450                                             ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4451 
4452                                         CPI_FEATURES_7_0_ECX(cpi) &=
4453                                             ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4454 
4455                                         CPI_FEATURES_7_0_EDX(cpi) &=
4456                                             ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4457 
4458                                         xsave_force_disable = B_TRUE;
4459                                 } else {
4460                                         VERIFY(is_x86_feature(x86_featureset,
4461                                             X86FSET_XSAVE) == B_FALSE);
4462                                 }
4463                         }
4464                 }
4465         }
4466 
4467 
4468         if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4469                 goto pass2_done;
4470 
4471         if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4472                 nmax = NMAX_CPI_EXTD;
4473         /*
4474          * Copy the extended properties, fixing them as we go.
4475          * (We already handled n == 0 and n == 1 in pass 1)
4476          */
4477         iptr = (void *)cpi->cpi_brandstr;
4478         for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4479                 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4480                 (void) __cpuid_insn(cp);
4481                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4482                     cp);
4483                 switch (n) {
4484                 case 2:
4485                 case 3:
4486                 case 4:
4487                         /*
4488                          * Extract the brand string
4489                          */
4490                         *iptr++ = cp->cp_eax;
4491                         *iptr++ = cp->cp_ebx;
4492                         *iptr++ = cp->cp_ecx;
4493                         *iptr++ = cp->cp_edx;
4494                         break;
4495                 case 5:
4496                         switch (cpi->cpi_vendor) {
4497                         case X86_VENDOR_AMD:
4498                                 /*
4499                                  * The Athlon and Duron were the first
4500                                  * parts to report the sizes of the
4501                                  * TLB for large pages. Before then,
4502                                  * we don't trust the data.
4503                                  */
4504                                 if (cpi->cpi_family < 6 ||
4505                                     (cpi->cpi_family == 6 &&
4506                                     cpi->cpi_model < 1))
4507                                         cp->cp_eax = 0;
4508                                 break;
4509                         default:
4510                                 break;
4511                         }
4512                         break;
4513                 case 6:
4514                         switch (cpi->cpi_vendor) {
4515                         case X86_VENDOR_AMD:
4516                                 /*
4517                                  * The Athlon and Duron were the first
4518                                  * AMD parts with L2 TLB's.
4519                                  * Before then, don't trust the data.
4520                                  */
4521                                 if (cpi->cpi_family < 6 ||
4522                                     cpi->cpi_family == 6 &&
4523                                     cpi->cpi_model < 1)
4524                                         cp->cp_eax = cp->cp_ebx = 0;
4525                                 /*
4526                                  * AMD Duron rev A0 reports L2
4527                                  * cache size incorrectly as 1K
4528                                  * when it is really 64K
4529                                  */
4530                                 if (cpi->cpi_family == 6 &&
4531                                     cpi->cpi_model == 3 &&
4532                                     cpi->cpi_step == 0) {
4533                                         cp->cp_ecx &= 0xffff;
4534                                         cp->cp_ecx |= 0x400000;
4535                                 }
4536                                 break;
4537                         case X86_VENDOR_Cyrix:  /* VIA C3 */
4538                                 /*
4539                                  * VIA C3 processors are a bit messed
4540                                  * up w.r.t. encoding cache sizes in %ecx
4541                                  */
4542                                 if (cpi->cpi_family != 6)
4543                                         break;
4544                                 /*
4545                                  * model 7 and 8 were incorrectly encoded
4546                                  *
4547                                  * xxx is model 8 really broken?
4548                                  */
4549                                 if (cpi->cpi_model == 7 ||
4550                                     cpi->cpi_model == 8)
4551                                         cp->cp_ecx =
4552                                             BITX(cp->cp_ecx, 31, 24) << 16 |
4553                                             BITX(cp->cp_ecx, 23, 16) << 12 |
4554                                             BITX(cp->cp_ecx, 15, 8) << 8 |
4555                                             BITX(cp->cp_ecx, 7, 0);
4556                                 /*
4557                                  * model 9 stepping 1 has wrong associativity
4558                                  */
4559                                 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4560                                         cp->cp_ecx |= 8 << 12;
4561                                 break;
4562                         case X86_VENDOR_Intel:
4563                                 /*
4564                                  * Extended L2 Cache features function.
4565                                  * First appeared on Prescott.
4566                                  */
4567                         default:
4568                                 break;
4569                         }
4570                         break;
4571                 default:
4572                         break;
4573                 }
4574         }
4575 
4576 pass2_done:
4577         cpi->cpi_pass = 2;
4578 }
4579 
4580 static const char *
4581 intel_cpubrand(const struct cpuid_info *cpi)
4582 {
4583         int i;
4584 
4585         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4586             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4587                 return ("i486");
4588 
4589         switch (cpi->cpi_family) {
4590         case 5:
4591                 return ("Intel Pentium(r)");
4592         case 6:
4593                 switch (cpi->cpi_model) {
4594                         uint_t celeron, xeon;
4595                         const struct cpuid_regs *cp;
4596                 case 0:
4597                 case 1:
4598                 case 2:
4599                         return ("Intel Pentium(r) Pro");
4600                 case 3:
4601                 case 4:
4602                         return ("Intel Pentium(r) II");
4603                 case 6:
4604                         return ("Intel Celeron(r)");
4605                 case 5:
4606                 case 7:
4607                         celeron = xeon = 0;
4608                         cp = &cpi->cpi_std[2];   /* cache info */
4609 
4610                         for (i = 1; i < 4; i++) {
4611                                 uint_t tmp;
4612 
4613                                 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4614                                 if (tmp == 0x40)
4615                                         celeron++;
4616                                 if (tmp >= 0x44 && tmp <= 0x45)
4617                                         xeon++;
4618                         }
4619 
4620                         for (i = 0; i < 2; i++) {
4621                                 uint_t tmp;
4622 
4623                                 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4624                                 if (tmp == 0x40)
4625                                         celeron++;
4626                                 else if (tmp >= 0x44 && tmp <= 0x45)
4627                                         xeon++;
4628                         }
4629 
4630                         for (i = 0; i < 4; i++) {
4631                                 uint_t tmp;
4632 
4633                                 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4634                                 if (tmp == 0x40)
4635                                         celeron++;
4636                                 else if (tmp >= 0x44 && tmp <= 0x45)
4637                                         xeon++;
4638                         }
4639 
4640                         for (i = 0; i < 4; i++) {
4641                                 uint_t tmp;
4642 
4643                                 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4644                                 if (tmp == 0x40)
4645                                         celeron++;
4646                                 else if (tmp >= 0x44 && tmp <= 0x45)
4647                                         xeon++;
4648                         }
4649 
4650                         if (celeron)
4651                                 return ("Intel Celeron(r)");
4652                         if (xeon)
4653                                 return (cpi->cpi_model == 5 ?
4654                                     "Intel Pentium(r) II Xeon(tm)" :
4655                                     "Intel Pentium(r) III Xeon(tm)");
4656                         return (cpi->cpi_model == 5 ?
4657                             "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4658                             "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4659                 default:
4660                         break;
4661                 }
4662         default:
4663                 break;
4664         }
4665 
4666         /* BrandID is present if the field is nonzero */
4667         if (cpi->cpi_brandid != 0) {
4668                 static const struct {
4669                         uint_t bt_bid;
4670                         const char *bt_str;
4671                 } brand_tbl[] = {
4672                         { 0x1,  "Intel(r) Celeron(r)" },
4673                         { 0x2,  "Intel(r) Pentium(r) III" },
4674                         { 0x3,  "Intel(r) Pentium(r) III Xeon(tm)" },
4675                         { 0x4,  "Intel(r) Pentium(r) III" },
4676                         { 0x6,  "Mobile Intel(r) Pentium(r) III" },
4677                         { 0x7,  "Mobile Intel(r) Celeron(r)" },
4678                         { 0x8,  "Intel(r) Pentium(r) 4" },
4679                         { 0x9,  "Intel(r) Pentium(r) 4" },
4680                         { 0xa,  "Intel(r) Celeron(r)" },
4681                         { 0xb,  "Intel(r) Xeon(tm)" },
4682                         { 0xc,  "Intel(r) Xeon(tm) MP" },
4683                         { 0xe,  "Mobile Intel(r) Pentium(r) 4" },
4684                         { 0xf,  "Mobile Intel(r) Celeron(r)" },
4685                         { 0x11, "Mobile Genuine Intel(r)" },
4686                         { 0x12, "Intel(r) Celeron(r) M" },
4687                         { 0x13, "Mobile Intel(r) Celeron(r)" },
4688                         { 0x14, "Intel(r) Celeron(r)" },
4689                         { 0x15, "Mobile Genuine Intel(r)" },
4690                         { 0x16, "Intel(r) Pentium(r) M" },
4691                         { 0x17, "Mobile Intel(r) Celeron(r)" }
4692                 };
4693                 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4694                 uint_t sgn;
4695 
4696                 sgn = (cpi->cpi_family << 8) |
4697                     (cpi->cpi_model << 4) | cpi->cpi_step;
4698 
4699                 for (i = 0; i < btblmax; i++)
4700                         if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4701                                 break;
4702                 if (i < btblmax) {
4703                         if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4704                                 return ("Intel(r) Celeron(r)");
4705                         if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4706                                 return ("Intel(r) Xeon(tm) MP");
4707                         if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4708                                 return ("Intel(r) Xeon(tm)");
4709                         return (brand_tbl[i].bt_str);
4710                 }
4711         }
4712 
4713         return (NULL);
4714 }
4715 
4716 static const char *
4717 amd_cpubrand(const struct cpuid_info *cpi)
4718 {
4719         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4720             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4721                 return ("i486 compatible");
4722 
4723         switch (cpi->cpi_family) {
4724         case 5:
4725                 switch (cpi->cpi_model) {
4726                 case 0:
4727                 case 1:
4728                 case 2:
4729                 case 3:
4730                 case 4:
4731                 case 5:
4732                         return ("AMD-K5(r)");
4733                 case 6:
4734                 case 7:
4735                         return ("AMD-K6(r)");
4736                 case 8:
4737                         return ("AMD-K6(r)-2");
4738                 case 9:
4739                         return ("AMD-K6(r)-III");
4740                 default:
4741                         return ("AMD (family 5)");
4742                 }
4743         case 6:
4744                 switch (cpi->cpi_model) {
4745                 case 1:
4746                         return ("AMD-K7(tm)");
4747                 case 0:
4748                 case 2:
4749                 case 4:
4750                         return ("AMD Athlon(tm)");
4751                 case 3:
4752                 case 7:
4753                         return ("AMD Duron(tm)");
4754                 case 6:
4755                 case 8:
4756                 case 10:
4757                         /*
4758                          * Use the L2 cache size to distinguish
4759                          */
4760                         return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4761                             "AMD Athlon(tm)" : "AMD Duron(tm)");
4762                 default:
4763                         return ("AMD (family 6)");
4764                 }
4765         default:
4766                 break;
4767         }
4768 
4769         if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4770             cpi->cpi_brandid != 0) {
4771                 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4772                 case 3:
4773                         return ("AMD Opteron(tm) UP 1xx");
4774                 case 4:
4775                         return ("AMD Opteron(tm) DP 2xx");
4776                 case 5:
4777                         return ("AMD Opteron(tm) MP 8xx");
4778                 default:
4779                         return ("AMD Opteron(tm)");
4780                 }
4781         }
4782 
4783         return (NULL);
4784 }
4785 
4786 static const char *
4787 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4788 {
4789         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4790             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4791             type == X86_TYPE_CYRIX_486)
4792                 return ("i486 compatible");
4793 
4794         switch (type) {
4795         case X86_TYPE_CYRIX_6x86:
4796                 return ("Cyrix 6x86");
4797         case X86_TYPE_CYRIX_6x86L:
4798                 return ("Cyrix 6x86L");
4799         case X86_TYPE_CYRIX_6x86MX:
4800                 return ("Cyrix 6x86MX");
4801         case X86_TYPE_CYRIX_GXm:
4802                 return ("Cyrix GXm");
4803         case X86_TYPE_CYRIX_MediaGX:
4804                 return ("Cyrix MediaGX");
4805         case X86_TYPE_CYRIX_MII:
4806                 return ("Cyrix M2");
4807         case X86_TYPE_VIA_CYRIX_III:
4808                 return ("VIA Cyrix M3");
4809         default:
4810                 /*
4811                  * Have another wild guess ..
4812                  */
4813                 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4814                         return ("Cyrix 5x86");
4815                 else if (cpi->cpi_family == 5) {
4816                         switch (cpi->cpi_model) {
4817                         case 2:
4818                                 return ("Cyrix 6x86");  /* Cyrix M1 */
4819                         case 4:
4820                                 return ("Cyrix MediaGX");
4821                         default:
4822                                 break;
4823                         }
4824                 } else if (cpi->cpi_family == 6) {
4825                         switch (cpi->cpi_model) {
4826                         case 0:
4827                                 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4828                         case 5:
4829                         case 6:
4830                         case 7:
4831                         case 8:
4832                         case 9:
4833                                 return ("VIA C3");
4834                         default:
4835                                 break;
4836                         }
4837                 }
4838                 break;
4839         }
4840         return (NULL);
4841 }
4842 
4843 /*
4844  * This only gets called in the case that the CPU extended
4845  * feature brand string (0x80000002, 0x80000003, 0x80000004)
4846  * aren't available, or contain null bytes for some reason.
4847  */
4848 static void
4849 fabricate_brandstr(struct cpuid_info *cpi)
4850 {
4851         const char *brand = NULL;
4852 
4853         switch (cpi->cpi_vendor) {
4854         case X86_VENDOR_Intel:
4855                 brand = intel_cpubrand(cpi);
4856                 break;
4857         case X86_VENDOR_AMD:
4858                 brand = amd_cpubrand(cpi);
4859                 break;
4860         case X86_VENDOR_Cyrix:
4861                 brand = cyrix_cpubrand(cpi, x86_type);
4862                 break;
4863         case X86_VENDOR_NexGen:
4864                 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4865                         brand = "NexGen Nx586";
4866                 break;
4867         case X86_VENDOR_Centaur:
4868                 if (cpi->cpi_family == 5)
4869                         switch (cpi->cpi_model) {
4870                         case 4:
4871                                 brand = "Centaur C6";
4872                                 break;
4873                         case 8:
4874                                 brand = "Centaur C2";
4875                                 break;
4876                         case 9:
4877                                 brand = "Centaur C3";
4878                                 break;
4879                         default:
4880                                 break;
4881                         }
4882                 break;
4883         case X86_VENDOR_Rise:
4884                 if (cpi->cpi_family == 5 &&
4885                     (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4886                         brand = "Rise mP6";
4887                 break;
4888         case X86_VENDOR_SiS:
4889                 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4890                         brand = "SiS 55x";
4891                 break;
4892         case X86_VENDOR_TM:
4893                 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4894                         brand = "Transmeta Crusoe TM3x00 or TM5x00";
4895                 break;
4896         case X86_VENDOR_NSC:
4897         case X86_VENDOR_UMC:
4898         default:
4899                 break;
4900         }
4901         if (brand) {
4902                 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4903                 return;
4904         }
4905 
4906         /*
4907          * If all else fails ...
4908          */
4909         (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4910             "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4911             cpi->cpi_model, cpi->cpi_step);
4912 }
4913 
4914 /*
4915  * This routine is called just after kernel memory allocation
4916  * becomes available on cpu0, and as part of mp_startup() on
4917  * the other cpus.
4918  *
4919  * Fixup the brand string, and collect any information from cpuid
4920  * that requires dynamically allocated storage to represent.
4921  */
4922 /*ARGSUSED*/
4923 void
4924 cpuid_pass3(cpu_t *cpu)
4925 {
4926         int     i, max, shft, level, size;
4927         struct cpuid_regs regs;
4928         struct cpuid_regs *cp;
4929         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4930 
4931         ASSERT(cpi->cpi_pass == 2);
4932 
4933         /*
4934          * Deterministic cache parameters
4935          *
4936          * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4937          * values that are present are currently defined to be the same. This
4938          * means we can use the same logic to parse it as long as we use the
4939          * appropriate leaf to get the data. If you're updating this, make sure
4940          * you're careful about which vendor supports which aspect.
4941          *
4942          * Take this opportunity to detect the number of threads sharing the
4943          * last level cache, and construct a corresponding cache id. The
4944          * respective cpuid_info members are initialized to the default case of
4945          * "no last level cache sharing".
4946          */
4947         cpi->cpi_ncpu_shr_last_cache = 1;
4948         cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4949 
4950         if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4951             (cpi->cpi_vendor == X86_VENDOR_AMD &&
4952             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4953             is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4954                 uint32_t leaf;
4955 
4956                 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4957                         leaf = 4;
4958                 } else {
4959                         leaf = CPUID_LEAF_EXT_1d;
4960                 }
4961 
4962                 /*
4963                  * Find the # of elements (size) returned by the leaf and along
4964                  * the way detect last level cache sharing details.
4965                  */
4966                 bzero(&regs, sizeof (regs));
4967                 cp = &regs;
4968                 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4969                         cp->cp_eax = leaf;
4970                         cp->cp_ecx = i;
4971 
4972                         (void) __cpuid_insn(cp);
4973 
4974                         if (CPI_CACHE_TYPE(cp) == 0)
4975                                 break;
4976                         level = CPI_CACHE_LVL(cp);
4977                         if (level > max) {
4978                                 max = level;
4979                                 cpi->cpi_ncpu_shr_last_cache =
4980                                     CPI_NTHR_SHR_CACHE(cp) + 1;
4981                         }
4982                 }
4983                 cpi->cpi_cache_leaf_size = size = i;
4984 
4985                 /*
4986                  * Allocate the cpi_cache_leaves array. The first element
4987                  * references the regs for the corresponding leaf with %ecx set
4988                  * to 0. This was gathered in cpuid_pass2().
4989                  */
4990                 if (size > 0) {
4991                         cpi->cpi_cache_leaves =
4992                             kmem_alloc(size * sizeof (cp), KM_SLEEP);
4993                         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4994                                 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4995                         } else {
4996                                 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4997                         }
4998 
4999                         /*
5000                          * Allocate storage to hold the additional regs
5001                          * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5002                          *
5003                          * The regs for the leaf, %ecx == 0 has already
5004                          * been allocated as indicated above.
5005                          */
5006                         for (i = 1; i < size; i++) {
5007                                 cp = cpi->cpi_cache_leaves[i] =
5008                                     kmem_zalloc(sizeof (regs), KM_SLEEP);
5009                                 cp->cp_eax = leaf;
5010                                 cp->cp_ecx = i;
5011 
5012                                 (void) __cpuid_insn(cp);
5013                         }
5014                 }
5015                 /*
5016                  * Determine the number of bits needed to represent
5017                  * the number of CPUs sharing the last level cache.
5018                  *
5019                  * Shift off that number of bits from the APIC id to
5020                  * derive the cache id.
5021                  */
5022                 shft = 0;
5023                 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5024                         shft++;
5025                 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5026         }
5027 
5028         /*
5029          * Now fixup the brand string
5030          */
5031         if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5032                 fabricate_brandstr(cpi);
5033         } else {
5034 
5035                 /*
5036                  * If we successfully extracted a brand string from the cpuid
5037                  * instruction, clean it up by removing leading spaces and
5038                  * similar junk.
5039                  */
5040                 if (cpi->cpi_brandstr[0]) {
5041                         size_t maxlen = sizeof (cpi->cpi_brandstr);
5042                         char *src, *dst;
5043 
5044                         dst = src = (char *)cpi->cpi_brandstr;
5045                         src[maxlen - 1] = '\0';
5046                         /*
5047                          * strip leading spaces
5048                          */
5049                         while (*src == ' ')
5050                                 src++;
5051                         /*
5052                          * Remove any 'Genuine' or "Authentic" prefixes
5053                          */
5054                         if (strncmp(src, "Genuine ", 8) == 0)
5055                                 src += 8;
5056                         if (strncmp(src, "Authentic ", 10) == 0)
5057                                 src += 10;
5058 
5059                         /*
5060                          * Now do an in-place copy.
5061                          * Map (R) to (r) and (TM) to (tm).
5062                          * The era of teletypes is long gone, and there's
5063                          * -really- no need to shout.
5064                          */
5065                         while (*src != '\0') {
5066                                 if (src[0] == '(') {
5067                                         if (strncmp(src + 1, "R)", 2) == 0) {
5068                                                 (void) strncpy(dst, "(r)", 3);
5069                                                 src += 3;
5070                                                 dst += 3;
5071                                                 continue;
5072                                         }
5073                                         if (strncmp(src + 1, "TM)", 3) == 0) {
5074                                                 (void) strncpy(dst, "(tm)", 4);
5075                                                 src += 4;
5076                                                 dst += 4;
5077                                                 continue;
5078                                         }
5079                                 }
5080                                 *dst++ = *src++;
5081                         }
5082                         *dst = '\0';
5083 
5084                         /*
5085                          * Finally, remove any trailing spaces
5086                          */
5087                         while (--dst > cpi->cpi_brandstr)
5088                                 if (*dst == ' ')
5089                                         *dst = '\0';
5090                                 else
5091                                         break;
5092                 } else
5093                         fabricate_brandstr(cpi);
5094         }
5095         cpi->cpi_pass = 3;
5096 }
5097 
5098 /*
5099  * This routine is called out of bind_hwcap() much later in the life
5100  * of the kernel (post_startup()).  The job of this routine is to resolve
5101  * the hardware feature support and kernel support for those features into
5102  * what we're actually going to tell applications via the aux vector.
5103  */
5104 void
5105 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
5106 {
5107         struct cpuid_info *cpi;
5108         uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
5109 
5110         if (cpu == NULL)
5111                 cpu = CPU;
5112         cpi = cpu->cpu_m.mcpu_cpi;
5113 
5114         ASSERT(cpi->cpi_pass == 3);
5115 
5116         if (cpi->cpi_maxeax >= 1) {
5117                 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5118                 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5119                 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
5120 
5121                 *edx = CPI_FEATURES_EDX(cpi);
5122                 *ecx = CPI_FEATURES_ECX(cpi);
5123                 *ebx = CPI_FEATURES_7_0_EBX(cpi);
5124 
5125                 /*
5126                  * [these require explicit kernel support]
5127                  */
5128                 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
5129                         *edx &= ~CPUID_INTC_EDX_SEP;
5130 
5131                 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
5132                         *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
5133                 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
5134                         *edx &= ~CPUID_INTC_EDX_SSE2;
5135 
5136                 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
5137                         *edx &= ~CPUID_INTC_EDX_HTT;
5138 
5139                 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
5140                         *ecx &= ~CPUID_INTC_ECX_SSE3;
5141 
5142                 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
5143                         *ecx &= ~CPUID_INTC_ECX_SSSE3;
5144                 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
5145                         *ecx &= ~CPUID_INTC_ECX_SSE4_1;
5146                 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
5147                         *ecx &= ~CPUID_INTC_ECX_SSE4_2;
5148                 if (!is_x86_feature(x86_featureset, X86FSET_AES))
5149                         *ecx &= ~CPUID_INTC_ECX_AES;
5150                 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
5151                         *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
5152                 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
5153                         *ecx &= ~(CPUID_INTC_ECX_XSAVE |
5154                             CPUID_INTC_ECX_OSXSAVE);
5155                 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
5156                         *ecx &= ~CPUID_INTC_ECX_AVX;
5157                 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
5158                         *ecx &= ~CPUID_INTC_ECX_F16C;
5159                 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
5160                         *ecx &= ~CPUID_INTC_ECX_FMA;
5161                 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
5162                         *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
5163                 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
5164                         *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
5165                 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
5166                         *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
5167                 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
5168                         *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
5169                 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
5170                         *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
5171 
5172                 /*
5173                  * [no explicit support required beyond x87 fp context]
5174                  */
5175                 if (!fpu_exists)
5176                         *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5177 
5178                 /*
5179                  * Now map the supported feature vector to things that we
5180                  * think userland will care about.
5181                  */
5182                 if (*edx & CPUID_INTC_EDX_SEP)
5183                         hwcap_flags |= AV_386_SEP;
5184                 if (*edx & CPUID_INTC_EDX_SSE)
5185                         hwcap_flags |= AV_386_FXSR | AV_386_SSE;
5186                 if (*edx & CPUID_INTC_EDX_SSE2)
5187                         hwcap_flags |= AV_386_SSE2;
5188                 if (*ecx & CPUID_INTC_ECX_SSE3)
5189                         hwcap_flags |= AV_386_SSE3;
5190                 if (*ecx & CPUID_INTC_ECX_SSSE3)
5191                         hwcap_flags |= AV_386_SSSE3;
5192                 if (*ecx & CPUID_INTC_ECX_SSE4_1)
5193                         hwcap_flags |= AV_386_SSE4_1;
5194                 if (*ecx & CPUID_INTC_ECX_SSE4_2)
5195                         hwcap_flags |= AV_386_SSE4_2;
5196                 if (*ecx & CPUID_INTC_ECX_MOVBE)
5197                         hwcap_flags |= AV_386_MOVBE;
5198                 if (*ecx & CPUID_INTC_ECX_AES)
5199                         hwcap_flags |= AV_386_AES;
5200                 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5201                         hwcap_flags |= AV_386_PCLMULQDQ;
5202                 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5203                     (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5204                         hwcap_flags |= AV_386_XSAVE;
5205 
5206                         if (*ecx & CPUID_INTC_ECX_AVX) {
5207                                 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5208                                 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5209 
5210                                 hwcap_flags |= AV_386_AVX;
5211                                 if (*ecx & CPUID_INTC_ECX_F16C)
5212                                         hwcap_flags_2 |= AV_386_2_F16C;
5213                                 if (*ecx & CPUID_INTC_ECX_FMA)
5214                                         hwcap_flags_2 |= AV_386_2_FMA;
5215 
5216                                 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5217                                         hwcap_flags_2 |= AV_386_2_BMI1;
5218                                 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5219                                         hwcap_flags_2 |= AV_386_2_BMI2;
5220                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5221                                         hwcap_flags_2 |= AV_386_2_AVX2;
5222                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5223                                         hwcap_flags_2 |= AV_386_2_AVX512F;
5224                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5225                                         hwcap_flags_2 |= AV_386_2_AVX512DQ;
5226                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5227                                         hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5228                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5229                                         hwcap_flags_2 |= AV_386_2_AVX512PF;
5230                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5231                                         hwcap_flags_2 |= AV_386_2_AVX512ER;
5232                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5233                                         hwcap_flags_2 |= AV_386_2_AVX512CD;
5234                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5235                                         hwcap_flags_2 |= AV_386_2_AVX512BW;
5236                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5237                                         hwcap_flags_2 |= AV_386_2_AVX512VL;
5238 
5239                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5240                                         hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5241                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5242                                         hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5243                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5244                                         hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5245 
5246                                 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5247                                         hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5248                                 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5249                                         hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5250                         }
5251                 }
5252                 if (*ecx & CPUID_INTC_ECX_VMX)
5253                         hwcap_flags |= AV_386_VMX;
5254                 if (*ecx & CPUID_INTC_ECX_POPCNT)
5255                         hwcap_flags |= AV_386_POPCNT;
5256                 if (*edx & CPUID_INTC_EDX_FPU)
5257                         hwcap_flags |= AV_386_FPU;
5258                 if (*edx & CPUID_INTC_EDX_MMX)
5259                         hwcap_flags |= AV_386_MMX;
5260 
5261                 if (*edx & CPUID_INTC_EDX_TSC)
5262                         hwcap_flags |= AV_386_TSC;
5263                 if (*edx & CPUID_INTC_EDX_CX8)
5264                         hwcap_flags |= AV_386_CX8;
5265                 if (*edx & CPUID_INTC_EDX_CMOV)
5266                         hwcap_flags |= AV_386_CMOV;
5267                 if (*ecx & CPUID_INTC_ECX_CX16)
5268                         hwcap_flags |= AV_386_CX16;
5269 
5270                 if (*ecx & CPUID_INTC_ECX_RDRAND)
5271                         hwcap_flags_2 |= AV_386_2_RDRAND;
5272                 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5273                         hwcap_flags_2 |= AV_386_2_ADX;
5274                 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5275                         hwcap_flags_2 |= AV_386_2_RDSEED;
5276                 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5277                         hwcap_flags_2 |= AV_386_2_SHA;
5278                 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5279                         hwcap_flags_2 |= AV_386_2_FSGSBASE;
5280                 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5281                         hwcap_flags_2 |= AV_386_2_CLWB;
5282                 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5283                         hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5284 
5285         }
5286         /*
5287          * Check a few miscellaneous features.
5288          */
5289         if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5290                 hwcap_flags_2 |= AV_386_2_CLZERO;
5291 
5292         if (cpi->cpi_xmaxeax < 0x80000001)
5293                 goto pass4_done;
5294 
5295         switch (cpi->cpi_vendor) {
5296                 struct cpuid_regs cp;
5297                 uint32_t *edx, *ecx;
5298 
5299         case X86_VENDOR_Intel:
5300                 /*
5301                  * Seems like Intel duplicated what was necessary
5302                  * here to make the initial crop of 64-bit OS's work.
5303                  * Hopefully, those are the only "extended" bits
5304                  * they'll add.
5305                  */
5306                 /*FALLTHROUGH*/
5307 
5308         case X86_VENDOR_AMD:
5309                 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5310                 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5311 
5312                 *edx = CPI_FEATURES_XTD_EDX(cpi);
5313                 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5314 
5315                 /*
5316                  * [these features require explicit kernel support]
5317                  */
5318                 switch (cpi->cpi_vendor) {
5319                 case X86_VENDOR_Intel:
5320                         if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5321                                 *edx &= ~CPUID_AMD_EDX_TSCP;
5322                         break;
5323 
5324                 case X86_VENDOR_AMD:
5325                         if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5326                                 *edx &= ~CPUID_AMD_EDX_TSCP;
5327                         if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5328                                 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5329                         break;
5330 
5331                 default:
5332                         break;
5333                 }
5334 
5335                 /*
5336                  * [no explicit support required beyond
5337                  * x87 fp context and exception handlers]
5338                  */
5339                 if (!fpu_exists)
5340                         *edx &= ~(CPUID_AMD_EDX_MMXamd |
5341                             CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5342 
5343                 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5344                         *edx &= ~CPUID_AMD_EDX_NX;
5345 #if !defined(__amd64)
5346                 *edx &= ~CPUID_AMD_EDX_LM;
5347 #endif
5348                 /*
5349                  * Now map the supported feature vector to
5350                  * things that we think userland will care about.
5351                  */
5352 #if defined(__amd64)
5353                 if (*edx & CPUID_AMD_EDX_SYSC)
5354                         hwcap_flags |= AV_386_AMD_SYSC;
5355 #endif
5356                 if (*edx & CPUID_AMD_EDX_MMXamd)
5357                         hwcap_flags |= AV_386_AMD_MMX;
5358                 if (*edx & CPUID_AMD_EDX_3DNow)
5359                         hwcap_flags |= AV_386_AMD_3DNow;
5360                 if (*edx & CPUID_AMD_EDX_3DNowx)
5361                         hwcap_flags |= AV_386_AMD_3DNowx;
5362                 if (*ecx & CPUID_AMD_ECX_SVM)
5363                         hwcap_flags |= AV_386_AMD_SVM;
5364 
5365                 switch (cpi->cpi_vendor) {
5366                 case X86_VENDOR_AMD:
5367                         if (*edx & CPUID_AMD_EDX_TSCP)
5368                                 hwcap_flags |= AV_386_TSCP;
5369                         if (*ecx & CPUID_AMD_ECX_AHF64)
5370                                 hwcap_flags |= AV_386_AHF;
5371                         if (*ecx & CPUID_AMD_ECX_SSE4A)
5372                                 hwcap_flags |= AV_386_AMD_SSE4A;
5373                         if (*ecx & CPUID_AMD_ECX_LZCNT)
5374                                 hwcap_flags |= AV_386_AMD_LZCNT;
5375                         if (*ecx & CPUID_AMD_ECX_MONITORX)
5376                                 hwcap_flags_2 |= AV_386_2_MONITORX;
5377                         break;
5378 
5379                 case X86_VENDOR_Intel:
5380                         if (*edx & CPUID_AMD_EDX_TSCP)
5381                                 hwcap_flags |= AV_386_TSCP;
5382                         if (*ecx & CPUID_AMD_ECX_LZCNT)
5383                                 hwcap_flags |= AV_386_AMD_LZCNT;
5384                         /*
5385                          * Aarrgh.
5386                          * Intel uses a different bit in the same word.
5387                          */
5388                         if (*ecx & CPUID_INTC_ECX_AHF64)
5389                                 hwcap_flags |= AV_386_AHF;
5390                         break;
5391 
5392                 default:
5393                         break;
5394                 }
5395                 break;
5396 
5397         case X86_VENDOR_TM:
5398                 cp.cp_eax = 0x80860001;
5399                 (void) __cpuid_insn(&cp);
5400                 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5401                 break;
5402 
5403         default:
5404                 break;
5405         }
5406 
5407 pass4_done:
5408         cpi->cpi_pass = 4;
5409         if (hwcap_out != NULL) {
5410                 hwcap_out[0] = hwcap_flags;
5411                 hwcap_out[1] = hwcap_flags_2;
5412         }
5413 }
5414 
5415 
5416 /*
5417  * Simulate the cpuid instruction using the data we previously
5418  * captured about this CPU.  We try our best to return the truth
5419  * about the hardware, independently of kernel support.
5420  */
5421 uint32_t
5422 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5423 {
5424         struct cpuid_info *cpi;
5425         struct cpuid_regs *xcp;
5426 
5427         if (cpu == NULL)
5428                 cpu = CPU;
5429         cpi = cpu->cpu_m.mcpu_cpi;
5430 
5431         ASSERT(cpuid_checkpass(cpu, 3));
5432 
5433         /*
5434          * CPUID data is cached in two separate places: cpi_std for standard
5435          * CPUID leaves , and cpi_extd for extended CPUID leaves.
5436          */
5437         if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5438                 xcp = &cpi->cpi_std[cp->cp_eax];
5439         } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5440             cp->cp_eax <= cpi->cpi_xmaxeax &&
5441             cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5442                 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5443         } else {
5444                 /*
5445                  * The caller is asking for data from an input parameter which
5446                  * the kernel has not cached.  In this case we go fetch from
5447                  * the hardware and return the data directly to the user.
5448                  */
5449                 return (__cpuid_insn(cp));
5450         }
5451 
5452         cp->cp_eax = xcp->cp_eax;
5453         cp->cp_ebx = xcp->cp_ebx;
5454         cp->cp_ecx = xcp->cp_ecx;
5455         cp->cp_edx = xcp->cp_edx;
5456         return (cp->cp_eax);
5457 }
5458 
5459 int
5460 cpuid_checkpass(cpu_t *cpu, int pass)
5461 {
5462         return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5463             cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5464 }
5465 
5466 int
5467 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5468 {
5469         ASSERT(cpuid_checkpass(cpu, 3));
5470 
5471         return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5472 }
5473 
5474 int
5475 cpuid_is_cmt(cpu_t *cpu)
5476 {
5477         if (cpu == NULL)
5478                 cpu = CPU;
5479 
5480         ASSERT(cpuid_checkpass(cpu, 1));
5481 
5482         return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5483 }
5484 
5485 /*
5486  * AMD and Intel both implement the 64-bit variant of the syscall
5487  * instruction (syscallq), so if there's -any- support for syscall,
5488  * cpuid currently says "yes, we support this".
5489  *
5490  * However, Intel decided to -not- implement the 32-bit variant of the
5491  * syscall instruction, so we provide a predicate to allow our caller
5492  * to test that subtlety here.
5493  *
5494  * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5495  *      even in the case where the hardware would in fact support it.
5496  */
5497 /*ARGSUSED*/
5498 int
5499 cpuid_syscall32_insn(cpu_t *cpu)
5500 {
5501         ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5502 
5503 #if !defined(__xpv)
5504         if (cpu == NULL)
5505                 cpu = CPU;
5506 
5507         /*CSTYLED*/
5508         {
5509                 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5510 
5511                 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5512                     cpi->cpi_xmaxeax >= 0x80000001 &&
5513                     (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5514                         return (1);
5515         }
5516 #endif
5517         return (0);
5518 }
5519 
5520 int
5521 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5522 {
5523         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5524 
5525         static const char fmt[] =
5526             "x86 (%s %X family %d model %d step %d clock %d MHz)";
5527         static const char fmt_ht[] =
5528             "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5529 
5530         ASSERT(cpuid_checkpass(cpu, 1));
5531 
5532         if (cpuid_is_cmt(cpu))
5533                 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5534                     cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5535                     cpi->cpi_family, cpi->cpi_model,
5536                     cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5537         return (snprintf(s, n, fmt,
5538             cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5539             cpi->cpi_family, cpi->cpi_model,
5540             cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5541 }
5542 
5543 const char *
5544 cpuid_getvendorstr(cpu_t *cpu)
5545 {
5546         ASSERT(cpuid_checkpass(cpu, 1));
5547         return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5548 }
5549 
5550 uint_t
5551 cpuid_getvendor(cpu_t *cpu)
5552 {
5553         ASSERT(cpuid_checkpass(cpu, 1));
5554         return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5555 }
5556 
5557 uint_t
5558 cpuid_getfamily(cpu_t *cpu)
5559 {
5560         ASSERT(cpuid_checkpass(cpu, 1));
5561         return (cpu->cpu_m.mcpu_cpi->cpi_family);
5562 }
5563 
5564 uint_t
5565 cpuid_getmodel(cpu_t *cpu)
5566 {
5567         ASSERT(cpuid_checkpass(cpu, 1));
5568         return (cpu->cpu_m.mcpu_cpi->cpi_model);
5569 }
5570 
5571 uint_t
5572 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5573 {
5574         ASSERT(cpuid_checkpass(cpu, 1));
5575         return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5576 }
5577 
5578 uint_t
5579 cpuid_get_ncore_per_chip(cpu_t *cpu)
5580 {
5581         ASSERT(cpuid_checkpass(cpu, 1));
5582         return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5583 }
5584 
5585 uint_t
5586 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5587 {
5588         ASSERT(cpuid_checkpass(cpu, 2));
5589         return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5590 }
5591 
5592 id_t
5593 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5594 {
5595         ASSERT(cpuid_checkpass(cpu, 2));
5596         return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5597 }
5598 
5599 uint_t
5600 cpuid_getstep(cpu_t *cpu)
5601 {
5602         ASSERT(cpuid_checkpass(cpu, 1));
5603         return (cpu->cpu_m.mcpu_cpi->cpi_step);
5604 }
5605 
5606 uint_t
5607 cpuid_getsig(struct cpu *cpu)
5608 {
5609         ASSERT(cpuid_checkpass(cpu, 1));
5610         return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5611 }
5612 
5613 uint32_t
5614 cpuid_getchiprev(struct cpu *cpu)
5615 {
5616         ASSERT(cpuid_checkpass(cpu, 1));
5617         return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5618 }
5619 
5620 const char *
5621 cpuid_getchiprevstr(struct cpu *cpu)
5622 {
5623         ASSERT(cpuid_checkpass(cpu, 1));
5624         return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5625 }
5626 
5627 uint32_t
5628 cpuid_getsockettype(struct cpu *cpu)
5629 {
5630         ASSERT(cpuid_checkpass(cpu, 1));
5631         return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5632 }
5633 
5634 const char *
5635 cpuid_getsocketstr(cpu_t *cpu)
5636 {
5637         static const char *socketstr = NULL;
5638         struct cpuid_info *cpi;
5639 
5640         ASSERT(cpuid_checkpass(cpu, 1));
5641         cpi = cpu->cpu_m.mcpu_cpi;
5642 
5643         /* Assume that socket types are the same across the system */
5644         if (socketstr == NULL)
5645                 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5646                     cpi->cpi_model, cpi->cpi_step);
5647 
5648 
5649         return (socketstr);
5650 }
5651 
5652 int
5653 cpuid_get_chipid(cpu_t *cpu)
5654 {
5655         ASSERT(cpuid_checkpass(cpu, 1));
5656 
5657         if (cpuid_is_cmt(cpu))
5658                 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5659         return (cpu->cpu_id);
5660 }
5661 
5662 id_t
5663 cpuid_get_coreid(cpu_t *cpu)
5664 {
5665         ASSERT(cpuid_checkpass(cpu, 1));
5666         return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5667 }
5668 
5669 int
5670 cpuid_get_pkgcoreid(cpu_t *cpu)
5671 {
5672         ASSERT(cpuid_checkpass(cpu, 1));
5673         return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5674 }
5675 
5676 int
5677 cpuid_get_clogid(cpu_t *cpu)
5678 {
5679         ASSERT(cpuid_checkpass(cpu, 1));
5680         return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5681 }
5682 
5683 int
5684 cpuid_get_cacheid(cpu_t *cpu)
5685 {
5686         ASSERT(cpuid_checkpass(cpu, 1));
5687         return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5688 }
5689 
5690 uint_t
5691 cpuid_get_procnodeid(cpu_t *cpu)
5692 {
5693         ASSERT(cpuid_checkpass(cpu, 1));
5694         return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5695 }
5696 
5697 uint_t
5698 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5699 {
5700         ASSERT(cpuid_checkpass(cpu, 1));
5701         return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5702 }
5703 
5704 uint_t
5705 cpuid_get_compunitid(cpu_t *cpu)
5706 {
5707         ASSERT(cpuid_checkpass(cpu, 1));
5708         return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5709 }
5710 
5711 uint_t
5712 cpuid_get_cores_per_compunit(cpu_t *cpu)
5713 {
5714         ASSERT(cpuid_checkpass(cpu, 1));
5715         return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5716 }
5717 
5718 /*ARGSUSED*/
5719 int
5720 cpuid_have_cr8access(cpu_t *cpu)
5721 {
5722 #if defined(__amd64)
5723         return (1);
5724 #else
5725         struct cpuid_info *cpi;
5726 
5727         ASSERT(cpu != NULL);
5728         cpi = cpu->cpu_m.mcpu_cpi;
5729         if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5730             (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5731                 return (1);
5732         return (0);
5733 #endif
5734 }
5735 
5736 uint32_t
5737 cpuid_get_apicid(cpu_t *cpu)
5738 {
5739         ASSERT(cpuid_checkpass(cpu, 1));
5740         if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5741                 return (UINT32_MAX);
5742         } else {
5743                 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5744         }
5745 }
5746 
5747 void
5748 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5749 {
5750         struct cpuid_info *cpi;
5751 
5752         if (cpu == NULL)
5753                 cpu = CPU;
5754         cpi = cpu->cpu_m.mcpu_cpi;
5755 
5756         ASSERT(cpuid_checkpass(cpu, 1));
5757 
5758         if (pabits)
5759                 *pabits = cpi->cpi_pabits;
5760         if (vabits)
5761                 *vabits = cpi->cpi_vabits;
5762 }
5763 
5764 size_t
5765 cpuid_get_xsave_size()
5766 {
5767         return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5768             sizeof (struct xsave_state)));
5769 }
5770 
5771 /*
5772  * Return true if the CPUs on this system require 'pointer clearing' for the
5773  * floating point error pointer exception handling. In the past, this has been
5774  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5775  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5776  * feature bit and is reflected in the cpi_fp_amd_save member.
5777  */
5778 boolean_t
5779 cpuid_need_fp_excp_handling()
5780 {
5781         return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5782             cpuid_info0.cpi_fp_amd_save != 0);
5783 }
5784 
5785 /*
5786  * Returns the number of data TLB entries for a corresponding
5787  * pagesize.  If it can't be computed, or isn't known, the
5788  * routine returns zero.  If you ask about an architecturally
5789  * impossible pagesize, the routine will panic (so that the
5790  * hat implementor knows that things are inconsistent.)
5791  */
5792 uint_t
5793 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5794 {
5795         struct cpuid_info *cpi;
5796         uint_t dtlb_nent = 0;
5797 
5798         if (cpu == NULL)
5799                 cpu = CPU;
5800         cpi = cpu->cpu_m.mcpu_cpi;
5801 
5802         ASSERT(cpuid_checkpass(cpu, 1));
5803 
5804         /*
5805          * Check the L2 TLB info
5806          */
5807         if (cpi->cpi_xmaxeax >= 0x80000006) {
5808                 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5809 
5810                 switch (pagesize) {
5811 
5812                 case 4 * 1024:
5813                         /*
5814                          * All zero in the top 16 bits of the register
5815                          * indicates a unified TLB. Size is in low 16 bits.
5816                          */
5817                         if ((cp->cp_ebx & 0xffff0000) == 0)
5818                                 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5819                         else
5820                                 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5821                         break;
5822 
5823                 case 2 * 1024 * 1024:
5824                         if ((cp->cp_eax & 0xffff0000) == 0)
5825                                 dtlb_nent = cp->cp_eax & 0x0000ffff;
5826                         else
5827                                 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5828                         break;
5829 
5830                 default:
5831                         panic("unknown L2 pagesize");
5832                         /*NOTREACHED*/
5833                 }
5834         }
5835 
5836         if (dtlb_nent != 0)
5837                 return (dtlb_nent);
5838 
5839         /*
5840          * No L2 TLB support for this size, try L1.
5841          */
5842         if (cpi->cpi_xmaxeax >= 0x80000005) {
5843                 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5844 
5845                 switch (pagesize) {
5846                 case 4 * 1024:
5847                         dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5848                         break;
5849                 case 2 * 1024 * 1024:
5850                         dtlb_nent = BITX(cp->cp_eax, 23, 16);
5851                         break;
5852                 default:
5853                         panic("unknown L1 d-TLB pagesize");
5854                         /*NOTREACHED*/
5855                 }
5856         }
5857 
5858         return (dtlb_nent);
5859 }
5860 
5861 /*
5862  * Return 0 if the erratum is not present or not applicable, positive
5863  * if it is, and negative if the status of the erratum is unknown.
5864  *
5865  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5866  * Processors" #25759, Rev 3.57, August 2005
5867  */
5868 int
5869 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5870 {
5871         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5872         uint_t eax;
5873 
5874         /*
5875          * Bail out if this CPU isn't an AMD CPU, or if it's
5876          * a legacy (32-bit) AMD CPU.
5877          */
5878         if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5879             cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5880             cpi->cpi_family == 6) {
5881                 return (0);
5882         }
5883 
5884         eax = cpi->cpi_std[1].cp_eax;
5885 
5886 #define SH_B0(eax)      (eax == 0xf40 || eax == 0xf50)
5887 #define SH_B3(eax)      (eax == 0xf51)
5888 #define B(eax)          (SH_B0(eax) || SH_B3(eax))
5889 
5890 #define SH_C0(eax)      (eax == 0xf48 || eax == 0xf58)
5891 
5892 #define SH_CG(eax)      (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5893 #define DH_CG(eax)      (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5894 #define CH_CG(eax)      (eax == 0xf82 || eax == 0xfb2)
5895 #define CG(eax)         (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5896 
5897 #define SH_D0(eax)      (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5898 #define DH_D0(eax)      (eax == 0x10fc0 || eax == 0x10ff0)
5899 #define CH_D0(eax)      (eax == 0x10f80 || eax == 0x10fb0)
5900 #define D0(eax)         (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5901 
5902 #define SH_E0(eax)      (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5903 #define JH_E1(eax)      (eax == 0x20f10)        /* JH8_E0 had 0x20f30 */
5904 #define DH_E3(eax)      (eax == 0x20fc0 || eax == 0x20ff0)
5905 #define SH_E4(eax)      (eax == 0x20f51 || eax == 0x20f71)
5906 #define BH_E4(eax)      (eax == 0x20fb1)
5907 #define SH_E5(eax)      (eax == 0x20f42)
5908 #define DH_E6(eax)      (eax == 0x20ff2 || eax == 0x20fc2)
5909 #define JH_E6(eax)      (eax == 0x20f12 || eax == 0x20f32)
5910 #define EX(eax)         (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5911                             SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5912                             DH_E6(eax) || JH_E6(eax))
5913 
5914 #define DR_AX(eax)      (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5915 #define DR_B0(eax)      (eax == 0x100f20)
5916 #define DR_B1(eax)      (eax == 0x100f21)
5917 #define DR_BA(eax)      (eax == 0x100f2a)
5918 #define DR_B2(eax)      (eax == 0x100f22)
5919 #define DR_B3(eax)      (eax == 0x100f23)
5920 #define RB_C0(eax)      (eax == 0x100f40)
5921 
5922         switch (erratum) {
5923         case 1:
5924                 return (cpi->cpi_family < 0x10);
5925         case 51:        /* what does the asterisk mean? */
5926                 return (B(eax) || SH_C0(eax) || CG(eax));
5927         case 52:
5928                 return (B(eax));
5929         case 57:
5930                 return (cpi->cpi_family <= 0x11);
5931         case 58:
5932                 return (B(eax));
5933         case 60:
5934                 return (cpi->cpi_family <= 0x11);
5935         case 61:
5936         case 62:
5937         case 63:
5938         case 64:
5939         case 65:
5940         case 66:
5941         case 68:
5942         case 69:
5943         case 70:
5944         case 71:
5945                 return (B(eax));
5946         case 72:
5947                 return (SH_B0(eax));
5948         case 74:
5949                 return (B(eax));
5950         case 75:
5951                 return (cpi->cpi_family < 0x10);
5952         case 76:
5953                 return (B(eax));
5954         case 77:
5955                 return (cpi->cpi_family <= 0x11);
5956         case 78:
5957                 return (B(eax) || SH_C0(eax));
5958         case 79:
5959                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5960         case 80:
5961         case 81:
5962         case 82:
5963                 return (B(eax));
5964         case 83:
5965                 return (B(eax) || SH_C0(eax) || CG(eax));
5966         case 85:
5967                 return (cpi->cpi_family < 0x10);
5968         case 86:
5969                 return (SH_C0(eax) || CG(eax));
5970         case 88:
5971 #if !defined(__amd64)
5972                 return (0);
5973 #else
5974                 return (B(eax) || SH_C0(eax));
5975 #endif
5976         case 89:
5977                 return (cpi->cpi_family < 0x10);
5978         case 90:
5979                 return (B(eax) || SH_C0(eax) || CG(eax));
5980         case 91:
5981         case 92:
5982                 return (B(eax) || SH_C0(eax));
5983         case 93:
5984                 return (SH_C0(eax));
5985         case 94:
5986                 return (B(eax) || SH_C0(eax) || CG(eax));
5987         case 95:
5988 #if !defined(__amd64)
5989                 return (0);
5990 #else
5991                 return (B(eax) || SH_C0(eax));
5992 #endif
5993         case 96:
5994                 return (B(eax) || SH_C0(eax) || CG(eax));
5995         case 97:
5996         case 98:
5997                 return (SH_C0(eax) || CG(eax));
5998         case 99:
5999                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6000         case 100:
6001                 return (B(eax) || SH_C0(eax));
6002         case 101:
6003         case 103:
6004                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6005         case 104:
6006                 return (SH_C0(eax) || CG(eax) || D0(eax));
6007         case 105:
6008         case 106:
6009         case 107:
6010                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6011         case 108:
6012                 return (DH_CG(eax));
6013         case 109:
6014                 return (SH_C0(eax) || CG(eax) || D0(eax));
6015         case 110:
6016                 return (D0(eax) || EX(eax));
6017         case 111:
6018                 return (CG(eax));
6019         case 112:
6020                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6021         case 113:
6022                 return (eax == 0x20fc0);
6023         case 114:
6024                 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6025         case 115:
6026                 return (SH_E0(eax) || JH_E1(eax));
6027         case 116:
6028                 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6029         case 117:
6030                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6031         case 118:
6032                 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6033                     JH_E6(eax));
6034         case 121:
6035                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6036         case 122:
6037                 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6038         case 123:
6039                 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6040         case 131:
6041                 return (cpi->cpi_family < 0x10);
6042         case 6336786:
6043 
6044                 /*
6045                  * Test for AdvPowerMgmtInfo.TscPStateInvariant
6046                  * if this is a K8 family or newer processor. We're testing for
6047                  * this 'erratum' to determine whether or not we have a constant
6048                  * TSC.
6049                  *
6050                  * Our current fix for this is to disable the C1-Clock ramping.
6051                  * However, this doesn't work on newer processor families nor
6052                  * does it work when virtualized as those devices don't exist.
6053                  */
6054                 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6055                         return (0);
6056                 }
6057 
6058                 if (CPI_FAMILY(cpi) == 0xf) {
6059                         struct cpuid_regs regs;
6060                         regs.cp_eax = 0x80000007;
6061                         (void) __cpuid_insn(&regs);
6062                         return (!(regs.cp_edx & 0x100));
6063                 }
6064                 return (0);
6065         case 6323525:
6066                 /*
6067                  * This erratum (K8 #147) is not present on family 10 and newer.
6068                  */
6069                 if (cpi->cpi_family >= 0x10) {
6070                         return (0);
6071                 }
6072                 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6073                     (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6074 
6075         case 6671130:
6076                 /*
6077                  * check for processors (pre-Shanghai) that do not provide
6078                  * optimal management of 1gb ptes in its tlb.
6079                  */
6080                 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6081 
6082         case 298:
6083                 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6084                     DR_B2(eax) || RB_C0(eax));
6085 
6086         case 721:
6087 #if defined(__amd64)
6088                 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6089 #else
6090                 return (0);
6091 #endif
6092 
6093         default:
6094                 return (-1);
6095 
6096         }
6097 }
6098 
6099 /*
6100  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6101  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6102  */
6103 int
6104 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6105 {
6106         struct cpuid_info       *cpi;
6107         uint_t                  osvwid;
6108         static int              osvwfeature = -1;
6109         uint64_t                osvwlength;
6110 
6111 
6112         cpi = cpu->cpu_m.mcpu_cpi;
6113 
6114         /* confirm OSVW supported */
6115         if (osvwfeature == -1) {
6116                 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6117         } else {
6118                 /* assert that osvw feature setting is consistent on all cpus */
6119                 ASSERT(osvwfeature ==
6120                     (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6121         }
6122         if (!osvwfeature)
6123                 return (-1);
6124 
6125         osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6126 
6127         switch (erratum) {
6128         case 298:       /* osvwid is 0 */
6129                 osvwid = 0;
6130                 if (osvwlength <= (uint64_t)osvwid) {
6131                         /* osvwid 0 is unknown */
6132                         return (-1);
6133                 }
6134 
6135                 /*
6136                  * Check the OSVW STATUS MSR to determine the state
6137                  * of the erratum where:
6138                  *   0 - fixed by HW
6139                  *   1 - BIOS has applied the workaround when BIOS
6140                  *   workaround is available. (Or for other errata,
6141                  *   OS workaround is required.)
6142                  * For a value of 1, caller will confirm that the
6143                  * erratum 298 workaround has indeed been applied by BIOS.
6144                  *
6145                  * A 1 may be set in cpus that have a HW fix
6146                  * in a mixed cpu system. Regarding erratum 298:
6147                  *   In a multiprocessor platform, the workaround above
6148                  *   should be applied to all processors regardless of
6149                  *   silicon revision when an affected processor is
6150                  *   present.
6151                  */
6152 
6153                 return (rdmsr(MSR_AMD_OSVW_STATUS +
6154                     (osvwid / OSVW_ID_CNT_PER_MSR)) &
6155                     (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6156 
6157         default:
6158                 return (-1);
6159         }
6160 }
6161 
/*
 * Cache property name suffixes.  add_cache_prop() builds property names
 * of the form "<label>-<type>"; these are presumably the values passed
 * as its 'type' argument (callers are not visible here -- verify).
 */
static const char assoc_str[] = "associativity";
static const char line_str[] = "line-size";
static const char size_str[] = "size";
6165 
6166 static void
6167 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6168     uint32_t val)
6169 {
6170         char buf[128];
6171 
6172         /*
6173          * ndi_prop_update_int() is used because it is desirable for
6174          * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6175          */
6176         if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6177                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6178 }
6179 
/*
 * Intel-style cache/tlb description
 *
 * Standard cpuid level 2 gives a randomly ordered
 * selection of tags that index into a table that describes
 * cache and tlb properties.
 *
 * The strings below are the human-readable labels for those table
 * entries; they are referenced as the ct_label member of the intel_ctab
 * entries that follow.
 */

/* Cache labels. */
static const char l1_icache_str[] = "l1-icache";
static const char l1_dcache_str[] = "l1-dcache";
static const char l2_cache_str[] = "l2-cache";
static const char l3_cache_str[] = "l3-cache";
/* TLB labels; the suffix names the page size(s) covered. */
static const char itlb4k_str[] = "itlb-4K";
static const char dtlb4k_str[] = "dtlb-4K";
static const char itlb2M_str[] = "itlb-2M";
static const char itlb4M_str[] = "itlb-4M";
static const char dtlb4M_str[] = "dtlb-4M";
static const char dtlb24_str[] = "dtlb0-2M-4M";
static const char itlb424_str[] = "itlb-4K-2M-4M";
static const char itlb24_str[] = "itlb-2M-4M";
static const char dtlb44_str[] = "dtlb-4K-4M";
/* Sectored cache, trace cache and shared L2 TLB labels. */
static const char sl1_dcache_str[] = "sectored-l1-dcache";
static const char sl2_cache_str[] = "sectored-l2-cache";
static const char itrace_str[] = "itrace-cache";
static const char sl3_cache_str[] = "sectored-l3-cache";
static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6206 
/*
 * One entry of a cache/TLB descriptor table.  A table is an array of these
 * terminated by an entry whose ct_code is 0 and is searched with
 * find_cacheent().
 */
static const struct cachetab {
        uint8_t         ct_code;        /* descriptor byte; 0 ends the table */
        uint8_t         ct_assoc;       /* associativity (ways) */
        uint16_t        ct_line_size;   /* line size; 0 = none is published */
        size_t          ct_size;        /* size; presumably bytes for caches */
                                        /* and entries for TLBs - TODO confirm */
        const char      *ct_label;      /* property-name prefix */
} intel_ctab[] = {
        /*
         * maintain descending order!
         *
         * find_cacheent() stops at the first entry whose code is less than
         * or equal to the code being looked up, so an out-of-order entry
         * can never be found.
         *
         * Codes ignored - Reason
         * ----------------------
         * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
         * f0H/f1H - Currently we do not interpret prefetch size by design
         *
         * NOTE(review): in this file intel_cpuid_4_cache_info() is invoked
         * by intel_walk_cacheinfo() for descriptor 49H, not 40H; the 40H
         * reason above may be stale - confirm.
         */
        { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
        { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
        { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
        { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
        { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
        { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
        { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
        { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
        { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
        { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
        { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
        { 0xd0, 4, 64, 512*1024, l3_cache_str},
        { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
        { 0xc0, 4, 0, 8, dtlb44_str },
        { 0xba, 4, 0, 64, dtlb4k_str },
        { 0xb4, 4, 0, 256, dtlb4k_str },
        { 0xb3, 4, 0, 128, dtlb4k_str },
        { 0xb2, 4, 0, 64, itlb4k_str },
        { 0xb0, 4, 0, 128, itlb4k_str },
        { 0x87, 8, 64, 1024*1024, l2_cache_str},
        { 0x86, 4, 64, 512*1024, l2_cache_str},
        { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
        { 0x84, 8, 32, 1024*1024, l2_cache_str},
        { 0x83, 8, 32, 512*1024, l2_cache_str},
        { 0x82, 8, 32, 256*1024, l2_cache_str},
        { 0x80, 8, 64, 512*1024, l2_cache_str},
        { 0x7f, 2, 64, 512*1024, l2_cache_str},
        { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
        { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
        { 0x7b, 8, 64, 512*1024, sl2_cache_str},
        { 0x7a, 8, 64, 256*1024, sl2_cache_str},
        { 0x79, 8, 64, 128*1024, sl2_cache_str},
        { 0x78, 8, 64, 1024*1024, l2_cache_str},
        { 0x73, 8, 0, 64*1024, itrace_str},
        { 0x72, 8, 0, 32*1024, itrace_str},
        { 0x71, 8, 0, 16*1024, itrace_str},
        { 0x70, 8, 0, 12*1024, itrace_str},
        { 0x68, 4, 64, 32*1024, sl1_dcache_str},
        { 0x67, 4, 64, 16*1024, sl1_dcache_str},
        { 0x66, 4, 64, 8*1024, sl1_dcache_str},
        { 0x60, 8, 64, 16*1024, sl1_dcache_str},
        { 0x5d, 0, 0, 256, dtlb44_str},
        { 0x5c, 0, 0, 128, dtlb44_str},
        { 0x5b, 0, 0, 64, dtlb44_str},
        { 0x5a, 4, 0, 32, dtlb24_str},
        { 0x59, 0, 0, 16, dtlb4k_str},
        { 0x57, 4, 0, 16, dtlb4k_str},
        { 0x56, 4, 0, 16, dtlb4M_str},
        { 0x55, 0, 0, 7, itlb24_str},
        { 0x52, 0, 0, 256, itlb424_str},
        { 0x51, 0, 0, 128, itlb424_str},
        { 0x50, 0, 0, 64, itlb424_str},
        { 0x4f, 0, 0, 32, itlb4k_str},
        { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
        { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
        { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
        { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
        { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
        { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
        { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
        { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
        { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
        { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
        { 0x44, 4, 32, 1024*1024, l2_cache_str},
        { 0x43, 4, 32, 512*1024, l2_cache_str},
        { 0x42, 4, 32, 256*1024, l2_cache_str},
        { 0x41, 4, 32, 128*1024, l2_cache_str},
        { 0x3e, 4, 64, 512*1024, sl2_cache_str},
        { 0x3d, 6, 64, 384*1024, sl2_cache_str},
        { 0x3c, 4, 64, 256*1024, sl2_cache_str},
        { 0x3b, 2, 64, 128*1024, sl2_cache_str},
        { 0x3a, 6, 64, 192*1024, sl2_cache_str},
        { 0x39, 4, 64, 128*1024, sl2_cache_str},
        { 0x30, 8, 64, 32*1024, l1_icache_str},
        { 0x2c, 8, 64, 32*1024, l1_dcache_str},
        { 0x29, 8, 64, 4096*1024, sl3_cache_str},
        { 0x25, 8, 64, 2048*1024, sl3_cache_str},
        { 0x23, 8, 64, 1024*1024, sl3_cache_str},
        { 0x22, 4, 64, 512*1024, sl3_cache_str},
        { 0x0e, 6, 64, 24*1024, l1_dcache_str},
        { 0x0d, 4, 32, 16*1024, l1_dcache_str},
        { 0x0c, 4, 32, 16*1024, l1_dcache_str},
        { 0x0b, 4, 0, 4, itlb4M_str},
        { 0x0a, 2, 32, 8*1024, l1_dcache_str},
        { 0x08, 4, 32, 16*1024, l1_icache_str},
        { 0x06, 4, 32, 8*1024, l1_icache_str},
        { 0x05, 4, 0, 32, dtlb4M_str},
        { 0x04, 4, 0, 8, dtlb4M_str},
        { 0x03, 4, 0, 64, dtlb4k_str},
        { 0x02, 4, 0, 2, itlb4M_str},
        { 0x01, 4, 0, 32, itlb4k_str},
        { 0 }
};
6315 
6316 static const struct cachetab cyrix_ctab[] = {
6317         { 0x70, 4, 0, 32, "tlb-4K" },
6318         { 0x80, 4, 16, 16*1024, "l1-cache" },
6319         { 0 }
6320 };
6321 
6322 /*
6323  * Search a cache table for a matching entry
6324  */
6325 static const struct cachetab *
6326 find_cacheent(const struct cachetab *ct, uint_t code)
6327 {
6328         if (code != 0) {
6329                 for (; ct->ct_code != 0; ct++)
6330                         if (ct->ct_code <= code)
6331                                 break;
6332                 if (ct->ct_code == code)
6333                         return (ct);
6334         }
6335         return (NULL);
6336 }
6337 
6338 /*
6339  * Populate cachetab entry with L2 or L3 cache-information using
6340  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6341  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6342  * information is found.
6343  */
6344 static int
6345 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6346 {
6347         uint32_t level, i;
6348         int ret = 0;
6349 
6350         for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6351                 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6352 
6353                 if (level == 2 || level == 3) {
6354                         ct->ct_assoc =
6355                             CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6356                         ct->ct_line_size =
6357                             CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6358                         ct->ct_size = ct->ct_assoc *
6359                             (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6360                             ct->ct_line_size *
6361                             (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6362 
6363                         if (level == 2) {
6364                                 ct->ct_label = l2_cache_str;
6365                         } else if (level == 3) {
6366                                 ct->ct_label = l3_cache_str;
6367                         }
6368                         ret = 1;
6369                 }
6370         }
6371 
6372         return (ret);
6373 }
6374 
6375 /*
6376  * Walk the cacheinfo descriptor, applying 'func' to every valid element
6377  * The walk is terminated if the walker returns non-zero.
6378  */
6379 static void
6380 intel_walk_cacheinfo(struct cpuid_info *cpi,
6381     void *arg, int (*func)(void *, const struct cachetab *))
6382 {
6383         const struct cachetab *ct;
6384         struct cachetab des_49_ct, des_b1_ct;
6385         uint8_t *dp;
6386         int i;
6387 
6388         if ((dp = cpi->cpi_cacheinfo) == NULL)
6389                 return;
6390         for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6391                 /*
6392                  * For overloaded descriptor 0x49 we use cpuid function 4
6393                  * if supported by the current processor, to create
6394                  * cache information.
6395                  * For overloaded descriptor 0xb1 we use X86_PAE flag
6396                  * to disambiguate the cache information.
6397                  */
6398                 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6399                     intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6400                                 ct = &des_49_ct;
6401                 } else if (*dp == 0xb1) {
6402                         des_b1_ct.ct_code = 0xb1;
6403                         des_b1_ct.ct_assoc = 4;
6404                         des_b1_ct.ct_line_size = 0;
6405                         if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6406                                 des_b1_ct.ct_size = 8;
6407                                 des_b1_ct.ct_label = itlb2M_str;
6408                         } else {
6409                                 des_b1_ct.ct_size = 4;
6410                                 des_b1_ct.ct_label = itlb4M_str;
6411                         }
6412                         ct = &des_b1_ct;
6413                 } else {
6414                         if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6415                                 continue;
6416                         }
6417                 }
6418 
6419                 if (func(arg, ct) != 0) {
6420                         break;
6421                 }
6422         }
6423 }
6424 
6425 /*
6426  * (Like the Intel one, except for Cyrix CPUs)
6427  */
6428 static void
6429 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6430     void *arg, int (*func)(void *, const struct cachetab *))
6431 {
6432         const struct cachetab *ct;
6433         uint8_t *dp;
6434         int i;
6435 
6436         if ((dp = cpi->cpi_cacheinfo) == NULL)
6437                 return;
6438         for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6439                 /*
6440                  * Search Cyrix-specific descriptor table first ..
6441                  */
6442                 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6443                         if (func(arg, ct) != 0)
6444                                 break;
6445                         continue;
6446                 }
6447                 /*
6448                  * .. else fall back to the Intel one
6449                  */
6450                 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6451                         if (func(arg, ct) != 0)
6452                                 break;
6453                         continue;
6454                 }
6455         }
6456 }
6457 
6458 /*
6459  * A cacheinfo walker that adds associativity, line-size, and size properties
6460  * to the devinfo node it is passed as an argument.
6461  */
6462 static int
6463 add_cacheent_props(void *arg, const struct cachetab *ct)
6464 {
6465         dev_info_t *devi = arg;
6466 
6467         add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6468         if (ct->ct_line_size != 0)
6469                 add_cache_prop(devi, ct->ct_label, line_str,
6470                     ct->ct_line_size);
6471         add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6472         return (0);
6473 }
6474 
6475 
/*
 * Property suffix used by the AMD helpers below in place of the numeric
 * associativity property when cpuid reports a fully associative structure;
 * the property is published with the value 1.
 */
static const char fully_assoc[] = "fully-associative?";
6477 
6478 /*
6479  * AMD style cache/tlb description
6480  *
6481  * Extended functions 5 and 6 directly describe properties of
6482  * tlbs and various cache levels.
6483  */
6484 static void
6485 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6486 {
6487         switch (assoc) {
6488         case 0: /* reserved; ignore */
6489                 break;
6490         default:
6491                 add_cache_prop(devi, label, assoc_str, assoc);
6492                 break;
6493         case 0xff:
6494                 add_cache_prop(devi, label, fully_assoc, 1);
6495                 break;
6496         }
6497 }
6498 
6499 static void
6500 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6501 {
6502         if (size == 0)
6503                 return;
6504         add_cache_prop(devi, label, size_str, size);
6505         add_amd_assoc(devi, label, assoc);
6506 }
6507 
6508 static void
6509 add_amd_cache(dev_info_t *devi, const char *label,
6510     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6511 {
6512         if (size == 0 || line_size == 0)
6513                 return;
6514         add_amd_assoc(devi, label, assoc);
6515         /*
6516          * Most AMD parts have a sectored cache. Multiple cache lines are
6517          * associated with each tag. A sector consists of all cache lines
6518          * associated with a tag. For example, the AMD K6-III has a sector
6519          * size of 2 cache lines per tag.
6520          */
6521         if (lines_per_tag != 0)
6522                 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6523         add_cache_prop(devi, label, line_str, line_size);
6524         add_cache_prop(devi, label, size_str, size * 1024);
6525 }
6526 
6527 static void
6528 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6529 {
6530         switch (assoc) {
6531         case 0: /* off */
6532                 break;
6533         case 1:
6534         case 2:
6535         case 4:
6536                 add_cache_prop(devi, label, assoc_str, assoc);
6537                 break;
6538         case 6:
6539                 add_cache_prop(devi, label, assoc_str, 8);
6540                 break;
6541         case 8:
6542                 add_cache_prop(devi, label, assoc_str, 16);
6543                 break;
6544         case 0xf:
6545                 add_cache_prop(devi, label, fully_assoc, 1);
6546                 break;
6547         default: /* reserved; ignore */
6548                 break;
6549         }
6550 }
6551 
6552 static void
6553 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6554 {
6555         if (size == 0 || assoc == 0)
6556                 return;
6557         add_amd_l2_assoc(devi, label, assoc);
6558         add_cache_prop(devi, label, size_str, size);
6559 }
6560 
6561 static void
6562 add_amd_l2_cache(dev_info_t *devi, const char *label,
6563     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6564 {
6565         if (size == 0 || assoc == 0 || line_size == 0)
6566                 return;
6567         add_amd_l2_assoc(devi, label, assoc);
6568         if (lines_per_tag != 0)
6569                 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6570         add_cache_prop(devi, label, line_str, line_size);
6571         add_cache_prop(devi, label, size_str, size * 1024);
6572 }
6573 
6574 static void
6575 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6576 {
6577         struct cpuid_regs *cp;
6578 
6579         if (cpi->cpi_xmaxeax < 0x80000005)
6580                 return;
6581         cp = &cpi->cpi_extd[5];
6582 
6583         /*
6584          * 4M/2M L1 TLB configuration
6585          *
6586          * We report the size for 2M pages because AMD uses two
6587          * TLB entries for one 4M page.
6588          */
6589         add_amd_tlb(devi, "dtlb-2M",
6590             BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6591         add_amd_tlb(devi, "itlb-2M",
6592             BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6593 
6594         /*
6595          * 4K L1 TLB configuration
6596          */
6597 
6598         switch (cpi->cpi_vendor) {
6599                 uint_t nentries;
6600         case X86_VENDOR_TM:
6601                 if (cpi->cpi_family >= 5) {
6602                         /*
6603                          * Crusoe processors have 256 TLB entries, but
6604                          * cpuid data format constrains them to only
6605                          * reporting 255 of them.
6606                          */
6607                         if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6608                                 nentries = 256;
6609                         /*
6610                          * Crusoe processors also have a unified TLB
6611                          */
6612                         add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6613                             nentries);
6614                         break;
6615                 }
6616                 /*FALLTHROUGH*/
6617         default:
6618                 add_amd_tlb(devi, itlb4k_str,
6619                     BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6620                 add_amd_tlb(devi, dtlb4k_str,
6621                     BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6622                 break;
6623         }
6624 
6625         /*
6626          * data L1 cache configuration
6627          */
6628 
6629         add_amd_cache(devi, l1_dcache_str,
6630             BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6631             BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6632 
6633         /*
6634          * code L1 cache configuration
6635          */
6636 
6637         add_amd_cache(devi, l1_icache_str,
6638             BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6639             BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6640 
6641         if (cpi->cpi_xmaxeax < 0x80000006)
6642                 return;
6643         cp = &cpi->cpi_extd[6];
6644 
6645         /* Check for a unified L2 TLB for large pages */
6646 
6647         if (BITX(cp->cp_eax, 31, 16) == 0)
6648                 add_amd_l2_tlb(devi, "l2-tlb-2M",
6649                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6650         else {
6651                 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6652                     BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6653                 add_amd_l2_tlb(devi, "l2-itlb-2M",
6654                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6655         }
6656 
6657         /* Check for a unified L2 TLB for 4K pages */
6658 
6659         if (BITX(cp->cp_ebx, 31, 16) == 0) {
6660                 add_amd_l2_tlb(devi, "l2-tlb-4K",
6661                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6662         } else {
6663                 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6664                     BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6665                 add_amd_l2_tlb(devi, "l2-itlb-4K",
6666                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6667         }
6668 
6669         add_amd_l2_cache(devi, l2_cache_str,
6670             BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6671             BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6672 }
6673 
6674 /*
6675  * There are two basic ways that the x86 world describes it cache
6676  * and tlb architecture - Intel's way and AMD's way.
6677  *
6678  * Return which flavor of cache architecture we should use
6679  */
6680 static int
6681 x86_which_cacheinfo(struct cpuid_info *cpi)
6682 {
6683         switch (cpi->cpi_vendor) {
6684         case X86_VENDOR_Intel:
6685                 if (cpi->cpi_maxeax >= 2)
6686                         return (X86_VENDOR_Intel);
6687                 break;
6688         case X86_VENDOR_AMD:
6689                 /*
6690                  * The K5 model 1 was the first part from AMD that reported
6691                  * cache sizes via extended cpuid functions.
6692                  */
6693                 if (cpi->cpi_family > 5 ||
6694                     (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6695                         return (X86_VENDOR_AMD);
6696                 break;
6697         case X86_VENDOR_TM:
6698                 if (cpi->cpi_family >= 5)
6699                         return (X86_VENDOR_AMD);
6700                 /*FALLTHROUGH*/
6701         default:
6702                 /*
6703                  * If they have extended CPU data for 0x80000005
6704                  * then we assume they have AMD-format cache
6705                  * information.
6706                  *
6707                  * If not, and the vendor happens to be Cyrix,
6708                  * then try our-Cyrix specific handler.
6709                  *
6710                  * If we're not Cyrix, then assume we're using Intel's
6711                  * table-driven format instead.
6712                  */
6713                 if (cpi->cpi_xmaxeax >= 0x80000005)
6714                         return (X86_VENDOR_AMD);
6715                 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6716                         return (X86_VENDOR_Cyrix);
6717                 else if (cpi->cpi_maxeax >= 2)
6718                         return (X86_VENDOR_Intel);
6719                 break;
6720         }
6721         return (-1);
6722 }
6723 
6724 void
6725 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6726     struct cpuid_info *cpi)
6727 {
6728         dev_info_t *cpu_devi;
6729         int create;
6730 
6731         cpu_devi = (dev_info_t *)dip;
6732 
6733         /* device_type */
6734         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6735             "device_type", "cpu");
6736 
6737         /* reg */
6738         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6739             "reg", cpu_id);
6740 
6741         /* cpu-mhz, and clock-frequency */
6742         if (cpu_freq > 0) {
6743                 long long mul;
6744 
6745                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6746                     "cpu-mhz", cpu_freq);
6747                 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6748                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6749                             "clock-frequency", (int)mul);
6750         }
6751 
6752         if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6753                 return;
6754         }
6755 
6756         /* vendor-id */
6757         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6758             "vendor-id", cpi->cpi_vendorstr);
6759 
6760         if (cpi->cpi_maxeax == 0) {
6761                 return;
6762         }
6763 
6764         /*
6765          * family, model, and step
6766          */
6767         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6768             "family", CPI_FAMILY(cpi));
6769         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6770             "cpu-model", CPI_MODEL(cpi));
6771         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6772             "stepping-id", CPI_STEP(cpi));
6773 
6774         /* type */
6775         switch (cpi->cpi_vendor) {
6776         case X86_VENDOR_Intel:
6777                 create = 1;
6778                 break;
6779         default:
6780                 create = 0;
6781                 break;
6782         }
6783         if (create)
6784                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6785                     "type", CPI_TYPE(cpi));
6786 
6787         /* ext-family */
6788         switch (cpi->cpi_vendor) {
6789         case X86_VENDOR_Intel:
6790         case X86_VENDOR_AMD:
6791                 create = cpi->cpi_family >= 0xf;
6792                 break;
6793         default:
6794                 create = 0;
6795                 break;
6796         }
6797         if (create)
6798                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6799                     "ext-family", CPI_FAMILY_XTD(cpi));
6800 
6801         /* ext-model */
6802         switch (cpi->cpi_vendor) {
6803         case X86_VENDOR_Intel:
6804                 create = IS_EXTENDED_MODEL_INTEL(cpi);
6805                 break;
6806         case X86_VENDOR_AMD:
6807                 create = CPI_FAMILY(cpi) == 0xf;
6808                 break;
6809         default:
6810                 create = 0;
6811                 break;
6812         }
6813         if (create)
6814                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6815                     "ext-model", CPI_MODEL_XTD(cpi));
6816 
6817         /* generation */
6818         switch (cpi->cpi_vendor) {
6819         case X86_VENDOR_AMD:
6820                 /*
6821                  * AMD K5 model 1 was the first part to support this
6822                  */
6823                 create = cpi->cpi_xmaxeax >= 0x80000001;
6824                 break;
6825         default:
6826                 create = 0;
6827                 break;
6828         }
6829         if (create)
6830                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6831                     "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6832 
6833         /* brand-id */
6834         switch (cpi->cpi_vendor) {
6835         case X86_VENDOR_Intel:
6836                 /*
6837                  * brand id first appeared on Pentium III Xeon model 8,
6838                  * and Celeron model 8 processors and Opteron
6839                  */
6840                 create = cpi->cpi_family > 6 ||
6841                     (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6842                 break;
6843         case X86_VENDOR_AMD:
6844                 create = cpi->cpi_family >= 0xf;
6845                 break;
6846         default:
6847                 create = 0;
6848                 break;
6849         }
6850         if (create && cpi->cpi_brandid != 0) {
6851                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6852                     "brand-id", cpi->cpi_brandid);
6853         }
6854 
6855         /* chunks, and apic-id */
6856         switch (cpi->cpi_vendor) {
6857                 /*
6858                  * first available on Pentium IV and Opteron (K8)
6859                  */
6860         case X86_VENDOR_Intel:
6861                 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6862                 break;
6863         case X86_VENDOR_AMD:
6864                 create = cpi->cpi_family >= 0xf;
6865                 break;
6866         default:
6867                 create = 0;
6868                 break;
6869         }
6870         if (create) {
6871                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6872                     "chunks", CPI_CHUNKS(cpi));
6873                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6874                     "apic-id", cpi->cpi_apicid);
6875                 if (cpi->cpi_chipid >= 0) {
6876                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6877                             "chip#", cpi->cpi_chipid);
6878                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6879                             "clog#", cpi->cpi_clogid);
6880                 }
6881         }
6882 
6883         /* cpuid-features */
6884         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6885             "cpuid-features", CPI_FEATURES_EDX(cpi));
6886 
6887 
6888         /* cpuid-features-ecx */
6889         switch (cpi->cpi_vendor) {
6890         case X86_VENDOR_Intel:
6891                 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6892                 break;
6893         case X86_VENDOR_AMD:
6894                 create = cpi->cpi_family >= 0xf;
6895                 break;
6896         default:
6897                 create = 0;
6898                 break;
6899         }
6900         if (create)
6901                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6902                     "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6903 
6904         /* ext-cpuid-features */
6905         switch (cpi->cpi_vendor) {
6906         case X86_VENDOR_Intel:
6907         case X86_VENDOR_AMD:
6908         case X86_VENDOR_Cyrix:
6909         case X86_VENDOR_TM:
6910         case X86_VENDOR_Centaur:
6911                 create = cpi->cpi_xmaxeax >= 0x80000001;
6912                 break;
6913         default:
6914                 create = 0;
6915                 break;
6916         }
6917         if (create) {
6918                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6919                     "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6920                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6921                     "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6922         }
6923 
6924         /*
6925          * Brand String first appeared in Intel Pentium IV, AMD K5
6926          * model 1, and Cyrix GXm.  On earlier models we try and
6927          * simulate something similar .. so this string should always
6928          * same -something- about the processor, however lame.
6929          */
6930         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6931             "brand-string", cpi->cpi_brandstr);
6932 
6933         /*
6934          * Finally, cache and tlb information
6935          */
6936         switch (x86_which_cacheinfo(cpi)) {
6937         case X86_VENDOR_Intel:
6938                 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6939                 break;
6940         case X86_VENDOR_Cyrix:
6941                 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6942                 break;
6943         case X86_VENDOR_AMD:
6944                 amd_cache_info(cpi, cpu_devi);
6945                 break;
6946         default:
6947                 break;
6948         }
6949 }
6950 
/*
 * Request/result block used by getl2cacheinfo().  The three pointers are the
 * caller's optional output locations; any of them may be NULL, in which case
 * that value is not stored.
 */
struct l2info {
        int *l2i_csz;           /* where to store the L2 cache size */
        int *l2i_lsz;           /* where to store the L2 line size */
        int *l2i_assoc;         /* where to store the L2 associativity */
        int l2i_ret;            /* L2 cache size; -1 until an L2 is found */
};
6957 
6958 /*
6959  * A cacheinfo walker that fetches the size, line-size and associativity
6960  * of the L2 cache
6961  */
6962 static int
6963 intel_l2cinfo(void *arg, const struct cachetab *ct)
6964 {
6965         struct l2info *l2i = arg;
6966         int *ip;
6967 
6968         if (ct->ct_label != l2_cache_str &&
6969             ct->ct_label != sl2_cache_str)
6970                 return (0);     /* not an L2 -- keep walking */
6971 
6972         if ((ip = l2i->l2i_csz) != NULL)
6973                 *ip = ct->ct_size;
6974         if ((ip = l2i->l2i_lsz) != NULL)
6975                 *ip = ct->ct_line_size;
6976         if ((ip = l2i->l2i_assoc) != NULL)
6977                 *ip = ct->ct_assoc;
6978         l2i->l2i_ret = ct->ct_size;
6979         return (1);             /* was an L2 -- terminate walk */
6980 }
6981 
/*
 * AMD L2/L3 Cache and TLB Associativity Field Definition:
 *
 *      Unlike the associativity for the L1 cache and tlb where the 8 bit
 *      value is the associativity, the associativity for the L2 cache and
 *      tlb is encoded in the following table. The 4 bit L2 value serves as
 *      an index into the amd_afd[] array to determine the associativity.
 *      -1 is undefined. 0 is fully associative.
 *
 *      The array has 16 entries, one for each possible 4-bit value, so any
 *      value extracted from a 4-bit field is a valid index.
 */

static int amd_afd[] =
        /* index: 0  1  2   3  4   5  6   7   8   9   a   b   c   d    e  f */
        {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
6994 
6995 static void
6996 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6997 {
6998         struct cpuid_regs *cp;
6999         uint_t size, assoc;
7000         int i;
7001         int *ip;
7002 
7003         if (cpi->cpi_xmaxeax < 0x80000006)
7004                 return;
7005         cp = &cpi->cpi_extd[6];
7006 
7007         if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7008             (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7009                 uint_t cachesz = size * 1024;
7010                 assoc = amd_afd[i];
7011 
7012                 ASSERT(assoc != -1);
7013 
7014                 if ((ip = l2i->l2i_csz) != NULL)
7015                         *ip = cachesz;
7016                 if ((ip = l2i->l2i_lsz) != NULL)
7017                         *ip = BITX(cp->cp_ecx, 7, 0);
7018                 if ((ip = l2i->l2i_assoc) != NULL)
7019                         *ip = assoc;
7020                 l2i->l2i_ret = cachesz;
7021         }
7022 }
7023 
7024 int
7025 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7026 {
7027         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7028         struct l2info __l2info, *l2i = &__l2info;
7029 
7030         l2i->l2i_csz = csz;
7031         l2i->l2i_lsz = lsz;
7032         l2i->l2i_assoc = assoc;
7033         l2i->l2i_ret = -1;
7034 
7035         switch (x86_which_cacheinfo(cpi)) {
7036         case X86_VENDOR_Intel:
7037                 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7038                 break;
7039         case X86_VENDOR_Cyrix:
7040                 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7041                 break;
7042         case X86_VENDOR_AMD:
7043                 amd_l2cacheinfo(cpi, l2i);
7044                 break;
7045         default:
7046                 break;
7047         }
7048         return (l2i->l2i_ret);
7049 }
7050 
7051 #if !defined(__xpv)
7052 
7053 uint32_t *
7054 cpuid_mwait_alloc(cpu_t *cpu)
7055 {
7056         uint32_t        *ret;
7057         size_t          mwait_size;
7058 
7059         ASSERT(cpuid_checkpass(CPU, 2));
7060 
7061         mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7062         if (mwait_size == 0)
7063                 return (NULL);
7064 
7065         /*
7066          * kmem_alloc() returns cache line size aligned data for mwait_size
7067          * allocations.  mwait_size is currently cache line sized.  Neither
7068          * of these implementation details are guarantied to be true in the
7069          * future.
7070          *
7071          * First try allocating mwait_size as kmem_alloc() currently returns
7072          * correctly aligned memory.  If kmem_alloc() does not return
7073          * mwait_size aligned memory, then use mwait_size ROUNDUP.
7074          *
7075          * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7076          * decide to free this memory.
7077          */
7078         ret = kmem_zalloc(mwait_size, KM_SLEEP);
7079         if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7080                 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7081                 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7082                 *ret = MWAIT_RUNNING;
7083                 return (ret);
7084         } else {
7085                 kmem_free(ret, mwait_size);
7086                 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7087                 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7088                 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7089                 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7090                 *ret = MWAIT_RUNNING;
7091                 return (ret);
7092         }
7093 }
7094 
7095 void
7096 cpuid_mwait_free(cpu_t *cpu)
7097 {
7098         if (cpu->cpu_m.mcpu_cpi == NULL) {
7099                 return;
7100         }
7101 
7102         if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7103             cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7104                 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7105                     cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7106         }
7107 
7108         cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7109         cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7110 }
7111 
7112 void
7113 patch_tsc_read(int flag)
7114 {
7115         size_t cnt;
7116 
7117         switch (flag) {
7118         case TSC_NONE:
7119                 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7120                 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7121                 break;
7122         case TSC_RDTSC_MFENCE:
7123                 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
7124                 (void) memcpy((void *)tsc_read,
7125                     (void *)&_tsc_mfence_start, cnt);
7126                 break;
7127         case TSC_RDTSC_LFENCE:
7128                 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7129                 (void) memcpy((void *)tsc_read,
7130                     (void *)&_tsc_lfence_start, cnt);
7131                 break;
7132         case TSC_TSCP:
7133                 cnt = &_tscp_end - &_tscp_start;
7134                 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7135                 break;
7136         default:
7137                 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7138                 cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
7139                 break;
7140         }
7141         tsc_type = flag;
7142 }
7143 
7144 int
7145 cpuid_deep_cstates_supported(void)
7146 {
7147         struct cpuid_info *cpi;
7148         struct cpuid_regs regs;
7149 
7150         ASSERT(cpuid_checkpass(CPU, 1));
7151 
7152         cpi = CPU->cpu_m.mcpu_cpi;
7153 
7154         if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
7155                 return (0);
7156 
7157         switch (cpi->cpi_vendor) {
7158         case X86_VENDOR_Intel:
7159                 if (cpi->cpi_xmaxeax < 0x80000007)
7160                         return (0);
7161 
7162                 /*
7163                  * TSC run at a constant rate in all ACPI C-states?
7164                  */
7165                 regs.cp_eax = 0x80000007;
7166                 (void) __cpuid_insn(&regs);
7167                 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7168 
7169         default:
7170                 return (0);
7171         }
7172 }
7173 
7174 #endif  /* !__xpv */
7175 
7176 void
7177 post_startup_cpu_fixups(void)
7178 {
7179 #ifndef __xpv
7180         /*
7181          * Some AMD processors support C1E state. Entering this state will
7182          * cause the local APIC timer to stop, which we can't deal with at
7183          * this time.
7184          */
7185         if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7186                 on_trap_data_t otd;
7187                 uint64_t reg;
7188 
7189                 if (!on_trap(&otd, OT_DATA_ACCESS)) {
7190                         reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7191                         /* Disable C1E state if it is enabled by BIOS */
7192                         if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7193                             AMD_ACTONCMPHALT_MASK) {
7194                                 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7195                                     AMD_ACTONCMPHALT_SHIFT);
7196                                 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7197                         }
7198                 }
7199                 no_trap();
7200         }
7201 #endif  /* !__xpv */
7202 }
7203 
7204 void
7205 enable_pcid(void)
7206 {
7207         if (x86_use_pcid == -1)
7208                 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7209 
7210         if (x86_use_invpcid == -1) {
7211                 x86_use_invpcid = is_x86_feature(x86_featureset,
7212                     X86FSET_INVPCID);
7213         }
7214 
7215         if (!x86_use_pcid)
7216                 return;
7217 
7218         /*
7219          * Intel say that on setting PCIDE, it immediately starts using the PCID
7220          * bits; better make sure there's nothing there.
7221          */
7222         ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7223 
7224         setcr4(getcr4() | CR4_PCIDE);
7225 }
7226 
7227 /*
7228  * Setup necessary registers to enable XSAVE feature on this processor.
7229  * This function needs to be called early enough, so that no xsave/xrstor
7230  * ops will execute on the processor before the MSRs are properly set up.
7231  *
7232  * Current implementation has the following assumption:
7233  * - cpuid_pass1() is done, so that X86 features are known.
7234  * - fpu_probe() is done, so that fp_save_mech is chosen.
7235  */
7236 void
7237 xsave_setup_msr(cpu_t *cpu)
7238 {
7239         ASSERT(fp_save_mech == FP_XSAVE);
7240         ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7241 
7242         /* Enable OSXSAVE in CR4. */
7243         setcr4(getcr4() | CR4_OSXSAVE);
7244         /*
7245          * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7246          * correct value.
7247          */
7248         cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7249         setup_xfem();
7250 }
7251 
7252 /*
7253  * Starting with the Westmere processor the local
7254  * APIC timer will continue running in all C-states,
7255  * including the deepest C-states.
7256  */
7257 int
7258 cpuid_arat_supported(void)
7259 {
7260         struct cpuid_info *cpi;
7261         struct cpuid_regs regs;
7262 
7263         ASSERT(cpuid_checkpass(CPU, 1));
7264         ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7265 
7266         cpi = CPU->cpu_m.mcpu_cpi;
7267 
7268         switch (cpi->cpi_vendor) {
7269         case X86_VENDOR_Intel:
7270                 /*
7271                  * Always-running Local APIC Timer is
7272                  * indicated by CPUID.6.EAX[2].
7273                  */
7274                 if (cpi->cpi_maxeax >= 6) {
7275                         regs.cp_eax = 6;
7276                         (void) cpuid_insn(NULL, &regs);
7277                         return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7278                 } else {
7279                         return (0);
7280                 }
7281         default:
7282                 return (0);
7283         }
7284 }
7285 
7286 /*
7287  * Check support for Intel ENERGY_PERF_BIAS feature
7288  */
7289 int
7290 cpuid_iepb_supported(struct cpu *cp)
7291 {
7292         struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7293         struct cpuid_regs regs;
7294 
7295         ASSERT(cpuid_checkpass(cp, 1));
7296 
7297         if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7298             !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7299                 return (0);
7300         }
7301 
7302         /*
7303          * Intel ENERGY_PERF_BIAS MSR is indicated by
7304          * capability bit CPUID.6.ECX.3
7305          */
7306         if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7307                 return (0);
7308 
7309         regs.cp_eax = 0x6;
7310         (void) cpuid_insn(NULL, &regs);
7311         return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7312 }
7313 
7314 /*
7315  * Check support for TSC deadline timer
7316  *
7317  * TSC deadline timer provides a superior software programming
7318  * model over local APIC timer that eliminates "time drifts".
7319  * Instead of specifying a relative time, software specifies an
7320  * absolute time as the target at which the processor should
7321  * generate a timer event.
7322  */
7323 int
7324 cpuid_deadline_tsc_supported(void)
7325 {
7326         struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7327         struct cpuid_regs regs;
7328 
7329         ASSERT(cpuid_checkpass(CPU, 1));
7330         ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7331 
7332         switch (cpi->cpi_vendor) {
7333         case X86_VENDOR_Intel:
7334                 if (cpi->cpi_maxeax >= 1) {
7335                         regs.cp_eax = 1;
7336                         (void) cpuid_insn(NULL, &regs);
7337                         return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7338                 } else {
7339                         return (0);
7340                 }
7341         default:
7342                 return (0);
7343         }
7344 }
7345 
7346 #if defined(__amd64) && !defined(__xpv)
7347 /*
7348  * Patch in versions of bcopy for high performance Intel Nhm processors
7349  * and later...
7350  */
7351 void
7352 patch_memops(uint_t vendor)
7353 {
7354         size_t cnt, i;
7355         caddr_t to, from;
7356 
7357         if ((vendor == X86_VENDOR_Intel) &&
7358             is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7359                 cnt = &bcopy_patch_end - &bcopy_patch_start;
7360                 to = &bcopy_ck_size;
7361                 from = &bcopy_patch_start;
7362                 for (i = 0; i < cnt; i++) {
7363                         *to++ = *from++;
7364                 }
7365         }
7366 }
7367 #endif  /* __amd64 && !__xpv */
7368 
7369 /*
7370  * We're being asked to tell the system how many bits are required to represent
7371  * the various thread and strand IDs. While it's tempting to derive this based
7372  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7373  * correct. Instead, this needs to be based on the number of bits that the APIC
7374  * allows for these different configurations. We only update these to a larger
7375  * value if we find one.
7376  */
7377 void
7378 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7379 {
7380         struct cpuid_info *cpi;
7381 
7382         VERIFY(cpuid_checkpass(CPU, 1));
7383         cpi = cpu->cpu_m.mcpu_cpi;
7384 
7385         if (cpi->cpi_ncore_bits > *core_nbits) {
7386                 *core_nbits = cpi->cpi_ncore_bits;
7387         }
7388 
7389         if (cpi->cpi_nthread_bits > *strand_nbits) {
7390                 *strand_nbits = cpi->cpi_nthread_bits;
7391         }
7392 }
7393 
7394 void
7395 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7396 {
7397         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7398         struct cpuid_regs cp;
7399 
7400         /*
7401          * Reread the CPUID portions that we need for various security
7402          * information.
7403          */
7404         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7405                 /*
7406                  * Check if we now have leaf 7 available to us.
7407                  */
7408                 if (cpi->cpi_maxeax < 7) {
7409                         bzero(&cp, sizeof (cp));
7410                         cp.cp_eax = 0;
7411                         cpi->cpi_maxeax = __cpuid_insn(&cp);
7412                         if (cpi->cpi_maxeax < 7)
7413                                 return;
7414                 }
7415 
7416                 bzero(&cp, sizeof (cp));
7417                 cp.cp_eax = 7;
7418                 cp.cp_ecx = 0;
7419                 (void) __cpuid_insn(&cp);
7420                 cpi->cpi_std[7] = cp;
7421         } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7422                 /* No xcpuid support */
7423                 if (cpi->cpi_family < 5 ||
7424                     (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7425                         return;
7426 
7427                 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7428                         bzero(&cp, sizeof (cp));
7429                         cp.cp_eax = CPUID_LEAF_EXT_0;
7430                         cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7431                         if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7432                                 return;
7433                         }
7434                 }
7435 
7436                 bzero(&cp, sizeof (cp));
7437                 cp.cp_eax = CPUID_LEAF_EXT_8;
7438                 (void) __cpuid_insn(&cp);
7439                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7440                 cpi->cpi_extd[8] = cp;
7441         } else {
7442                 /*
7443                  * Nothing to do here. Return an empty set which has already
7444                  * been zeroed for us.
7445                  */
7446                 return;
7447         }
7448         cpuid_scan_security(cpu, fset);
7449 }
7450 
7451 /* ARGSUSED */
7452 static int
7453 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7454 {
7455         uchar_t *fset;
7456         boolean_t first_pass = (boolean_t)arg1;
7457 
7458         fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7459         if (first_pass && CPU->cpu_id != 0)
7460                 return (0);
7461         if (!first_pass && CPU->cpu_id == 0)
7462                 return (0);
7463         cpuid_pass_ucode(CPU, fset);
7464 
7465         return (0);
7466 }
7467 
7468 /*
7469  * After a microcode update where the version has changed, then we need to
7470  * rescan CPUID. To do this we check every CPU to make sure that they have the
7471  * same microcode. Then we perform a cross call to all such CPUs. It's the
7472  * caller's job to make sure that no one else can end up doing an update while
7473  * this is going on.
7474  *
7475  * We assume that the system is microcode capable if we're called.
7476  */
7477 void
7478 cpuid_post_ucodeadm(void)
7479 {
7480         uint32_t rev;
7481         int i;
7482         struct cpu *cpu;
7483         cpuset_t cpuset;
7484         void *argdata;
7485         uchar_t *f0;
7486 
7487         argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7488 
7489         mutex_enter(&cpu_lock);
7490         cpu = cpu_get(0);
7491         rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7492         CPUSET_ONLY(cpuset, 0);
7493         for (i = 1; i < max_ncpus; i++) {
7494                 if ((cpu = cpu_get(i)) == NULL)
7495                         continue;
7496 
7497                 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7498                         panic("post microcode update CPU %d has differing "
7499                             "microcode revision (%u) from CPU 0 (%u)",
7500                             i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7501                 }
7502                 CPUSET_ADD(cpuset, i);
7503         }
7504 
7505         /*
7506          * We do the cross calls in two passes. The first pass is only for the
7507          * boot CPU. The second pass is for all of the other CPUs. This allows
7508          * the boot CPU to go through and change behavior related to patching or
7509          * whether or not Enhanced IBRS needs to be enabled and then allow all
7510          * other CPUs to follow suit.
7511          */
7512         kpreempt_disable();
7513         xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7514             cpuid_post_ucodeadm_xc);
7515         xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7516             cpuid_post_ucodeadm_xc);
7517         kpreempt_enable();
7518 
7519         /*
7520          * OK, now look at each CPU and see if their feature sets are equal.
7521          */
7522         f0 = argdata;
7523         for (i = 1; i < max_ncpus; i++) {
7524                 uchar_t *fset;
7525                 if (!CPU_IN_SET(cpuset, i))
7526                         continue;
7527 
7528                 fset = (uchar_t *)((uintptr_t)argdata +
7529                     sizeof (x86_featureset) * i);
7530 
7531                 if (!compare_x86_featureset(f0, fset)) {
7532                         panic("Post microcode update CPU %d has "
7533                             "differing security feature (%p) set from CPU 0 "
7534                             "(%p), not appending to feature set", i,
7535                             (void *)fset, (void *)f0);
7536                 }
7537         }
7538 
7539         mutex_exit(&cpu_lock);
7540 
7541         for (i = 0; i < NUM_X86_FEATURES; i++) {
7542                 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7543                     x86_feature_names[i]);
7544                 if (is_x86_feature(f0, i)) {
7545                         add_x86_feature(x86_featureset, i);
7546                 }
7547         }
7548         kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7549 }