1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
  26  */
  27 /*
  28  * Copyright (c) 2010, Intel Corporation.
  29  * All rights reserved.
  30  */
  31 /*
  32  * Portions Copyright 2009 Advanced Micro Devices, Inc.
  33  */
  34 /*
  35  * Copyright 2019 Joyent, Inc.
  36  */
  37 
  38 /*
  39  * CPU Identification logic
  40  *
  41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
  42  * with the identification of CPUs, their features, and their topologies. More
  43  * specifically, this file helps drive the following:
  44  *
  45  * 1. Enumeration of features of the processor which are used by the kernel to
  46  *    determine what features to enable or disable. These may be instruction set
  47  *    enhancements or features that we use.
  48  *
  49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
  50  *    will be told about through the auxiliary vector.
  51  *
  52  * 3. Understanding the physical topology of the CPU such as the number of
  53  *    caches, how many cores it has, whether or not it supports symmetric
  54  *    multi-processing (SMT), etc.
  55  *
  56  * ------------------------
  57  * CPUID History and Basics
  58  * ------------------------
  59  *
  60  * The cpuid instruction was added by Intel roughly around the time that the
  61  * original Pentium was introduced. The purpose of cpuid was to tell in a
  62  * programmatic fashion information about the CPU that previously was guessed
  63  * at. For example, an important part of cpuid is that we can know what
  64  * extensions to the ISA exist. If you use an invalid opcode you would get a
  65  * #UD, so this method allows a program (whether a user program or the kernel)
  66  * to determine what exists without crashing or getting a SIGILL. Of course,
  67  * this was also during the era of the clones and the AMD Am5x86. The vendor
  68  * name shows up first in cpuid for a reason.
  69  *
  70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
  71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
  72  * its own meaning. The different leaves are broken down into different regions:
  73  *
  74  *      [ 0, 7fffffff ]                 This region is called the 'basic'
  75  *                                      region. This region is generally defined
  76  *                                      by Intel, though some of the original
  77  *                                      portions have different meanings based
  78  *                                      on the manufacturer. These days, Intel
  79  *                                      adds most new features to this region.
  80  *                                      AMD adds non-Intel compatible
  81  *                                      information in the third, extended
  82  *                                      region. Intel uses this for everything
  83  *                                      including ISA extensions, CPU
  84  *                                      features, cache information, topology,
  85  *                                      and more.
  86  *
  87  *                                      There is a hole carved out of this
  88  *                                      region which is reserved for
  89  *                                      hypervisors.
  90  *
  91  *      [ 40000000, 4fffffff ]          This region, which is found in the
  92  *                                      middle of the previous region, is
  93  *                                      explicitly promised to never be used by
  94  *                                      CPUs. Instead, it is used by hypervisors
  95  *                                      to communicate information about
  96  *                                      themselves to the operating system. The
  97  *                                      values and details are unique for each
  98  *                                      hypervisor.
  99  *
 100  *      [ 80000000, ffffffff ]          This region is called the 'extended'
 101  *                                      region. Some of the low leaves mirror
 102  *                                      parts of the basic leaves. This region
 103  *                                      has generally been used by AMD for
 104  *                                      various extensions. For example, AMD-
 105  *                                      specific information about caches,
 106  *                                      features, and topology are found in this
 107  *                                      region.
 108  *
 109  * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
 110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
 111  * the ranges, one of the primary things returned is the maximum valid leaf in
 112  * that range. This allows for discovery of what range of CPUID is valid.
 113  *
 114  * The CPUs have potentially surprising behavior when using an invalid leaf or
 115  * unimplemented leaf. If the requested leaf is within the valid basic or
 116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
 117  * set to zero. However, if you specify a leaf that is outside of a valid range,
 118  * then instead it will be filled with the last valid _basic_ leaf. For example,
 119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
 120  * an invalid extended leaf will return the information for leaf 3.
 121  *
 122  * Some leaves are broken down into sub-leaves. This means that the value
 123  * depends on both the leaf asked for in %eax and a secondary register. For
 124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
 125  * additional information. Or when getting topology information in leaf 0xb, the
 126  * initial value in %ecx changes which level of the topology that you are
 127  * getting information about.
 128  *
 129  * cpuid values are always kept to 32 bits regardless of whether or not the
 130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
 131  * 32 bits of the register are always set to zero so that way the values are the
 132  * same regardless of execution mode.
 133  *
 134  * ----------------------
 135  * Identifying Processors
 136  * ----------------------
 137  *
 138  * We can identify a processor in two steps. The first step looks at cpuid leaf
 139  * 0. Leaf 0 contains the processor's vendor information. This is done by
 140  * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
 141  * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
 142  *
 143  * From there, a processor is identified by a combination of three different
 144  * values:
 145  *
 146  *  1. Family
 147  *  2. Model
 148  *  3. Stepping
 149  *
 150  * Each vendor uses the family and model to uniquely identify a processor. The
 151  * way that family and model are changed depends on the vendor. For example,
 152  * Intel has been using family 0x6 for almost all of their processor since the
 153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
 154  * identify the exact processor. Different models are often used for the client
 155  * (consumer) and server parts. Even though each processor often has major
 156  * architectural differences, they still are considered the same family by
 157  * Intel.
 158  *
 159  * On the other hand, each major AMD architecture generally has its own family.
 160  * For example, the K8 is family 0x10, Bulldozer 0x15, and Zen 0x17. Within it
 161  * the model number is used to help identify specific processors.
 162  *
 163  * The stepping is used to refer to a revision of a specific microprocessor. The
 164  * term comes from equipment used to produce masks that are used to create
 165  * integrated circuits.
 166  *
 167  * The information is present in leaf 1, %eax. In technical documentation you
 168  * will see the terms extended model and extended family. The original family,
 169  * model, and stepping fields were each 4 bits wide. If the values in either
 170  * are 0xf, then one is to consult the extended model and extended family, which
 171  * take previously reserved bits and allow for a larger number of models and add
 172  * 0xf to them.
 173  *
 174  * When we process this information, we store the full family, model, and
 175  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
 176  * cpi_step, respectively. Whenever you are performing comparisons with the
 177  * family, model, and stepping, you should use these members and not the raw
 178  * values from cpuid. If you must use the raw values from cpuid directly, you
 179  * must make sure that you add the extended model and family to the base model
 180  * and family.
 181  *
 182  * In general, we do not use information about the family, model, and stepping
 183  * to determine whether or not a feature is present; that is generally driven by
 184  * specific leaves. However, when something we care about on the processor is
 185  * not considered 'architectural' meaning that it is specific to a set of
 186  * processors and not promised in the architecture model to be consistent from
 187  * generation to generation, then we will fall back on this information. The
 188  * most common cases where this comes up is when we have to workaround errata in
 189  * the processor, are dealing with processor-specific features such as CPU
 190  * performance counters, or we want to provide additional information for things
 191  * such as fault management.
 192  *
 193  * While processors also do have a brand string, which is the name that people
 194  * are familiar with when buying the processor, they are not meant for
 195  * programmatic consumption. That is what the family, model, and stepping are
 196  * for.
 197  *
 198  * ------------
 199  * CPUID Passes
 200  * ------------
 201  *
 202  * As part of performing feature detection, we break this into several different
 203  * passes. The passes are as follows:
 204  *
 205  *      Pass 0          This is a primordial pass done in locore.s to deal with
 206  *                      Cyrix CPUs that don't support cpuid. The reality is that
 207  *                      we likely don't run on them any more, but there is still
 208  *                      logic for handling them.
 209  *
 210  *      Pass 1          This is the primary pass and is responsible for doing a
 211  *                      large number of different things:
 212  *
 213  *                      1. Determine which vendor manufactured the CPU and
 214  *                      determining the family, model, and stepping information.
 215  *
 216  *                      2. Gathering a large number of feature flags to
 217  *                      determine which features the CPU support and which
 218  *                      indicate things that we need to do other work in the OS
 219  *                      to enable. Features detected this way are added to the
 220  *                      x86_featureset which can be queried to
 221  *                      determine what we should do. This includes processing
 222  *                      all of the basic and extended CPU features that we care
 223  *                      about.
 224  *
 225  *                      3. Determining the CPU's topology. This includes
 226  *                      information about how many cores and threads are present
 227  *                      in the package. It also is responsible for figuring out
 228  *                      which logical CPUs are potentially part of the same core
 229  *                      and what other resources they might share. For more
 230  *                      information see the 'Topology' section.
 231  *
 232  *                      4. Determining the set of CPU security-specific features
 233  *                      that we need to worry about and determine the
 234  *                      appropriate set of workarounds.
 235  *
 236  *                      Pass 1 on the boot CPU occurs before KMDB is started.
 237  *
 238  *      Pass 2          The second pass is done after startup(). Here, we check
 239  *                      other miscellaneous features. Most of this is gathering
 240  *                      additional basic and extended features that we'll use in
 241  *                      later passes or for debugging support.
 242  *
 243  *      Pass 3          The third pass occurs after the kernel memory allocator
 244  *                      has been fully initialized. This gathers information
 245  *                      where we might need dynamic memory available for our
 246  *                      uses. This includes several varying width leaves that
 247  *                      have cache information and the processor's brand string.
 248  *
 249  *      Pass 4          The fourth and final normal pass is performed after the
 250  *                      kernel has brought most everything online. This is
 251  *                      invoked from post_startup(). In this pass, we go through
 252  *                      the set of features that we have enabled and turn that
 253  *                      into the hardware auxiliary vector features that
 254  *                      userland receives. This is used by userland, primarily
 255  *                      by the run-time link-editor (RTLD), though userland
 256  *                      software could also refer to it directly.
 257  *
 258  *      Microcode       After a microcode update, we do a selective rescan of
 259  *                      the cpuid leaves to determine what features have
 260  *                      changed. Microcode updates can provide more details
 261  *                      about security related features to deal with issues like
 262  *                      Spectre and L1TF. On occasion, vendors have violated
 263  *                      their contract and removed bits. However, we don't try
 264  *                      to detect that because that puts us in a situation that
 265  *                      we really can't deal with. As such, the only thing we
 266  *                      rescan are security related features today. See
 267  *                      cpuid_pass_ucode().
 268  *
 269  * All of the passes (except pass 0) are run on all CPUs. However, for the most
 270  * part we only care about what the boot CPU says about this information and use
 271  * the other CPUs as a rough guide to sanity check that we have the same feature
 272  * set.
 273  *
 274  * We do not support running multiple logical CPUs with disjoint, let alone
 275  * different, feature sets.
 276  *
 277  * ------------------
 278  * Processor Topology
 279  * ------------------
 280  *
 281  * One of the important things that we need to do is to understand the topology
 282  * of the underlying processor. When we say topology in this case, we're trying
 283  * to understand the relationship between the logical CPUs that the operating
 284  * system sees and the underlying physical layout. Different logical CPUs may
 285  * share different resources which can have important consequences for the
 286  * performance of the system. For example, they may share caches, execution
 287  * units, and more.
 288  *
 289  * The topology of the processor changes from generation to generation and
 290  * vendor to vendor.  Along with that, different vendors use different
 291  * terminology, and the operating system itself uses occasionally overlapping
 292  * terminology. It's important to understand what this topology looks like so
 293  * one can understand the different things that we try to calculate and
 294  * determine.
 295  *
 296  * To get started, let's talk about a little bit of terminology that we've used
 297  * so far, is used throughout this file, and is fairly generic across multiple
 298  * vendors:
 299  *
 300  * CPU
 301  *      A central processing unit (CPU) refers to a logical and/or virtual
 302  *      entity that the operating system can execute instructions on. The
 303  *      underlying resources for this CPU may be shared between multiple
 304  *      entities; however, to the operating system it is a discrete unit.
 305  *
 306  * PROCESSOR and PACKAGE
 307  *
 308  *      Generally, when we use the term 'processor' on its own, we are referring
 309  *      to the physical entity that one buys and plugs into a board. However,
 310  *      because processor has been overloaded and one might see it used to mean
 311  *      multiple different levels, we will instead use the term 'package' for
 312  *      the rest of this file. The term package comes from the electrical
 313  *      engineering side and refers to the physical entity that encloses the
 314  *      electronics inside. Strictly speaking the package can contain more than
 315  *      just the CPU, for example, on many processors it may also have what's
 316  *      called an 'integrated graphical processing unit (GPU)'. Because the
 317  *      package can encapsulate multiple units, it is the largest physical unit
 318  *      that we refer to.
 319  *
 320  * SOCKET
 321  *
 322  *      A socket refers to unit on a system board (generally the motherboard)
 323  *      that can receive a package. A single package, or processor, is plugged
 324  *      into a single socket. A system may have multiple sockets. Often times,
 325  *      the term socket is used interchangeably with package and refers to the
 326  *      electrical component that has plugged in, and not the receptacle itself.
 327  *
 328  * CORE
 329  *
 330  *      A core refers to the physical instantiation of a CPU, generally, with a
 331  *      full set of hardware resources available to it. A package may contain
 332  *      multiple cores inside of it or it may just have a single one. A
 333  *      processor with more than one core is often referred to as 'multi-core'.
 334  *      In illumos, we will use the feature X86FSET_CMP to refer to a system
 335  *      that has 'multi-core' processors.
 336  *
 337  *      A core may expose a single logical CPU to the operating system, or it
 338  *      may expose multiple CPUs, which we call threads, defined below.
 339  *
 340  *      Some resources may still be shared by cores in the same package. For
 341  *      example, many processors will share the level 3 cache between cores.
 342  *      Some AMD generations share hardware resources between cores. For more
 343  *      information on that see the section 'AMD Topology'.
 344  *
 345  * THREAD and STRAND
 346  *
 347  *      In this file, generally a thread refers to a hardware resources and not
 348  *      the operating system's logical abstraction. A thread is always exposed
 349  *      as an independent logical CPU to the operating system. A thread belongs
 350  *      to a specific core. A core may have more than one thread. When that is
 351  *      the case, the threads that are part of the same core are often referred
 352  *      to as 'siblings'.
 353  *
 354  *      When multiple threads exist, this is generally referred to as
 355  *      simultaneous multi-threading (SMT). When Intel introduced this in their
 356  *      processors they called it hyper-threading (HT). When multiple threads
 357  *      are active in a core, they split the resources of the core. For example,
 358  *      two threads may share the same set of hardware execution units.
 359  *
 360  *      The operating system often uses the term 'strand' to refer to a thread.
 361  *      This helps disambiguate it from the software concept.
 362  *
 363  * CHIP
 364  *
 365  *      Unfortunately, the term 'chip' is dramatically overloaded. At its most
 366  *      base meaning, it is used to refer to a single integrated circuit, which
 367  *      may or may not be the only thing in the package. In illumos, when you
 368  *      see the term 'chip' it is almost always referring to the same thing as
 369  *      the 'package'. However, many vendors may use chip to refer to one of
 370  *      many integrated circuits that have been placed in the package. As an
 371  *      example, see the subsequent definition.
 372  *
 373  *      To try and keep things consistent, we will only use chip when referring
 374  *      to the entire integrated circuit package, with the exception of the
 375  *      definition of multi-chip module (because it is in the name) and use the
 376  *      term 'die' when we want the more general, potential sub-component
 377  *      definition.
 378  *
 379  * DIE
 380  *
 381  *      A die refers to an integrated circuit. Inside of the package there may
 382  *      be a single die or multiple dies. This is sometimes called a 'chip' in
 383  *      vendor's parlance, but in this file, we use the term die to refer to a
 384  *      subcomponent.
 385  *
 386  * MULTI-CHIP MODULE
 387  *
 388  *      A multi-chip module (MCM) refers to putting multiple distinct chips that
 389  *      are connected together in the same package. When a multi-chip design is
 390  *      used, generally each chip is manufactured independently and then joined
 391  *      together in the package. For example, on AMD's Zen microarchitecture
 392  *      (family 0x17), the package contains several dies (the second meaning of
 393  *      chip from above) that are connected together.
 394  *
 395  * CACHE
 396  *
 397  *      A cache is a part of the processor that maintains copies of recently
 398  *      accessed memory. Caches are split into levels and then into types.
 399  *      Commonly there are one to three levels, called level one, two, and
 400  *      three. The lower the level, the smaller it is, the closer it is to the
 401  *      execution units of the CPU, and the faster it is to access. The layout
 402  *      and design of the cache come in many different flavors, consult other
 403  *      resources for a discussion of those.
 404  *
 405  *      Caches are generally split into two types, the instruction and data
 406  *      cache. The caches contain what their names suggest, the instruction
 407  *      cache has executable program text, while the data cache has all other
 408  *      memory that the processor accesses. As of this writing, data is kept
 409  *      coherent between all of the caches on x86, so if one modifies program
 410  *      text before it is executed, that will be in the data cache, and the
 411  *      instruction cache will be synchronized with that change when the
 412  *      processor actually executes those instructions. This coherency also
 413  *      covers the fact that data could show up in multiple caches.
 414  *
 415  *      Generally, the lowest level caches are specific to a core. However, the
 416  *      last layer cache is shared between some number of cores. The number of
 417  *      CPUs sharing this last level cache is important. This has implications
 418  *      for the choices that the scheduler makes, as accessing memory that might
 419  *      be in a remote cache after thread migration can be quite expensive.
 420  *
 421  *      Sometimes, the word cache is abbreviated with a '$', because in US
 422  *      English the word cache is pronounced the same as cash. So L1D$ refers to
 423  *      the L1 data cache, and L2$ would be the L2 cache. This will not be used
 424  *      in the rest of this theory statement for clarity.
 425  *
 426  * MEMORY CONTROLLER
 427  *
 428  *      The memory controller is a component that provides access to DRAM. Each
 429  *      memory controller can access a set number of DRAM channels. Each channel
 430  *      can have a number of DIMMs (sticks of memory) associated with it. A
 431  *      given package may have more than one memory controller. The association
 432  *      of the memory controller to a group of cores is important as it is
 433  *      cheaper to access memory on the controller that you are associated with.
 434  *
 435  * NUMA
 436  *
 437  *      NUMA or non-uniform memory access, describes a way that systems are
 438  *      built. On x86, any processor core can address all of the memory in the
 439  *      system. However, When using multiple sockets or possibly within a
 440  *      multi-chip module, some of that memory is physically closer and some of
 441  *      it is further. Memory that is further away is more expensive to access.
 442  *      Consider the following image of multiple sockets with memory:
 443  *
 444  *      +--------+                                                +--------+
 445  *      | DIMM A |         +----------+      +----------+         | DIMM D |
 446  *      +--------+-+       |          |      |          |       +-+------+-+
 447  *        | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
 448  *        +--------+-+     |          |      |          |     +-+------+-+
 449  *          | DIMM C |     +----------+      +----------+     | DIMM F |
 450  *          +--------+                                        +--------+
 451  *
 452  *      In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
 453  *      closer to DIMMs D-F. This means that it is cheaper for socket 0 to
 454  *      access DIMMs A-C and more expensive to access D-F as it has to go
 455  *      through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
 456  *      D-F are cheaper than A-C. While the socket form is the most common, when
 457  *      using multi-chip modules, this can also sometimes occur. For another
 458  *      example of this that's more involved, see the AMD topology section.
 459  *
 460  *
 461  * Intel Topology
 462  * --------------
 463  *
 464  * Most Intel processors since Nehalem, (as of this writing the current gen
 465  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
 466  * the package is a single monolithic die. MCMs currently aren't used. Most
 467  * parts have three levels of caches, with the L3 cache being shared between
 468  * all of the cores on the package. The L1/L2 cache is generally specific to
 469  * an individual core. The following image shows at a simplified level what
 470  * this looks like. The memory controller is commonly part of something called
 471  * the 'Uncore', that used to be separate physical chips that were not a part of
 472  * the package, but are now part of the same chip.
 473  *
 474  *  +-----------------------------------------------------------------------+
 475  *  | Package                                                               |
 476  *  |  +-------------------+  +-------------------+  +-------------------+  |
 477  *  |  | Core              |  | Core              |  | Core              |  |
 478  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 479  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
 480  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
 481  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
 482  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
 483  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 484  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 485  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
 486  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 487  *  |  +-------------------+  +-------------------+  +-------------------+  |
 488  *  | +-------------------------------------------------------------------+ |
 489  *  | |                         Shared L3 Cache                           | |
 490  *  | +-------------------------------------------------------------------+ |
 491  *  | +-------------------------------------------------------------------+ |
 492  *  | |                        Memory Controller                          | |
 493  *  | +-------------------------------------------------------------------+ |
 494  *  +-----------------------------------------------------------------------+
 495  *
 496  * A side effect of this current architecture is that what we care about from a
 497  * scheduling and topology perspective, is simplified. In general we care about
 498  * understanding which logical CPUs are part of the same core and socket.
 499  *
 500  * To determine the relationship between threads and cores, Intel initially used
 501  * the identifier in the advanced programmable interrupt controller (APIC). They
 502  * also added cpuid leaf 4 to give additional information about the number of
 503  * threads and CPUs in the processor. With the addition of x2apic (which
 504  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
 505  * additional cpuid topology leaf 0xB was added.
 506  *
 507  * AMD Topology
 508  * ------------
 509  *
 510  * When discussing AMD topology, we want to break this into three distinct
 511  * generations of topology. There's the basic topology that has been used in
 512  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
 513  * with family 0x15 (Bulldozer), and there's the topology that was introduced
 514  * with family 0x17 (Zen). AMD also has some additional terminology that's worth
 515  * talking about.
 516  *
 517  * Until the introduction of family 0x17 (Zen), AMD did not implement something
 518  * that they considered SMT. Whether or not the AMD processors have SMT
 519  * influences many things including scheduling and reliability, availability,
 520  * and serviceability (RAS) features.
 521  *
 522  * NODE
 523  *
 524  *      AMD uses the term node to refer to a die that contains a number of cores
 525  *      and I/O resources. Depending on the processor family and model, more
 526  *      than one node can be present in the package. When there is more than one
 527  *      node this indicates a multi-chip module. Usually each node has its own
 528  *      access to memory and I/O devices. This is important and generally
 529  *      different from the corresponding Intel Nehalem-Skylake+ processors. As a
 530  *      result, we track this relationship in the operating system.
 531  *
 532  *      In processors with an L3 cache, the L3 cache is generally shared across
 533  *      the entire node, though the way this is carved up varies from generation
 534  *      to generation.
 535  *
 536  * BULLDOZER
 537  *
 538  *      Starting with the Bulldozer family (0x15) and continuing until the
 539  *      introduction of the Zen microarchitecture, AMD introduced the idea of a
 540  *      compute unit. In a compute unit, two traditional cores share a number of
 541  *      hardware resources. Critically, they share the FPU, L1 instruction
 542  *      cache, and the L2 cache. Several compute units were then combined inside
 543  *      of a single node.  Because the integer execution units, L1 data cache,
 544  *      and some other resources were not shared between the cores, AMD never
 545  *      considered this to be SMT.
 546  *
 547  * ZEN
 548  *
 549  *      The Zen family (0x17) uses a multi-chip module (MCM) design, the module
 550  *      is called Zeppelin. These modules are similar to the idea of nodes used
 551  *      previously. Each of these nodes has two DRAM channels which all of the
 552  *      cores in the node can access uniformly. These nodes are linked together
 553  *      in the package, creating a NUMA environment.
 554  *
 555  *      The Zeppelin die itself contains two different 'core complexes'. Each
 556  *      core complex consists of four cores which each have two threads, for a
 557  *      total of 8 logical CPUs per complex. Unlike other generations,
 558  *      where all the logical CPUs in a given node share the L3 cache, here each
 559  *      core complex has its own shared L3 cache.
 560  *
 561  *      A further thing that we need to consider is that in some configurations,
 562  *      particularly with the Threadripper line of processors, not every die
 563  *      actually has its memory controllers wired up to actual memory channels.
 564  *      This means that some cores have memory attached to them and others
 565  *      don't.
 566  *
 567  *      To put Zen in perspective, consider the following images:
 568  *
 569  *      +--------------------------------------------------------+
 570  *      | Core Complex                                           |
 571  *      | +-------------------+    +-------------------+  +---+  |
 572  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
 573  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
 574  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
 575  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
 576  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
 577  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 578  *      | +-------------------+    +-------------------+  | C |  |
 579  *      | +-------------------+    +-------------------+  | a |  |
 580  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
 581  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
 582  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
 583  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
 584  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
 585  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 586  *      | +-------------------+    +-------------------+  +---+  |
 587  *      |                                                        |
 588  *      +--------------------------------------------------------+
 589  *
 590  *  This first image represents a single Zen core complex that consists of four
 591  *  cores.
 592  *
 593  *
 594  *      +--------------------------------------------------------+
 595  *      | Zeppelin Die                                           |
 596  *      |  +--------------------------------------------------+  |
 597  *      |  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
 598  *      |  +--------------------------------------------------+  |
 599  *      |                           HH                           |
 600  *      |          +-----------+    HH    +-----------+          |
 601  *      |          |           |    HH    |           |          |
 602  *      |          |    Core   |==========|    Core   |          |
 603  *      |          |  Complex  |==========|  Complex  |          |
 604  *      |          |           |    HH    |           |          |
 605  *      |          +-----------+    HH    +-----------+          |
 606  *      |                           HH                           |
 607  *      |  +--------------------------------------------------+  |
 608  *      |  |                Memory Controller                 |  |
 609  *      |  +--------------------------------------------------+  |
 610  *      |                                                        |
 611  *      +--------------------------------------------------------+
 612  *
 613  *  This image represents a single Zeppelin Die. Note how both cores are
 614  *  connected to the same memory controller and I/O units. While each core
 615  *  complex has its own L3 cache as seen in the first image, they both have
 616  *  uniform access to memory.
 617  *
 618  *
 619  *                      PP                     PP
 620  *                      PP                     PP
 621  *           +----------PP---------------------PP---------+
 622  *           |          PP                     PP         |
 623  *           |    +-----------+          +-----------+    |
 624  *           |    |           |          |           |    |
 625  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 626  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 627  *           |    |           |          |           |    |
 628  *           |    +-----------+ooo    ...+-----------+    |
 629  *           |          HH      ooo  ...       HH         |
 630  *           |          HH        oo..         HH         |
 631  *           |          HH        ..oo         HH         |
 632  *           |          HH      ...  ooo       HH         |
 633  *           |    +-----------+...    ooo+-----------+    |
 634  *           |    |           |          |           |    |
 635  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 636  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 637  *           |    |           |          |           |    |
 638  *           |    +-----------+          +-----------+    |
 639  *           |          PP                     PP         |
 640  *           +----------PP---------------------PP---------+
 641  *                      PP                     PP
 642  *                      PP                     PP
 643  *
 644  *  This image represents a single Zen package. In this example, it has four
 645  *  Zeppelin dies, though some configurations only have a single one. In this
 646  *  example, each die is directly connected to the next. Also, each die is
 647  *  represented as being connected to memory by the 'M' character and connected
 648  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
 649  *  die is made up of two core complexes, we have multiple different NUMA
 650  *  domains that we care about for these systems.
 651  *
 652  * CPUID LEAVES
 653  *
 654  * There are a few different CPUID leaves that we can use to try and understand
 655  * the actual state of the world. As part of the introduction of family 0xf, AMD
 656  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
 657  * processors that are in the system. Because families before Zen didn't have
 658  * SMT, this was always the number of cores that were in the system. However, it
 659  * should always be thought of as the number of logical threads to be consistent
 660  * between generations. In addition we also get the size of the APIC ID that is
 661  * used to represent the number of logical processors. This is important for
 662  * deriving topology information.
 663  *
 664  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
 665  * bit between Bulldozer and later families, but it is quite useful in
 666  * determining the topology information. Because this information has changed
 667  * across family generations, it's worth calling out what these mean
 668  * explicitly. The registers have the following meanings:
 669  *
 670  *      %eax    The APIC ID. The entire register is defined to have a 32-bit
 671  *              APIC ID, even though on systems without x2apic support, it will
 672  *              be limited to 8 bits.
 673  *
 674  *      %ebx    On Bulldozer-era systems this contains information about the
 675  *              number of cores that are in a compute unit (cores that share
 676  *              resources). It also contains a per-package compute unit ID that
 677  *              identifies which compute unit the logical CPU is a part of.
 678  *
 679  *              On Zen-era systems this instead contains the number of threads
 680  *              per core and the ID of the core that the logical CPU is a part
 681  *              of. Note, this ID is unique only to the package, it is not
 682  *              globally unique across the entire system.
 683  *
 684  *      %ecx    This contains the number of nodes that exist in the package. It
 685  *              also contains an ID that identifies which node the logical CPU
 686  *              is a part of.
 687  *
 688  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
 689  * cache layout to determine which logical CPUs are sharing which caches.
 690  *
 691  * illumos Topology
 692  * ----------------
 693  *
 694  * Based on the above we synthesize the information into several different
 695  * variables that we store in the 'struct cpuid_info'. We'll go into the details
 696  * of what each member is supposed to represent and their uniqueness. In
 697  * general, there are two levels of uniqueness that we care about. We care about
 698  * an ID that is globally unique. That means that it will be unique across all
 699  * entities in the system. For example, the default logical CPU ID is globally
 700  * unique. On the other hand, there is some information that we only care about
 701  * being unique within the context of a single package / socket. Here are the
 702  * variables that we keep track of and their meaning.
 703  *
 704  * Several of the values that are asking for an identifier, with the exception
 705  * of cpi_apicid, are allowed to be synthetic.
 706  *
 707  *
 708  * cpi_apicid
 709  *
 710  *      This is the value of the CPU's APIC id. This should be the full 32-bit
 711  *      ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
 712  *      APIC ID. This value is globally unique between all logical CPUs across
 713  *      all packages. This is usually required by the APIC.
 714  *
 715  * cpi_chipid
 716  *
 717  *      This value indicates the ID of the package that the logical CPU is a
 718  *      part of. This value is allowed to be synthetic. It is usually derived by
 719  *      taking the CPU's APIC ID and determining how many bits are used to
 720  *      represent CPU cores in the package. All logical CPUs that are part of
 721  *      the same package must have the same value.
 722  *
 723  * cpi_coreid
 724  *
 725  *      This represents the ID of a CPU core. Two logical CPUs should only have
 726  *      the same cpi_coreid value if they are part of the same core. These
 727  *      values may be synthetic. On systems that support SMT, this value is
 728  *      usually derived from the APIC ID, otherwise it is often synthetic and
 729  *      just set to the value of the cpu_id in the cpu_t.
 730  *
 731  * cpi_pkgcoreid
 732  *
 733  *      This is similar to the cpi_coreid in that logical CPUs that are part of
 734  *      the same core should have the same ID. The main difference is that these
 735  *      values are only required to be unique to a given socket.
 736  *
 737  * cpi_clogid
 738  *
 739  *      This represents the logical ID of a logical CPU. This value should be
 740  *      unique within a given socket for each logical CPU. This is allowed to be
 741  *      synthetic, though it is usually based off of the CPU's apic ID. The
 742  *      broader system expects that logical CPUs that have are part of the same
 743  *      core have contiguous numbers. For example, if there were two threads per
 744  *      core, then the core IDs divided by two should be the same and the first
 745  *      modulus two should be zero and the second one. For example, IDs 4 and 5
 746  *      indicate two logical CPUs that are part of the same core. But IDs 5 and
 747  *      6 represent two logical CPUs that are part of different cores.
 748  *
 749  *      While it is common for the cpi_coreid and the cpi_clogid to be derived
 750  *      from the same source, strictly speaking, they don't have to be and the
 751  *      two values should be considered logically independent. One should not
 752  *      try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
 753  *      some kind of relationship. While this is tempting, we've seen cases on
 754  *      AMD family 0xf where the system's cpu id is not related to its APIC ID.
 755  *
 756  * cpi_ncpu_per_chip
 757  *
 758  *      This value indicates the total number of logical CPUs that exist in the
 759  *      physical package. Critically, this is not the number of logical CPUs
 760  *      that exist for just the single core.
 761  *
 762  *      This value should be the same for all logical CPUs in the same package.
 763  *
 764  * cpi_ncore_per_chip
 765  *
 766  *      This value indicates the total number of physical CPU cores that exist
 767  *      in the package. The system compares this value with cpi_ncpu_per_chip to
 768  *      determine if simultaneous multi-threading (SMT) is enabled. When
 769  *      cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
 770  *      the X86FSET_HTT feature is not set. If this value is greater than one,
 771  *      than we consider the processor to have the feature X86FSET_CMP, to
 772  *      indicate that there is support for more than one core.
 773  *
 774  *      This value should be the same for all logical CPUs in the same package.
 775  *
 776  * cpi_procnodes_per_pkg
 777  *
 778  *      This value indicates the number of 'nodes' that exist in the package.
 779  *      When processors are actually a multi-chip module, this represents the
 780  *      number of such modules that exist in the package. Currently, on Intel
 781  *      based systems this member is always set to 1.
 782  *
 783  *      This value should be the same for all logical CPUs in the same package.
 784  *
 785  * cpi_procnodeid
 786  *
 787  *      This value indicates the ID of the node that the logical CPU is a part
 788  *      of. All logical CPUs that are in the same node must have the same value
 789  *      here. This value must be unique across all of the packages in the
 790  *      system.  On Intel based systems, this is currently set to the value in
 791  *      cpi_chipid because there is only one node.
 792  *
 793  * cpi_cores_per_compunit
 794  *
 795  *      This value indicates the number of cores that are part of a compute
 796  *      unit. See the AMD topology section for this. This member only has real
 797  *      meaning currently for AMD Bulldozer family processors. For all other
 798  *      processors, this should currently be set to 1.
 799  *
 800  * cpi_compunitid
 801  *
 802  *      This indicates the compute unit that the logical CPU belongs to. For
 803  *      processors without AMD Bulldozer-style compute units this should be set
 804  *      to the value of cpi_coreid.
 805  *
 806  * cpi_ncpu_shr_last_cache
 807  *
 808  *      This indicates the number of logical CPUs that are sharing the same last
 809  *      level cache. This value should be the same for all CPUs that are sharing
 810  *      that cache. The last cache refers to the cache that is closest to memory
 811  *      and furthest away from the CPU.
 812  *
 813  * cpi_last_lvl_cacheid
 814  *
 815  *      This indicates the ID of the last cache that the logical CPU uses. This
 816  *      cache is often shared between multiple logical CPUs and is the cache
 817  *      that is closest to memory and furthest away from the CPU. This value
 818  *      should be the same for a group of logical CPUs only if they actually
 819  *      share the same last level cache. IDs should not overlap between
 820  *      packages.
 821  *
 822  * cpi_ncore_bits
 823  *
 824  *      This indicates the number of bits that are required to represent all of
 825  *      the cores in the system. As cores are derived based on their APIC IDs,
 826  *      we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
 827  *      this value to be larger than the actual number of IDs that are present
 828  *      in the system. This is used to size tables by the CMI framework. It is
 829  *      only filled in for Intel and AMD CPUs.
 830  *
 831  * cpi_nthread_bits
 832  *
 833  *      This indicates the number of bits required to represent all of the IDs
 834  *      that cover the logical CPUs that exist on a given core. It's OK for this
 835  *      value to be larger than the actual number of IDs that are present in the
 836  *      system.  This is used to size tables by the CMI framework. It is
 837  *      only filled in for Intel and AMD CPUs.
 838  *
 839  * -----------
 840  * Hypervisors
 841  * -----------
 842  *
 843  * If trying to manage the differences between vendors wasn't bad enough, it can
 844  * get worse thanks to our friend hardware virtualization. Hypervisors are given
 845  * the ability to interpose on all cpuid instructions and change them to suit
 846  * their purposes. In general, this is necessary as the hypervisor wants to be
 847  * able to present a more uniform set of features or not necessarily give the
 848  * guest operating system kernel knowledge of all features so it can be
 849  * more easily migrated between systems.
 850  *
 851  * When it comes to trying to determine topology information, this can be a
 852  * double edged sword. When a hypervisor doesn't actually implement a cpuid
 853  * leaf, it'll often return all zeros. Because of that, you'll often see various
 854  * checks scattered about fields being non-zero before we assume we can use
 855  * them.
 856  *
 857  * When it comes to topology information, the hypervisor is often incentivized
 858  * to lie to you about topology. This is because it doesn't always actually
 859  * guarantee that topology at all. The topology path we take in the system
 860  * depends on how the CPU advertises itself. If it advertises itself as an Intel
 861  * or AMD CPU, then we basically do our normal path. However, when they don't
 862  * use an actual vendor, then that usually turns into multiple one-core CPUs
 863  * that we enumerate that are often on different sockets. The actual behavior
 864  * depends greatly on what the hypervisor actually exposes to us.
 865  *
 866  * --------------------
 867  * Exposing Information
 868  * --------------------
 869  *
 870  * We expose CPUID information in three different forms in the system.
 871  *
 872  * The first is through the x86_featureset variable. This is used in conjunction
 873  * with the is_x86_feature() function. This is queried by x86-specific functions
 874  * to determine which features are or aren't present in the system and to make
 875  * decisions based upon them. For example, users of this include everything from
 876  * parts of the system dedicated to reliability, availability, and
 877  * serviceability (RAS), to making decisions about how to handle security
 878  * mitigations, to various x86-specific drivers. General purpose or
 879  * architecture independent drivers should never be calling this function.
 880  *
 881  * The second means is through the auxiliary vector. The auxiliary vector is a
 882  * series of tagged data that the kernel passes down to a user program when it
 883  * begins executing. This information is used to indicate to programs what
 884  * instruction set extensions are present. For example, information about the
 885  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
 886  * since user programs cannot make use of it. However, things like the AVX
 887  * instruction sets are. Programs use this information to make run-time
 888  * decisions about what features they should use. As an example, the run-time
 889  * link-editor (rtld) can relocate different functions depending on the hardware
 890  * support available.
 891  *
 892  * The final form is through a series of accessor functions that all have the
 893  * form cpuid_get*. This is used by a number of different subsystems in the
 894  * kernel to determine more detailed information about what we're running on,
 895  * topology information, etc. Some of these subsystems include processor groups
 896  * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
 897  * microcode, and performance monitoring. These functions all ASSERT that the
 898  * CPU they're being called on has reached a certain cpuid pass. If the passes
 899  * are rearranged, then this needs to be adjusted.
 900  *
 901  * -----------------------------------------------
 902  * Speculative Execution CPU Side Channel Security
 903  * -----------------------------------------------
 904  *
 905  * With the advent of the Spectre and Meltdown attacks which exploit speculative
 906  * execution in the CPU to create side channels there have been a number of
 907  * different attacks and corresponding issues that the operating system needs to
 908  * mitigate against. The following list is some of the common, but not
 909  * exhaustive, set of issues that we know about and have done some or need to do
 910  * more work in the system to mitigate against:
 911  *
 912  *   - Spectre v1
 913  *   - Spectre v2
 914  *   - Meltdown (Spectre v3)
 915  *   - Rogue Register Read (Spectre v3a)
 916  *   - Speculative Store Bypass (Spectre v4)
 917  *   - ret2spec, SpectreRSB
 918  *   - L1 Terminal Fault (L1TF)
 919  *   - Microarchitectural Data Sampling (MDS)
 920  *
 921  * Each of these requires different sets of mitigations and has different attack
 922  * surfaces. For the most part, this discussion is about protecting the kernel
 923  * from non-kernel executing environments such as user processes and hardware
 924  * virtual machines. Unfortunately, there are a number of user vs. user
 925  * scenarios that exist with these. The rest of this section will describe the
 926  * overall approach that the system has taken to address these as well as their
 927  * shortcomings. Unfortunately, not all of the above have been handled today.
 928  *
 929  * SPECTRE FAMILY (Spectre v2, ret2spec, SpectreRSB)
 930  *
 931  * The second variant of the spectre attack focuses on performing branch target
 932  * injection. This generally impacts indirect call instructions in the system.
 933  * There are three different ways to mitigate this issue that are commonly
 934  * described today:
 935  *
 936  *  1. Using Indirect Branch Restricted Speculation (IBRS).
 937  *  2. Using Retpolines and RSB Stuffing
 938  *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
 939  *
 940  * IBRS uses a feature added to microcode to restrict speculation, among other
 941  * things. This form of mitigation has not been used as it has been generally
 942  * seen as too expensive and requires reactivation upon various transitions in
 943  * the system.
 944  *
 945  * As a less impactful alternative to IBRS, retpolines were developed by
 946  * Google. These basically require one to replace indirect calls with a specific
 947  * trampoline that will cause speculation to fail and break the attack.
 948  * Retpolines require compiler support. We always build with retpolines in the
 949  * external thunk mode. This means that a traditional indirect call is replaced
 950  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
 951  * of this is that all indirect function calls are performed through a register.
 952  *
 953  * We have to use a common external location of the thunk and not inline it into
 954  * the callsite so that way we can have a single place to patch these functions.
 955  * As it turns out, we actually have three different forms of retpolines that
 956  * exist in the system:
 957  *
 958  *  1. A full retpoline
 959  *  2. An AMD-specific optimized retpoline
 960  *  3. A no-op version
 961  *
 962  * The first one is used in the general case. The second one is used if we can
 963  * determine that we're on an AMD system and we can successfully toggle the
 964  * lfence serializing MSR that exists on the platform. Basically with this
 965  * present, an lfence is sufficient and we don't need to do anywhere near as
 966  * complicated a dance to successfully use retpolines.
 967  *
 968  * The third form described above is the most curious. It turns out that the way
 969  * that retpolines are implemented is that they rely on how speculation is
 970  * performed on a 'ret' instruction. Intel has continued to optimize this
 971  * process (which is partly why we need to have return stack buffer stuffing,
 972  * but more on that in a bit) and in processors starting with Cascade Lake
 973  * on the server side, it's dangerous to rely on retpolines. Instead, a new
 974  * mechanism has been introduced called Enhanced IBRS (EIBRS).
 975  *
 976  * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
 977  * physical core. However, if this is the case, we don't want to use retpolines
 978  * any more. Therefore if EIBRS is present, we end up turning each retpoline
 979  * function (called a thunk) into a jmp instruction. This means that we're still
 980  * paying the cost of an extra jump to the external thunk, but it gives us
 981  * flexibility and the ability to have a single kernel image that works across a
 982  * wide variety of systems and hardware features.
 983  *
 984  * Unfortunately, this alone is insufficient. First, Skylake systems have
 985  * additional speculation for the Return Stack Buffer (RSB) which is used to
 986  * return from call instructions which retpolines take advantage of. However,
 987  * this problem is not just limited to Skylake and is actually more pernicious.
 988  * The SpectreRSB paper introduces several more problems that can arise with
 989  * dealing with this. The RSB can be poisoned just like the indirect branch
 990  * predictor. This means that one needs to clear the RSB when transitioning
 991  * between two different privilege domains. Some examples include:
 992  *
 993  *  - Switching between two different user processes
 994  *  - Going between user land and the kernel
 995  *  - Returning to the kernel from a hardware virtual machine
 996  *
 997  * Mitigating this involves combining a couple of different things. The first is
 998  * SMEP (supervisor mode execution protection) which was introduced in Ivy
 999  * Bridge. When an RSB entry refers to a user address and we're executing in the
1000  * kernel, speculation through it will be stopped when SMEP is enabled. This
1001  * protects against a number of the different cases that we would normally be
1002  * worried about such as when we enter the kernel from user land.
1003  *
1004  * To prevent against additional manipulation of the RSB from other contexts
1005  * such as a non-root VMX context attacking the kernel we first look to enhanced
1006  * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1007  * need to do to protect the kernel at this time.
1008  *
1009  * On CPUs without EIBRS we need to manually overwrite the contents of the
1010  * return stack buffer. We do this through the x86_rsb_stuff() function.
1011  * Currently this is employed on context switch. The x86_rsb_stuff() function is
1012  * disabled when enhanced IBRS is present because Intel claims on such systems
1013  * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1014  * to user attacks via the RSB.
1015  *
1016  * If SMEP is not present, then we would have to stuff the RSB every time we
1017  * transitioned from user mode to the kernel, which isn't very practical right
1018  * now.
1019  *
1020  * To fully protect user to user and vmx to vmx attacks from these classes of
1021  * issues, we would also need to allow them to opt into performing an Indirect
1022  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1023  *
1024  * By default, the system will enable RSB stuffing and the required variant of
1025  * retpolines and store that information in the x86_spectrev2_mitigation value.
1026  * This will be evaluated after a microcode update as well, though it is
1027  * expected that microcode updates will not take away features. This may mean
1028  * that a late loaded microcode may not end up in the optimal configuration
1029  * (though this should be rare).
1030  *
1031  * Currently we do not build kmdb with retpolines or perform any additional side
1032  * channel security mitigations for it. One complication with kmdb is that it
1033  * requires its own retpoline thunks and it would need to adjust itself based on
1034  * what the kernel does. The threat model of kmdb is more limited and therefore
1035  * it may make more sense to investigate using prediction barriers as the whole
1036  * system is only executing a single instruction at a time while in kmdb.
1037  *
1038  * SPECTRE FAMILY (v1, v4)
1039  *
1040  * The v1 and v4 variants of spectre are not currently mitigated in the
1041  * system and require other classes of changes to occur in the code.
1042  *
1043  * MELTDOWN
1044  *
1045  * Meltdown, or spectre v3, allowed a user process to read any data in their
1046  * address space regardless of whether or not the page tables in question
1047  * allowed the user to have the ability to read them. The solution to meltdown
1048  * is kernel page table isolation. In this world, there are two page tables that
1049  * are used for a process, one in user land and one in the kernel. To implement
1050  * this we use per-CPU page tables and switch between the user and kernel
1051  * variants when entering and exiting the kernel.  For more information about
1052  * this process and how the trampolines work, please see the big theory
1053  * statements and additional comments in:
1054  *
1055  *  - uts/i86pc/ml/kpti_trampolines.s
1056  *  - uts/i86pc/vm/hat_i86.c
1057  *
1058  * While Meltdown only impacted Intel systems and there are also Intel systems
1059  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1060  * kernel page table isolation enabled. While this may at first seem weird, an
1061  * important thing to remember is that you can't speculatively read an address
1062  * if it's never in your page table at all. Having user processes without kernel
1063  * pages present provides us with an important layer of defense in the kernel
1064  * against any other side channel attacks that exist and have yet to be
1065  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1066  * default, no matter the x86 system.
1067  *
1068  * L1 TERMINAL FAULT
1069  *
1070  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1071  * execution uses page table entries. Effectively, it is two different problems.
1072  * The first is that it ignores the not present bit in the page table entries
1073  * when performing speculative execution. This means that something can
1074  * speculatively read the listed physical address if it's present in the L1
1075  * cache under certain conditions (see Intel's documentation for the full set of
1076  * conditions). Secondly, this can be used to bypass hardware virtualization
1077  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1078  * instructions.
1079  *
1080  * For the non-hardware virtualized case, this is relatively easy to deal with.
1081  * We must make sure that all unmapped pages have an address of zero. This means
1082  * that they could read the first 4k of physical memory; however, we never use
1083  * that first page in the operating system and always skip putting it in our
1084  * memory map, even if firmware tells us we can use it in our memory map. While
1085  * other systems try to put extra metadata in the address and reserved bits,
1086  * which led to this being problematic in those cases, we do not.
1087  *
1088  * For hardware virtual machines things are more complicated. Because they can
1089  * construct their own page tables, it isn't hard for them to perform this
1090  * attack against any physical address. The one wrinkle is that this physical
1091  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1092  * to flush the L1 data cache. We wrap this up in the function
1093  * spec_uarch_flush(). This function is also used in the mitigation of
1094  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1095  * hypervisors such as KVM or bhyve are responsible for performing this before
1096  * entering the guest.
1097  *
1098  * Because this attack takes place in the L1 cache, there's another wrinkle
1099  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1100  * designs. This means that when a thread enters a hardware virtualized context
1101  * and flushes the L1 data cache, the other thread on the processor may then go
1102  * ahead and put new data in it that can be potentially attacked. While one
1103  * solution is to disable SMT on the system, another option that is available is
1104  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1105  * goes through and makes sure that if a HVM is being scheduled on one thread,
1106  * then the thing on the other thread is from the same hardware virtual machine.
1107  * If an interrupt comes in or the guest exits to the broader system, then the
1108  * other SMT thread will be kicked out.
1109  *
1110  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1111  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1112  * perform L1TF related mitigations.
1113  *
1114  * MICROARCHITECTURAL DATA SAMPLING
1115  *
1116  * Microarchitectural data sampling (MDS) is a combination of four discrete
1117  * vulnerabilities that are similar issues affecting various parts of the CPU's
1118  * microarchitectural implementation around load, store, and fill buffers.
1119  * Specifically it is made up of the following subcomponents:
1120  *
1121  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1122  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1123  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1124  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1125  *
1126  * To begin addressing these, Intel has introduced another feature in microcode
1127  * called MD_CLEAR. This changes the verw instruction to operate in a different
1128  * way. This allows us to execute the verw instruction in a particular way to
1129  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1130  * updated when this microcode is present to flush this state.
1131  *
1132  * Primarily we need to flush this state whenever we transition from the kernel
1133  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1134  * little bit different. Here the structures are statically sized when a logical
1135  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1136  * flush the microarchitectural state before the CPU goes idles by calling hlt,
1137  * mwait, or another ACPI method. To perform these flushes, we call
1138  * x86_md_clear() at all of these transition points.
1139  *
1140  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1141  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1142  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1143  * a no-op.
1144  *
1145  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1146  * particular, everything we've discussed above is only valid for a single
1147  * thread executing on a core. In the case where you have hyper-threading
1148  * present, this attack can be performed between threads. The theoretical fix
1149  * for this is to ensure that both threads are always in the same security
1150  * domain. This means that they are executing in the same ring and mutually
1151  * trust each other. Practically speaking, this would mean that a system call
1152  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1153  * Rather than implement this, we recommend that one disables hyper-threading
1154  * through the use of psradm -aS.
1155  *
1156  * SUMMARY
1157  *
1158  * The following table attempts to summarize the mitigations for various issues
1159  * and what's done in various places:
1160  *
1161  *  - Spectre v1: Not currently mitigated
1162  *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1163  *  - Meltdown: Kernel Page Table Isolation
1164  *  - Spectre v3a: Updated CPU microcode
1165  *  - Spectre v4: Not currently mitigated
1166  *  - SpectreRSB: SMEP and RSB Stuffing
1167  *  - L1TF: spec_uarch_flush, smt exclusion, requires microcode
1168  *  - MDS: x86_md_clear, requires microcode, disabling hyper threading
1169  *
1170  * The following table indicates the x86 feature set bits that indicate that a
1171  * given problem has been solved or a notable feature is present:
1172  *
1173  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1174  *  - MDS_NO: All forms of MDS
1175  */
1176 
1177 #include <sys/types.h>
1178 #include <sys/archsystm.h>
1179 #include <sys/x86_archext.h>
1180 #include <sys/kmem.h>
1181 #include <sys/systm.h>
1182 #include <sys/cmn_err.h>
1183 #include <sys/sunddi.h>
1184 #include <sys/sunndi.h>
1185 #include <sys/cpuvar.h>
1186 #include <sys/processor.h>
1187 #include <sys/sysmacros.h>
1188 #include <sys/pg.h>
1189 #include <sys/fp.h>
1190 #include <sys/controlregs.h>
1191 #include <sys/bitmap.h>
1192 #include <sys/auxv_386.h>
1193 #include <sys/memnode.h>
1194 #include <sys/pci_cfgspace.h>
1195 #include <sys/comm_page.h>
1196 #include <sys/mach_mmu.h>
1197 #include <sys/ucode.h>
1198 #include <sys/tsc.h>
1199 #include <sys/kobj.h>
1200 #include <sys/asm_misc.h>
1201 
1202 #ifdef __xpv
1203 #include <sys/hypervisor.h>
1204 #else
1205 #include <sys/ontrap.h>
1206 #endif
1207 
1208 uint_t x86_vendor = X86_VENDOR_IntelClone;
1209 uint_t x86_type = X86_TYPE_OTHER;
1210 uint_t x86_clflush_size = 0;
1211 
1212 #if defined(__xpv)
1213 int x86_use_pcid = 0;
1214 int x86_use_invpcid = 0;
1215 #else
1216 int x86_use_pcid = -1;
1217 int x86_use_invpcid = -1;
1218 #endif
1219 
1220 typedef enum {
1221         X86_SPECTREV2_RETPOLINE,
1222         X86_SPECTREV2_RETPOLINE_AMD,
1223         X86_SPECTREV2_ENHANCED_IBRS,
1224         X86_SPECTREV2_DISABLED
1225 } x86_spectrev2_mitigation_t;
1226 
1227 uint_t x86_disable_spectrev2 = 0;
1228 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1229     X86_SPECTREV2_RETPOLINE;
1230 
1231 uint_t pentiumpro_bug4046376;
1232 
1233 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1234 
1235 static char *x86_feature_names[NUM_X86_FEATURES] = {
1236         "lgpg",
1237         "tsc",
1238         "msr",
1239         "mtrr",
1240         "pge",
1241         "de",
1242         "cmov",
1243         "mmx",
1244         "mca",
1245         "pae",
1246         "cv8",
1247         "pat",
1248         "sep",
1249         "sse",
1250         "sse2",
1251         "htt",
1252         "asysc",
1253         "nx",
1254         "sse3",
1255         "cx16",
1256         "cmp",
1257         "tscp",
1258         "mwait",
1259         "sse4a",
1260         "cpuid",
1261         "ssse3",
1262         "sse4_1",
1263         "sse4_2",
1264         "1gpg",
1265         "clfsh",
1266         "64",
1267         "aes",
1268         "pclmulqdq",
1269         "xsave",
1270         "avx",
1271         "vmx",
1272         "svm",
1273         "topoext",
1274         "f16c",
1275         "rdrand",
1276         "x2apic",
1277         "avx2",
1278         "bmi1",
1279         "bmi2",
1280         "fma",
1281         "smep",
1282         "smap",
1283         "adx",
1284         "rdseed",
1285         "mpx",
1286         "avx512f",
1287         "avx512dq",
1288         "avx512pf",
1289         "avx512er",
1290         "avx512cd",
1291         "avx512bw",
1292         "avx512vl",
1293         "avx512fma",
1294         "avx512vbmi",
1295         "avx512_vpopcntdq",
1296         "avx512_4vnniw",
1297         "avx512_4fmaps",
1298         "xsaveopt",
1299         "xsavec",
1300         "xsaves",
1301         "sha",
1302         "umip",
1303         "pku",
1304         "ospke",
1305         "pcid",
1306         "invpcid",
1307         "ibrs",
1308         "ibpb",
1309         "stibp",
1310         "ssbd",
1311         "ssbd_virt",
1312         "rdcl_no",
1313         "ibrs_all",
1314         "rsba",
1315         "ssb_no",
1316         "stibp_all",
1317         "flush_cmd",
1318         "l1d_vmentry_no",
1319         "fsgsbase",
1320         "clflushopt",
1321         "clwb",
1322         "monitorx",
1323         "clzero",
1324         "xop",
1325         "fma4",
1326         "tbm",
1327         "avx512_vnni",
1328         "amd_pcec",
1329         "mb_clear",
1330         "mds_no",
1331         "core_thermal",
1332         "pkg_thermal"
1333 };
1334 
1335 boolean_t
1336 is_x86_feature(void *featureset, uint_t feature)
1337 {
1338         ASSERT(feature < NUM_X86_FEATURES);
1339         return (BT_TEST((ulong_t *)featureset, feature));
1340 }
1341 
1342 void
1343 add_x86_feature(void *featureset, uint_t feature)
1344 {
1345         ASSERT(feature < NUM_X86_FEATURES);
1346         BT_SET((ulong_t *)featureset, feature);
1347 }
1348 
1349 void
1350 remove_x86_feature(void *featureset, uint_t feature)
1351 {
1352         ASSERT(feature < NUM_X86_FEATURES);
1353         BT_CLEAR((ulong_t *)featureset, feature);
1354 }
1355 
1356 boolean_t
1357 compare_x86_featureset(void *setA, void *setB)
1358 {
1359         /*
1360          * We assume that the unused bits of the bitmap are always zero.
1361          */
1362         if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1363                 return (B_TRUE);
1364         } else {
1365                 return (B_FALSE);
1366         }
1367 }
1368 
1369 void
1370 print_x86_featureset(void *featureset)
1371 {
1372         uint_t i;
1373 
1374         for (i = 0; i < NUM_X86_FEATURES; i++) {
1375                 if (is_x86_feature(featureset, i)) {
1376                         cmn_err(CE_CONT, "?x86_feature: %s\n",
1377                             x86_feature_names[i]);
1378                 }
1379         }
1380 }
1381 
1382 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1383 static size_t xsave_state_size = 0;
1384 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1385 boolean_t xsave_force_disable = B_FALSE;
1386 extern int disable_smap;
1387 
1388 /*
1389  * This is set to platform type we are running on.
1390  */
1391 static int platform_type = -1;
1392 
1393 #if !defined(__xpv)
1394 /*
1395  * Variable to patch if hypervisor platform detection needs to be
1396  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1397  */
1398 int enable_platform_detection = 1;
1399 #endif
1400 
1401 /*
1402  * monitor/mwait info.
1403  *
1404  * size_actual and buf_actual are the real address and size allocated to get
1405  * proper mwait_buf alignement.  buf_actual and size_actual should be passed
1406  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1407  * processor cache-line alignment, but this is not guarantied in the furture.
1408  */
1409 struct mwait_info {
1410         size_t          mon_min;        /* min size to avoid missed wakeups */
1411         size_t          mon_max;        /* size to avoid false wakeups */
1412         size_t          size_actual;    /* size actually allocated */
1413         void            *buf_actual;    /* memory actually allocated */
1414         uint32_t        support;        /* processor support of monitor/mwait */
1415 };
1416 
1417 /*
1418  * xsave/xrestor info.
1419  *
1420  * This structure contains HW feature bits and the size of the xsave save area.
1421  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1422  * (xsave_state) to describe the xsave layout. However, at runtime the
1423  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1424  * xsave_state structure simply represents the legacy layout of the beginning
1425  * of the xsave area.
1426  */
1427 struct xsave_info {
1428         uint32_t        xsav_hw_features_low;   /* Supported HW features */
1429         uint32_t        xsav_hw_features_high;  /* Supported HW features */
1430         size_t          xsav_max_size;  /* max size save area for HW features */
1431         size_t          ymm_size;       /* AVX: size of ymm save area */
1432         size_t          ymm_offset;     /* AVX: offset for ymm save area */
1433         size_t          bndregs_size;   /* MPX: size of bndregs save area */
1434         size_t          bndregs_offset; /* MPX: offset for bndregs save area */
1435         size_t          bndcsr_size;    /* MPX: size of bndcsr save area */
1436         size_t          bndcsr_offset;  /* MPX: offset for bndcsr save area */
1437         size_t          opmask_size;    /* AVX512: size of opmask save */
1438         size_t          opmask_offset;  /* AVX512: offset for opmask save */
1439         size_t          zmmlo_size;     /* AVX512: size of zmm 256 save */
1440         size_t          zmmlo_offset;   /* AVX512: offset for zmm 256 save */
1441         size_t          zmmhi_size;     /* AVX512: size of zmm hi reg save */
1442         size_t          zmmhi_offset;   /* AVX512: offset for zmm hi reg save */
1443 };
1444 
1445 
1446 /*
1447  * These constants determine how many of the elements of the
1448  * cpuid we cache in the cpuid_info data structure; the
1449  * remaining elements are accessible via the cpuid instruction.
1450  */
1451 
1452 #define NMAX_CPI_STD    8               /* eax = 0 .. 7 */
1453 #define NMAX_CPI_EXTD   0x1f            /* eax = 0x80000000 .. 0x8000001e */
1454 
1455 /*
1456  * See the big theory statement for a more detailed explanation of what some of
1457  * these members mean.
1458  */
1459 struct cpuid_info {
1460         uint_t cpi_pass;                /* last pass completed */
1461         /*
1462          * standard function information
1463          */
1464         uint_t cpi_maxeax;              /* fn 0: %eax */
1465         char cpi_vendorstr[13];         /* fn 0: %ebx:%ecx:%edx */
1466         uint_t cpi_vendor;              /* enum of cpi_vendorstr */
1467 
1468         uint_t cpi_family;              /* fn 1: extended family */
1469         uint_t cpi_model;               /* fn 1: extended model */
1470         uint_t cpi_step;                /* fn 1: stepping */
1471         chipid_t cpi_chipid;            /* fn 1: %ebx:  Intel: chip # */
1472                                         /*              AMD: package/socket # */
1473         uint_t cpi_brandid;             /* fn 1: %ebx: brand ID */
1474         int cpi_clogid;                 /* fn 1: %ebx: thread # */
1475         uint_t cpi_ncpu_per_chip;       /* fn 1: %ebx: logical cpu count */
1476         uint8_t cpi_cacheinfo[16];      /* fn 2: intel-style cache desc */
1477         uint_t cpi_ncache;              /* fn 2: number of elements */
1478         uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1479         id_t cpi_last_lvl_cacheid;      /* fn 4: %eax: derived cache id */
1480         uint_t cpi_cache_leaf_size;     /* Number of cache elements */
1481                                         /* Intel fn: 4, AMD fn: 8000001d */
1482         struct cpuid_regs **cpi_cache_leaves;   /* Acual leaves from above */
1483         struct cpuid_regs cpi_std[NMAX_CPI_STD];        /* 0 .. 7 */
1484         /*
1485          * extended function information
1486          */
1487         uint_t cpi_xmaxeax;             /* fn 0x80000000: %eax */
1488         char cpi_brandstr[49];          /* fn 0x8000000[234] */
1489         uint8_t cpi_pabits;             /* fn 0x80000006: %eax */
1490         uint8_t cpi_vabits;             /* fn 0x80000006: %eax */
1491         uint8_t cpi_fp_amd_save;        /* AMD: FP error pointer save rqd. */
1492         struct  cpuid_regs cpi_extd[NMAX_CPI_EXTD];     /* 0x800000XX */
1493 
1494         id_t cpi_coreid;                /* same coreid => strands share core */
1495         int cpi_pkgcoreid;              /* core number within single package */
1496         uint_t cpi_ncore_per_chip;      /* AMD: fn 0x80000008: %ecx[7-0] */
1497                                         /* Intel: fn 4: %eax[31-26] */
1498 
1499         /*
1500          * These values represent the number of bits that are required to store
1501          * information about the number of cores and threads.
1502          */
1503         uint_t cpi_ncore_bits;
1504         uint_t cpi_nthread_bits;
1505         /*
1506          * supported feature information
1507          */
1508         uint32_t cpi_support[6];
1509 #define STD_EDX_FEATURES        0
1510 #define AMD_EDX_FEATURES        1
1511 #define TM_EDX_FEATURES         2
1512 #define STD_ECX_FEATURES        3
1513 #define AMD_ECX_FEATURES        4
1514 #define STD_EBX_FEATURES        5
1515         /*
1516          * Synthesized information, where known.
1517          */
1518         uint32_t cpi_chiprev;           /* See X86_CHIPREV_* in x86_archext.h */
1519         const char *cpi_chiprevstr;     /* May be NULL if chiprev unknown */
1520         uint32_t cpi_socket;            /* Chip package/socket type */
1521 
1522         struct mwait_info cpi_mwait;    /* fn 5: monitor/mwait info */
1523         uint32_t cpi_apicid;
1524         uint_t cpi_procnodeid;          /* AMD: nodeID on HT, Intel: chipid */
1525         uint_t cpi_procnodes_per_pkg;   /* AMD: # of nodes in the package */
1526                                         /* Intel: 1 */
1527         uint_t cpi_compunitid;          /* AMD: ComputeUnit ID, Intel: coreid */
1528         uint_t cpi_cores_per_compunit;  /* AMD: # of cores in the ComputeUnit */
1529 
1530         struct xsave_info cpi_xsave;    /* fn D: xsave/xrestor info */
1531 };
1532 
1533 
1534 static struct cpuid_info cpuid_info0;
1535 
1536 /*
1537  * These bit fields are defined by the Intel Application Note AP-485
1538  * "Intel Processor Identification and the CPUID Instruction"
1539  */
1540 #define CPI_FAMILY_XTD(cpi)     BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1541 #define CPI_MODEL_XTD(cpi)      BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1542 #define CPI_TYPE(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1543 #define CPI_FAMILY(cpi)         BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1544 #define CPI_STEP(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1545 #define CPI_MODEL(cpi)          BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1546 
1547 #define CPI_FEATURES_EDX(cpi)           ((cpi)->cpi_std[1].cp_edx)
1548 #define CPI_FEATURES_ECX(cpi)           ((cpi)->cpi_std[1].cp_ecx)
1549 #define CPI_FEATURES_XTD_EDX(cpi)       ((cpi)->cpi_extd[1].cp_edx)
1550 #define CPI_FEATURES_XTD_ECX(cpi)       ((cpi)->cpi_extd[1].cp_ecx)
1551 #define CPI_FEATURES_7_0_EBX(cpi)       ((cpi)->cpi_std[7].cp_ebx)
1552 #define CPI_FEATURES_7_0_ECX(cpi)       ((cpi)->cpi_std[7].cp_ecx)
1553 #define CPI_FEATURES_7_0_EDX(cpi)       ((cpi)->cpi_std[7].cp_edx)
1554 
1555 #define CPI_BRANDID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1556 #define CPI_CHUNKS(cpi)         BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1557 #define CPI_CPU_COUNT(cpi)      BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1558 #define CPI_APIC_ID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1559 
1560 #define CPI_MAXEAX_MAX          0x100           /* sanity control */
1561 #define CPI_XMAXEAX_MAX         0x80000100
1562 #define CPI_FN4_ECX_MAX         0x20            /* sanity: max fn 4 levels */
1563 #define CPI_FNB_ECX_MAX         0x20            /* sanity: max fn B levels */
1564 
1565 /*
1566  * Function 4 (Deterministic Cache Parameters) macros
1567  * Defined by Intel Application Note AP-485
1568  */
1569 #define CPI_NUM_CORES(regs)             BITX((regs)->cp_eax, 31, 26)
1570 #define CPI_NTHR_SHR_CACHE(regs)        BITX((regs)->cp_eax, 25, 14)
1571 #define CPI_FULL_ASSOC_CACHE(regs)      BITX((regs)->cp_eax, 9, 9)
1572 #define CPI_SELF_INIT_CACHE(regs)       BITX((regs)->cp_eax, 8, 8)
1573 #define CPI_CACHE_LVL(regs)             BITX((regs)->cp_eax, 7, 5)
1574 #define CPI_CACHE_TYPE(regs)            BITX((regs)->cp_eax, 4, 0)
1575 #define CPI_CPU_LEVEL_TYPE(regs)        BITX((regs)->cp_ecx, 15, 8)
1576 
1577 #define CPI_CACHE_WAYS(regs)            BITX((regs)->cp_ebx, 31, 22)
1578 #define CPI_CACHE_PARTS(regs)           BITX((regs)->cp_ebx, 21, 12)
1579 #define CPI_CACHE_COH_LN_SZ(regs)       BITX((regs)->cp_ebx, 11, 0)
1580 
1581 #define CPI_CACHE_SETS(regs)            BITX((regs)->cp_ecx, 31, 0)
1582 
1583 #define CPI_PREFCH_STRIDE(regs)         BITX((regs)->cp_edx, 9, 0)
1584 
1585 
1586 /*
1587  * A couple of shorthand macros to identify "later" P6-family chips
1588  * like the Pentium M and Core.  First, the "older" P6-based stuff
1589  * (loosely defined as "pre-Pentium-4"):
1590  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1591  */
1592 #define IS_LEGACY_P6(cpi) (                     \
1593         cpi->cpi_family == 6 &&                      \
1594                 (cpi->cpi_model == 1 ||              \
1595                 cpi->cpi_model == 3 ||               \
1596                 cpi->cpi_model == 5 ||               \
1597                 cpi->cpi_model == 6 ||               \
1598                 cpi->cpi_model == 7 ||               \
1599                 cpi->cpi_model == 8 ||               \
1600                 cpi->cpi_model == 0xA ||     \
1601                 cpi->cpi_model == 0xB)               \
1602 )
1603 
1604 /* A "new F6" is everything with family 6 that's not the above */
1605 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1606 
1607 /* Extended family/model support */
1608 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1609         cpi->cpi_family >= 0xf)
1610 
1611 /*
1612  * Info for monitor/mwait idle loop.
1613  *
1614  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1615  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1616  * 2006.
1617  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1618  * Documentation Updates" #33633, Rev 2.05, December 2006.
1619  */
1620 #define MWAIT_SUPPORT           (0x00000001)    /* mwait supported */
1621 #define MWAIT_EXTENSIONS        (0x00000002)    /* extenstion supported */
1622 #define MWAIT_ECX_INT_ENABLE    (0x00000004)    /* ecx 1 extension supported */
1623 #define MWAIT_SUPPORTED(cpi)    ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1624 #define MWAIT_INT_ENABLE(cpi)   ((cpi)->cpi_std[5].cp_ecx & 0x2)
1625 #define MWAIT_EXTENSION(cpi)    ((cpi)->cpi_std[5].cp_ecx & 0x1)
1626 #define MWAIT_SIZE_MIN(cpi)     BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1627 #define MWAIT_SIZE_MAX(cpi)     BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1628 /*
1629  * Number of sub-cstates for a given c-state.
1630  */
1631 #define MWAIT_NUM_SUBC_STATES(cpi, c_state)                     \
1632         BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1633 
1634 /*
1635  * XSAVE leaf 0xD enumeration
1636  */
1637 #define CPUID_LEAFD_2_YMM_OFFSET        576
1638 #define CPUID_LEAFD_2_YMM_SIZE          256
1639 
1640 /*
1641  * Common extended leaf names to cut down on typos.
1642  */
1643 #define CPUID_LEAF_EXT_0                0x80000000
1644 #define CPUID_LEAF_EXT_8                0x80000008
1645 #define CPUID_LEAF_EXT_1d               0x8000001d
1646 #define CPUID_LEAF_EXT_1e               0x8000001e
1647 
1648 /*
1649  * Functions we consune from cpuid_subr.c;  don't publish these in a header
1650  * file to try and keep people using the expected cpuid_* interfaces.
1651  */
1652 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1653 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1654 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1655 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1656 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1657 
1658 /*
1659  * Apply up various platform-dependent restrictions where the
1660  * underlying platform restrictions mean the CPU can be marked
1661  * as less capable than its cpuid instruction would imply.
1662  */
1663 #if defined(__xpv)
1664 static void
1665 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1666 {
1667         switch (eax) {
1668         case 1: {
1669                 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1670                     0 : CPUID_INTC_EDX_MCA;
1671                 cp->cp_edx &=
1672                     ~(mcamask |
1673                     CPUID_INTC_EDX_PSE |
1674                     CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1675                     CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1676                     CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1677                     CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1678                     CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1679                 break;
1680         }
1681 
1682         case 0x80000001:
1683                 cp->cp_edx &=
1684                     ~(CPUID_AMD_EDX_PSE |
1685                     CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1686                     CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1687                     CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1688                     CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1689                     CPUID_AMD_EDX_TSCP);
1690                 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1691                 break;
1692         default:
1693                 break;
1694         }
1695 
1696         switch (vendor) {
1697         case X86_VENDOR_Intel:
1698                 switch (eax) {
1699                 case 4:
1700                         /*
1701                          * Zero out the (ncores-per-chip - 1) field
1702                          */
1703                         cp->cp_eax &= 0x03fffffff;
1704                         break;
1705                 default:
1706                         break;
1707                 }
1708                 break;
1709         case X86_VENDOR_AMD:
1710                 switch (eax) {
1711 
1712                 case 0x80000001:
1713                         cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1714                         break;
1715 
1716                 case CPUID_LEAF_EXT_8:
1717                         /*
1718                          * Zero out the (ncores-per-chip - 1) field
1719                          */
1720                         cp->cp_ecx &= 0xffffff00;
1721                         break;
1722                 default:
1723                         break;
1724                 }
1725                 break;
1726         default:
1727                 break;
1728         }
1729 }
1730 #else
1731 #define platform_cpuid_mangle(vendor, eax, cp)  /* nothing */
1732 #endif
1733 
1734 /*
1735  *  Some undocumented ways of patching the results of the cpuid
1736  *  instruction to permit running Solaris 10 on future cpus that
1737  *  we don't currently support.  Could be set to non-zero values
1738  *  via settings in eeprom.
1739  */
1740 
1741 uint32_t cpuid_feature_ecx_include;
1742 uint32_t cpuid_feature_ecx_exclude;
1743 uint32_t cpuid_feature_edx_include;
1744 uint32_t cpuid_feature_edx_exclude;
1745 
1746 /*
1747  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1748  */
1749 void
1750 cpuid_alloc_space(cpu_t *cpu)
1751 {
1752         /*
1753          * By convention, cpu0 is the boot cpu, which is set up
1754          * before memory allocation is available.  All other cpus get
1755          * their cpuid_info struct allocated here.
1756          */
1757         ASSERT(cpu->cpu_id != 0);
1758         ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1759         cpu->cpu_m.mcpu_cpi =
1760             kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1761 }
1762 
1763 void
1764 cpuid_free_space(cpu_t *cpu)
1765 {
1766         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1767         int i;
1768 
1769         ASSERT(cpi != NULL);
1770         ASSERT(cpi != &cpuid_info0);
1771 
1772         /*
1773          * Free up any cache leaf related dynamic storage. The first entry was
1774          * cached from the standard cpuid storage, so we should not free it.
1775          */
1776         for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1777                 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1778         if (cpi->cpi_cache_leaf_size > 0)
1779                 kmem_free(cpi->cpi_cache_leaves,
1780                     cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1781 
1782         kmem_free(cpi, sizeof (*cpi));
1783         cpu->cpu_m.mcpu_cpi = NULL;
1784 }
1785 
1786 #if !defined(__xpv)
1787 /*
1788  * Determine the type of the underlying platform. This is used to customize
1789  * initialization of various subsystems (e.g. TSC). determine_platform() must
1790  * only ever be called once to prevent two processors from seeing different
1791  * values of platform_type. Must be called before cpuid_pass1(), the earliest
1792  * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1793  */
1794 void
1795 determine_platform(void)
1796 {
1797         struct cpuid_regs cp;
1798         uint32_t base;
1799         uint32_t regs[4];
1800         char *hvstr = (char *)regs;
1801 
1802         ASSERT(platform_type == -1);
1803 
1804         platform_type = HW_NATIVE;
1805 
1806         if (!enable_platform_detection)
1807                 return;
1808 
1809         /*
1810          * If Hypervisor CPUID bit is set, try to determine hypervisor
1811          * vendor signature, and set platform type accordingly.
1812          *
1813          * References:
1814          * http://lkml.org/lkml/2008/10/1/246
1815          * http://kb.vmware.com/kb/1009458
1816          */
1817         cp.cp_eax = 0x1;
1818         (void) __cpuid_insn(&cp);
1819         if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1820                 cp.cp_eax = 0x40000000;
1821                 (void) __cpuid_insn(&cp);
1822                 regs[0] = cp.cp_ebx;
1823                 regs[1] = cp.cp_ecx;
1824                 regs[2] = cp.cp_edx;
1825                 regs[3] = 0;
1826                 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1827                         platform_type = HW_XEN_HVM;
1828                         return;
1829                 }
1830                 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1831                         platform_type = HW_VMWARE;
1832                         return;
1833                 }
1834                 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1835                         platform_type = HW_KVM;
1836                         return;
1837                 }
1838                 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1839                         platform_type = HW_BHYVE;
1840                         return;
1841                 }
1842                 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1843                         platform_type = HW_MICROSOFT;
1844         } else {
1845                 /*
1846                  * Check older VMware hardware versions. VMware hypervisor is
1847                  * detected by performing an IN operation to VMware hypervisor
1848                  * port and checking that value returned in %ebx is VMware
1849                  * hypervisor magic value.
1850                  *
1851                  * References: http://kb.vmware.com/kb/1009458
1852                  */
1853                 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1854                 if (regs[1] == VMWARE_HVMAGIC) {
1855                         platform_type = HW_VMWARE;
1856                         return;
1857                 }
1858         }
1859 
1860         /*
1861          * Check Xen hypervisor. In a fully virtualized domain,
1862          * Xen's pseudo-cpuid function returns a string representing the
1863          * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1864          * supported cpuid function. We need at least a (base + 2) leaf value
1865          * to do what we want to do. Try different base values, since the
1866          * hypervisor might use a different one depending on whether Hyper-V
1867          * emulation is switched on by default or not.
1868          */
1869         for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1870                 cp.cp_eax = base;
1871                 (void) __cpuid_insn(&cp);
1872                 regs[0] = cp.cp_ebx;
1873                 regs[1] = cp.cp_ecx;
1874                 regs[2] = cp.cp_edx;
1875                 regs[3] = 0;
1876                 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1877                     cp.cp_eax >= (base + 2)) {
1878                         platform_type &= ~HW_NATIVE;
1879                         platform_type |= HW_XEN_HVM;
1880                         return;
1881                 }
1882         }
1883 }
1884 
1885 int
1886 get_hwenv(void)
1887 {
1888         ASSERT(platform_type != -1);
1889         return (platform_type);
1890 }
1891 
1892 int
1893 is_controldom(void)
1894 {
1895         return (0);
1896 }
1897 
1898 #else
1899 
1900 int
1901 get_hwenv(void)
1902 {
1903         return (HW_XEN_PV);
1904 }
1905 
1906 int
1907 is_controldom(void)
1908 {
1909         return (DOMAIN_IS_INITDOMAIN(xen_info));
1910 }
1911 
1912 #endif  /* __xpv */
1913 
1914 /*
1915  * Make sure that we have gathered all of the CPUID leaves that we might need to
1916  * determine topology. We assume that the standard leaf 1 has already been done
1917  * and that xmaxeax has already been calculated.
1918  */
1919 static void
1920 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1921 {
1922         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1923 
1924         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1925                 struct cpuid_regs *cp;
1926 
1927                 cp = &cpi->cpi_extd[8];
1928                 cp->cp_eax = CPUID_LEAF_EXT_8;
1929                 (void) __cpuid_insn(cp);
1930                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1931         }
1932 
1933         if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1934             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1935                 struct cpuid_regs *cp;
1936 
1937                 cp = &cpi->cpi_extd[0x1e];
1938                 cp->cp_eax = CPUID_LEAF_EXT_1e;
1939                 (void) __cpuid_insn(cp);
1940         }
1941 }
1942 
1943 /*
1944  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1945  * it to everything else. If not, and we're on an AMD system where 8000001e is
1946  * valid, then we use that. Othewrise, we fall back to the default value for the
1947  * APIC ID in leaf 1.
1948  */
1949 static uint32_t
1950 cpuid_gather_apicid(struct cpuid_info *cpi)
1951 {
1952         /*
1953          * Leaf B changes based on the arguments to it. Beacuse we don't cache
1954          * it, we need to gather it again.
1955          */
1956         if (cpi->cpi_maxeax >= 0xB) {
1957                 struct cpuid_regs regs;
1958                 struct cpuid_regs *cp;
1959 
1960                 cp = &regs;
1961                 cp->cp_eax = 0xB;
1962                 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1963                 (void) __cpuid_insn(cp);
1964 
1965                 if (cp->cp_ebx != 0) {
1966                         return (cp->cp_edx);
1967                 }
1968         }
1969 
1970         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
1971             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1972             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1973                 return (cpi->cpi_extd[0x1e].cp_eax);
1974         }
1975 
1976         return (CPI_APIC_ID(cpi));
1977 }
1978 
1979 /*
1980  * For AMD processors, attempt to calculate the number of chips and cores that
1981  * exist. The way that we do this varies based on the generation, because the
1982  * generations themselves have changed dramatically.
1983  *
1984  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
1985  * However, with the advent of family 17h (Zen) it actually tells us the number
1986  * of threads, so we need to look at leaf 0x8000001e if available to determine
1987  * its value. Otherwise, for all prior families, the number of enabled cores is
1988  * the same as threads.
1989  *
1990  * If we do not have leaf 0x80000008, then we assume that this processor does
1991  * not have anything. AMD's older CPUID specification says there's no reason to
1992  * fall back to leaf 1.
1993  *
1994  * In some virtualization cases we will not have leaf 8000001e or it will be
1995  * zero. When that happens we assume the number of threads is one.
1996  */
1997 static void
1998 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
1999 {
2000         uint_t nthreads, nthread_per_core;
2001 
2002         nthreads = nthread_per_core = 1;
2003 
2004         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2005                 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2006         } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2007                 nthreads = CPI_CPU_COUNT(cpi);
2008         }
2009 
2010         /*
2011          * For us to have threads, and know about it, we have to be at least at
2012          * family 17h and have the cpuid bit that says we have extended
2013          * topology.
2014          */
2015         if (cpi->cpi_family >= 0x17 &&
2016             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2017             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2018                 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2019         }
2020 
2021         *ncpus = nthreads;
2022         *ncores = nthreads / nthread_per_core;
2023 }
2024 
2025 /*
2026  * Seed the initial values for the cores and threads for an Intel based
2027  * processor. These values will be overwritten if we detect that the processor
2028  * supports CPUID leaf 0xb.
2029  */
2030 static void
2031 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2032 {
2033         /*
2034          * Only seed the number of physical cores from the first level leaf 4
2035          * information. The number of threads there indicate how many share the
2036          * L1 cache, which may or may not have anything to do with the number of
2037          * logical CPUs per core.
2038          */
2039         if (cpi->cpi_maxeax >= 4) {
2040                 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2041         } else {
2042                 *ncores = 1;
2043         }
2044 
2045         if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2046                 *ncpus = CPI_CPU_COUNT(cpi);
2047         } else {
2048                 *ncpus = *ncores;
2049         }
2050 }
2051 
2052 static boolean_t
2053 cpuid_leafB_getids(cpu_t *cpu)
2054 {
2055         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2056         struct cpuid_regs regs;
2057         struct cpuid_regs *cp;
2058 
2059         if (cpi->cpi_maxeax < 0xB)
2060                 return (B_FALSE);
2061 
2062         cp = &regs;
2063         cp->cp_eax = 0xB;
2064         cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2065 
2066         (void) __cpuid_insn(cp);
2067 
2068         /*
2069          * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2070          * indicates that the extended topology enumeration leaf is
2071          * available.
2072          */
2073         if (cp->cp_ebx != 0) {
2074                 uint32_t x2apic_id = 0;
2075                 uint_t coreid_shift = 0;
2076                 uint_t ncpu_per_core = 1;
2077                 uint_t chipid_shift = 0;
2078                 uint_t ncpu_per_chip = 1;
2079                 uint_t i;
2080                 uint_t level;
2081 
2082                 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2083                         cp->cp_eax = 0xB;
2084                         cp->cp_ecx = i;
2085 
2086                         (void) __cpuid_insn(cp);
2087                         level = CPI_CPU_LEVEL_TYPE(cp);
2088 
2089                         if (level == 1) {
2090                                 x2apic_id = cp->cp_edx;
2091                                 coreid_shift = BITX(cp->cp_eax, 4, 0);
2092                                 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2093                         } else if (level == 2) {
2094                                 x2apic_id = cp->cp_edx;
2095                                 chipid_shift = BITX(cp->cp_eax, 4, 0);
2096                                 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2097                         }
2098                 }
2099 
2100                 /*
2101                  * cpi_apicid is taken care of in cpuid_gather_apicid.
2102                  */
2103                 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2104                 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2105                     ncpu_per_core;
2106                 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2107                 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2108                 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2109                 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2110                 cpi->cpi_procnodeid = cpi->cpi_chipid;
2111                 cpi->cpi_compunitid = cpi->cpi_coreid;
2112 
2113                 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2114                         cpi->cpi_nthread_bits = coreid_shift;
2115                         cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2116                 }
2117 
2118                 return (B_TRUE);
2119         } else {
2120                 return (B_FALSE);
2121         }
2122 }
2123 
2124 static void
2125 cpuid_intel_getids(cpu_t *cpu, void *feature)
2126 {
2127         uint_t i;
2128         uint_t chipid_shift = 0;
2129         uint_t coreid_shift = 0;
2130         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2131 
2132         /*
2133          * There are no compute units or processor nodes currently on Intel.
2134          * Always set these to one.
2135          */
2136         cpi->cpi_procnodes_per_pkg = 1;
2137         cpi->cpi_cores_per_compunit = 1;
2138 
2139         /*
2140          * If cpuid Leaf B is present, use that to try and get this information.
2141          * It will be the most accurate for Intel CPUs.
2142          */
2143         if (cpuid_leafB_getids(cpu))
2144                 return;
2145 
2146         /*
2147          * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2148          * and ncore_per_chip. These represent the largest power of two values
2149          * that we need to cover all of the IDs in the system. Therefore, we use
2150          * those values to seed the number of bits needed to cover information
2151          * in the case when leaf B is not available. These values will probably
2152          * be larger than required, but that's OK.
2153          */
2154         cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2155         cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2156 
2157         for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2158                 chipid_shift++;
2159 
2160         cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2161         cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2162 
2163         if (is_x86_feature(feature, X86FSET_CMP)) {
2164                 /*
2165                  * Multi-core (and possibly multi-threaded)
2166                  * processors.
2167                  */
2168                 uint_t ncpu_per_core;
2169                 if (cpi->cpi_ncore_per_chip == 1)
2170                         ncpu_per_core = cpi->cpi_ncpu_per_chip;
2171                 else if (cpi->cpi_ncore_per_chip > 1)
2172                         ncpu_per_core = cpi->cpi_ncpu_per_chip /
2173                             cpi->cpi_ncore_per_chip;
2174                 /*
2175                  * 8bit APIC IDs on dual core Pentiums
2176                  * look like this:
2177                  *
2178                  * +-----------------------+------+------+
2179                  * | Physical Package ID   |  MC  |  HT  |
2180                  * +-----------------------+------+------+
2181                  * <------- chipid -------->
2182                  * <------- coreid --------------->
2183                  *                         <--- clogid -->
2184                  *                         <------>
2185                  *                         pkgcoreid
2186                  *
2187                  * Where the number of bits necessary to
2188                  * represent MC and HT fields together equals
2189                  * to the minimum number of bits necessary to
2190                  * store the value of cpi->cpi_ncpu_per_chip.
2191                  * Of those bits, the MC part uses the number
2192                  * of bits necessary to store the value of
2193                  * cpi->cpi_ncore_per_chip.
2194                  */
2195                 for (i = 1; i < ncpu_per_core; i <<= 1)
2196                         coreid_shift++;
2197                 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2198                 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2199         } else if (is_x86_feature(feature, X86FSET_HTT)) {
2200                 /*
2201                  * Single-core multi-threaded processors.
2202                  */
2203                 cpi->cpi_coreid = cpi->cpi_chipid;
2204                 cpi->cpi_pkgcoreid = 0;
2205         } else {
2206                 /*
2207                  * Single-core single-thread processors.
2208                  */
2209                 cpi->cpi_coreid = cpu->cpu_id;
2210                 cpi->cpi_pkgcoreid = 0;
2211         }
2212         cpi->cpi_procnodeid = cpi->cpi_chipid;
2213         cpi->cpi_compunitid = cpi->cpi_coreid;
2214 }
2215 
2216 /*
2217  * Historically, AMD has had CMP chips with only a single thread per core.
2218  * However, starting in family 17h (Zen), this has changed and they now have
2219  * multiple threads. Our internal core id needs to be a unique value.
2220  *
2221  * To determine the core id of an AMD system, if we're from a family before 17h,
2222  * then we just use the cpu id, as that gives us a good value that will be
2223  * unique for each core. If instead, we're on family 17h or later, then we need
2224  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2225  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2226  * We can't use the normal core id in that leaf as it's only unique within the
2227  * socket, which is perfect for cpi_pkgcoreid, but not us.
2228  */
2229 static id_t
2230 cpuid_amd_get_coreid(cpu_t *cpu)
2231 {
2232         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2233 
2234         if (cpi->cpi_family >= 0x17 &&
2235             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2236             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2237                 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2238                 if (nthreads > 1) {
2239                         VERIFY3U(nthreads, ==, 2);
2240                         return (cpi->cpi_apicid >> 1);
2241                 }
2242         }
2243 
2244         return (cpu->cpu_id);
2245 }
2246 
2247 /*
2248  * IDs on AMD is a more challenging task. This is notable because of the
2249  * following two facts:
2250  *
2251  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2252  *     also no way to get an actual unique core id from the system. As such, we
2253  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2254  *     however, guarantee that sibling cores of a chip will have sequential
2255  *     coreids starting at a multiple of the number of cores per chip - that is
2256  *     usually the case, but if the ACPI MADT table is presented in a different
2257  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2258  *
2259  *  2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups
2260  *     called compute units. These compute units share the L1I cache, L2 cache,
2261  *     and the FPU. To deal with this, a new topology leaf was added in
2262  *     0x8000001e. However, parts of this leaf have different meanings
2263  *     once we get to family 0x17.
2264  */
2265 
2266 static void
2267 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2268 {
2269         int i, first_half, coreidsz;
2270         uint32_t nb_caps_reg;
2271         uint_t node2_1;
2272         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2273         struct cpuid_regs *cp;
2274 
2275         /*
2276          * Calculate the core id (this comes from hardware in family 0x17 if it
2277          * hasn't been stripped by virtualization). We always set the compute
2278          * unit id to the same value. Also, initialize the default number of
2279          * cores per compute unit and nodes per package. This will be
2280          * overwritten when we know information about a particular family.
2281          */
2282         cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2283         cpi->cpi_compunitid = cpi->cpi_coreid;
2284         cpi->cpi_cores_per_compunit = 1;
2285         cpi->cpi_procnodes_per_pkg = 1;
2286 
2287         /*
2288          * To construct the logical ID, we need to determine how many APIC IDs
2289          * are dedicated to the cores and threads. This is provided for us in
2290          * 0x80000008. However, if it's not present (say due to virtualization),
2291          * then we assume it's one. This should be present on all 64-bit AMD
2292          * processors.  It was added in family 0xf (Hammer).
2293          */
2294         if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2295                 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2296 
2297                 /*
2298                  * In AMD parlance chip is really a node while illumos
2299                  * uses chip as equivalent to socket/package.
2300                  */
2301                 if (coreidsz == 0) {
2302                         /* Use legacy method */
2303                         for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2304                                 coreidsz++;
2305                         if (coreidsz == 0)
2306                                 coreidsz = 1;
2307                 }
2308         } else {
2309                 /* Assume single-core part */
2310                 coreidsz = 1;
2311         }
2312         cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2313 
2314         /*
2315          * The package core ID varies depending on the family. While it may be
2316          * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2317          * this value is the core id in the given node. For non-virtualized
2318          * family 17h, we need to take the logical core id and shift off the
2319          * threads like we do when getting the core id.  Otherwise, we can use
2320          * the clogid as is. When family 17h is virtualized, the clogid should
2321          * be sufficient as if we don't have valid data in the leaf, then we
2322          * won't think we have SMT, in which case the cpi_clogid should be
2323          * sufficient.
2324          */
2325         if (cpi->cpi_family >= 0x17 &&
2326             is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2327             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2328             cpi->cpi_extd[0x1e].cp_ebx != 0) {
2329                 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2330                 if (nthreads > 1) {
2331                         VERIFY3U(nthreads, ==, 2);
2332                         cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2333                 } else {
2334                         cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2335                 }
2336         } else {
2337                 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2338         }
2339 
2340         /*
2341          * Obtain the node ID and compute unit IDs. If we're on family 0x15
2342          * (bulldozer) or newer, then we can derive all of this from leaf
2343          * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2344          */
2345         if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2346             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2347                 cp = &cpi->cpi_extd[0x1e];
2348 
2349                 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2350                 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2351 
2352                 /*
2353                  * For Bulldozer-era CPUs, recalculate the compute unit
2354                  * information.
2355                  */
2356                 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2357                         cpi->cpi_cores_per_compunit =
2358                             BITX(cp->cp_ebx, 15, 8) + 1;
2359                         cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2360                             (cpi->cpi_ncore_per_chip /
2361                             cpi->cpi_cores_per_compunit) *
2362                             (cpi->cpi_procnodeid /
2363                             cpi->cpi_procnodes_per_pkg);
2364                 }
2365         } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2366                 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2367         } else if (cpi->cpi_family == 0x10) {
2368                 /*
2369                  * See if we are a multi-node processor.
2370                  * All processors in the system have the same number of nodes
2371                  */
2372                 nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2373                 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2374                         /* Single-node */
2375                         cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2376                             coreidsz);
2377                 } else {
2378 
2379                         /*
2380                          * Multi-node revision D (2 nodes per package
2381                          * are supported)
2382                          */
2383                         cpi->cpi_procnodes_per_pkg = 2;
2384 
2385                         first_half = (cpi->cpi_pkgcoreid <=
2386                             (cpi->cpi_ncore_per_chip/2 - 1));
2387 
2388                         if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2389                                 /* We are BSP */
2390                                 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2391                         } else {
2392 
2393                                 /* We are AP */
2394                                 /* NodeId[2:1] bits to use for reading F3xe8 */
2395                                 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2396 
2397                                 nb_caps_reg =
2398                                     pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2399 
2400                                 /*
2401                                  * Check IntNodeNum bit (31:30, but bit 31 is
2402                                  * always 0 on dual-node processors)
2403                                  */
2404                                 if (BITX(nb_caps_reg, 30, 30) == 0)
2405                                         cpi->cpi_procnodeid = node2_1 +
2406                                             !first_half;
2407                                 else
2408                                         cpi->cpi_procnodeid = node2_1 +
2409                                             first_half;
2410                         }
2411                 }
2412         } else {
2413                 cpi->cpi_procnodeid = 0;
2414         }
2415 
2416         cpi->cpi_chipid =
2417             cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2418 
2419         cpi->cpi_ncore_bits = coreidsz;
2420         cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2421             cpi->cpi_ncore_per_chip);
2422 }
2423 
2424 static void
2425 spec_uarch_flush_noop(void)
2426 {
2427 }
2428 
2429 /*
2430  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2431  * MDS-related micro-architectural state that would normally happen by calling
2432  * x86_md_clear().
2433  */
2434 static void
2435 spec_uarch_flush_msr(void)
2436 {
2437         wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2438 }
2439 
2440 /*
2441  * This function points to a function that will flush certain
2442  * micro-architectural state on the processor. This flush is used to mitigate
2443  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2444  * function can point to one of three functions:
2445  *
2446  * - A noop which is done because we either are vulnerable, but do not have
2447  *   microcode available to help deal with a fix, or because we aren't
2448  *   vulnerable.
2449  *
2450  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2451  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2452  *   however, it only flushes the MDS related micro-architectural state on the
2453  *   current hyperthread, it does not do anything for the twin.
2454  *
2455  * - x86_md_clear which will flush the MDS related state. This is done when we
2456  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2457  *   (RDCL_NO is set).
2458  */
2459 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2460 
2461 static void
2462 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2463 {
2464         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2465 
2466         /*
2467          * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2468          * has been fixed in hardware, it doesn't cover everything related to
2469          * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2470          * need to mitigate this.
2471          */
2472         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2473             is_x86_feature(featureset, X86FSET_MDS_NO)) {
2474                 return;
2475         }
2476 
2477         if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2478                 const uint8_t nop = NOP_INSTR;
2479                 uint8_t *md = (uint8_t *)x86_md_clear;
2480 
2481                 *md = nop;
2482         }
2483 
2484         membar_producer();
2485 }
2486 
2487 static void
2488 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2489 {
2490         boolean_t need_l1d, need_mds;
2491         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2492 
2493         /*
2494          * If we're not on Intel or we've mitigated both RDCL and MDS in
2495          * hardware, then there's nothing left for us to do for enabling the
2496          * flush. We can also go ahead and say that SMT exclusion is
2497          * unnecessary.
2498          */
2499         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2500             (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2501             is_x86_feature(featureset, X86FSET_MDS_NO))) {
2502                 extern int smt_exclusion;
2503                 smt_exclusion = 0;
2504                 spec_uarch_flush = spec_uarch_flush_noop;
2505                 membar_producer();
2506                 return;
2507         }
2508 
2509         /*
2510          * The locations where we need to perform an L1D flush are required both
2511          * for mitigating L1TF and MDS. When verw support is present in
2512          * microcode, then the L1D flush will take care of doing that as well.
2513          * However, if we have a system where RDCL_NO is present, but we don't
2514          * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2515          * L1D flush.
2516          */
2517         if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2518             is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2519             !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2520                 need_l1d = B_TRUE;
2521         } else {
2522                 need_l1d = B_FALSE;
2523         }
2524 
2525         if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2526             is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2527                 need_mds = B_TRUE;
2528         } else {
2529                 need_mds = B_FALSE;
2530         }
2531 
2532         if (need_l1d) {
2533                 spec_uarch_flush = spec_uarch_flush_msr;
2534         } else if (need_mds) {
2535                 spec_uarch_flush = x86_md_clear;
2536         } else {
2537                 /*
2538                  * We have no hardware mitigations available to us.
2539                  */
2540                 spec_uarch_flush = spec_uarch_flush_noop;
2541         }
2542         membar_producer();
2543 }
2544 
2545 /*
2546  * We default to enabling RSB mitigations.
2547  */
2548 static void
2549 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2550 {
2551         const uint8_t ret = RET_INSTR;
2552         uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2553 
2554         switch (mit) {
2555         case X86_SPECTREV2_ENHANCED_IBRS:
2556         case X86_SPECTREV2_DISABLED:
2557                 *stuff = ret;
2558                 break;
2559         default:
2560                 break;
2561         }
2562 }
2563 
2564 static void
2565 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2566 {
2567         const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2568             "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2569             "_r14", "_r15" };
2570         const uint_t nthunks = ARRAY_SIZE(thunks);
2571         const char *type;
2572         uint_t i;
2573 
2574         if (mit == x86_spectrev2_mitigation)
2575                 return;
2576 
2577         switch (mit) {
2578         case X86_SPECTREV2_RETPOLINE:
2579                 type = "gen";
2580                 break;
2581         case X86_SPECTREV2_RETPOLINE_AMD:
2582                 type = "amd";
2583                 break;
2584         case X86_SPECTREV2_ENHANCED_IBRS:
2585         case X86_SPECTREV2_DISABLED:
2586                 type = "jmp";
2587                 break;
2588         default:
2589                 panic("asked to updated retpoline state with unknown state!");
2590         }
2591 
2592         for (i = 0; i < nthunks; i++) {
2593                 uintptr_t source, dest;
2594                 int ssize, dsize;
2595                 char sourcebuf[64], destbuf[64];
2596                 size_t len;
2597 
2598                 (void) snprintf(destbuf, sizeof (destbuf),
2599                     "__x86_indirect_thunk%s", thunks[i]);
2600                 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2601                     "__x86_indirect_thunk_%s%s", type, thunks[i]);
2602 
2603                 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2604                 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2605                 VERIFY3U(source, !=, 0);
2606                 VERIFY3U(dest, !=, 0);
2607                 VERIFY3S(dsize, >=, ssize);
2608                 bcopy((void *)source, (void *)dest, ssize);
2609         }
2610 }
2611 
2612 static void
2613 cpuid_enable_enhanced_ibrs(void)
2614 {
2615         uint64_t val;
2616 
2617         val = rdmsr(MSR_IA32_SPEC_CTRL);
2618         val |= IA32_SPEC_CTRL_IBRS;
2619         wrmsr(MSR_IA32_SPEC_CTRL, val);
2620 }
2621 
2622 #ifndef __xpv
2623 /*
2624  * Determine whether or not we can use the AMD optimized retpoline
2625  * functionality. We use this when we know we're on an AMD system and we can
2626  * successfully verify that lfence is dispatch serializing.
2627  */
2628 static boolean_t
2629 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2630 {
2631         uint64_t val;
2632         on_trap_data_t otd;
2633 
2634         if (cpi->cpi_vendor != X86_VENDOR_AMD)
2635                 return (B_FALSE);
2636 
2637         /*
2638          * We need to determine whether or not lfence is serializing. It always
2639          * is on families 0xf and 0x11. On others, it's controlled by
2640          * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2641          * crazy old family, don't try and do anything.
2642          */
2643         if (cpi->cpi_family < 0xf)
2644                 return (B_FALSE);
2645         if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2646                 return (B_TRUE);
2647 
2648         /*
2649          * While it may be tempting to use get_hwenv(), there are no promises
2650          * that a hypervisor will actually declare themselves to be so in a
2651          * friendly way. As such, try to read and set the MSR. If we can then
2652          * read back the value we set (it wasn't just set to zero), then we go
2653          * for it.
2654          */
2655         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2656                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2657                 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2658                 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2659                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2660         } else {
2661                 val = 0;
2662         }
2663         no_trap();
2664 
2665         if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2666                 return (B_TRUE);
2667         return (B_FALSE);
2668 }
2669 #endif  /* !__xpv */
2670 
2671 static void
2672 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2673 {
2674         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2675         x86_spectrev2_mitigation_t v2mit;
2676 
2677         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2678             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2679                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2680                         add_x86_feature(featureset, X86FSET_IBPB);
2681                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2682                         add_x86_feature(featureset, X86FSET_IBRS);
2683                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2684                         add_x86_feature(featureset, X86FSET_STIBP);
2685                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2686                         add_x86_feature(featureset, X86FSET_STIBP_ALL);
2687                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2688                         add_x86_feature(featureset, X86FSET_SSBD);
2689                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2690                         add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2691                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2692                         add_x86_feature(featureset, X86FSET_SSB_NO);
2693                 /*
2694                  * Don't enable enhanced IBRS unless we're told that we should
2695                  * prefer it and it has the same semantics as Intel. This is
2696                  * split into two bits rather than a single one.
2697                  */
2698                 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2699                     (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2700                         add_x86_feature(featureset, X86FSET_IBRS_ALL);
2701                 }
2702 
2703         } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2704             cpi->cpi_maxeax >= 7) {
2705                 struct cpuid_regs *ecp;
2706                 ecp = &cpi->cpi_std[7];
2707 
2708                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2709                         add_x86_feature(featureset, X86FSET_MD_CLEAR);
2710                 }
2711 
2712                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2713                         add_x86_feature(featureset, X86FSET_IBRS);
2714                         add_x86_feature(featureset, X86FSET_IBPB);
2715                 }
2716 
2717                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2718                         add_x86_feature(featureset, X86FSET_STIBP);
2719                 }
2720 
2721                 /*
2722                  * Don't read the arch caps MSR on xpv where we lack the
2723                  * on_trap().
2724                  */
2725 #ifndef __xpv
2726                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2727                         on_trap_data_t otd;
2728 
2729                         /*
2730                          * Be paranoid and assume we'll get a #GP.
2731                          */
2732                         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2733                                 uint64_t reg;
2734 
2735                                 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2736                                 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2737                                         add_x86_feature(featureset,
2738                                             X86FSET_RDCL_NO);
2739                                 }
2740                                 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2741                                         add_x86_feature(featureset,
2742                                             X86FSET_IBRS_ALL);
2743                                 }
2744                                 if (reg & IA32_ARCH_CAP_RSBA) {
2745                                         add_x86_feature(featureset,
2746                                             X86FSET_RSBA);
2747                                 }
2748                                 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2749                                         add_x86_feature(featureset,
2750                                             X86FSET_L1D_VM_NO);
2751                                 }
2752                                 if (reg & IA32_ARCH_CAP_SSB_NO) {
2753                                         add_x86_feature(featureset,
2754                                             X86FSET_SSB_NO);
2755                                 }
2756                                 if (reg & IA32_ARCH_CAP_MDS_NO) {
2757                                         add_x86_feature(featureset,
2758                                             X86FSET_MDS_NO);
2759                                 }
2760                         }
2761                         no_trap();
2762                 }
2763 #endif  /* !__xpv */
2764 
2765                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2766                         add_x86_feature(featureset, X86FSET_SSBD);
2767 
2768                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2769                         add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2770         }
2771 
2772         if (cpu->cpu_id != 0) {
2773                 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2774                         cpuid_enable_enhanced_ibrs();
2775                 }
2776                 return;
2777         }
2778 
2779         /*
2780          * Go through and initialize various security mechanisms that we should
2781          * only do on a single CPU. This includes Spectre V2, L1TF, and MDS.
2782          */
2783 
2784         /*
2785          * By default we've come in with retpolines enabled. Check whether we
2786          * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2787          * by default, but disabled if we are using enhanced IBRS.
2788          */
2789         if (x86_disable_spectrev2 != 0) {
2790                 v2mit = X86_SPECTREV2_DISABLED;
2791         } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2792                 cpuid_enable_enhanced_ibrs();
2793                 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2794 #ifndef __xpv
2795         } else if (cpuid_use_amd_retpoline(cpi)) {
2796                 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2797 #endif  /* !__xpv */
2798         } else {
2799                 v2mit = X86_SPECTREV2_RETPOLINE;
2800         }
2801 
2802         cpuid_patch_retpolines(v2mit);
2803         cpuid_patch_rsb(v2mit);
2804         x86_spectrev2_mitigation = v2mit;
2805         membar_producer();
2806 
2807         /*
2808          * We need to determine what changes are required for mitigating L1TF
2809          * and MDS. If the CPU suffers from either of them, then SMT exclusion
2810          * is required.
2811          *
2812          * If any of these are present, then we need to flush u-arch state at
2813          * various points. For MDS, we need to do so whenever we change to a
2814          * lesser privilege level or we are halting the CPU. For L1TF we need to
2815          * flush the L1D cache at VM entry. When we have microcode that handles
2816          * MDS, the L1D flush also clears the other u-arch state that the
2817          * md_clear does.
2818          */
2819 
2820         /*
2821          * Update whether or not we need to be taking explicit action against
2822          * MDS.
2823          */
2824         cpuid_update_md_clear(cpu, featureset);
2825 
2826         /*
2827          * Determine whether SMT exclusion is required and whether or not we
2828          * need to perform an l1d flush.
2829          */
2830         cpuid_update_l1d_flush(cpu, featureset);
2831 }
2832 
2833 /*
2834  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
2835  */
2836 void
2837 setup_xfem(void)
2838 {
2839         uint64_t flags = XFEATURE_LEGACY_FP;
2840 
2841         ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2842 
2843         if (is_x86_feature(x86_featureset, X86FSET_SSE))
2844                 flags |= XFEATURE_SSE;
2845 
2846         if (is_x86_feature(x86_featureset, X86FSET_AVX))
2847                 flags |= XFEATURE_AVX;
2848 
2849         if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2850                 flags |= XFEATURE_AVX512;
2851 
2852         set_xcr(XFEATURE_ENABLED_MASK, flags);
2853 
2854         xsave_bv_all = flags;
2855 }
2856 
2857 static void
2858 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2859 {
2860         struct cpuid_info *cpi;
2861 
2862         cpi = cpu->cpu_m.mcpu_cpi;
2863 
2864         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2865                 cpuid_gather_amd_topology_leaves(cpu);
2866         }
2867 
2868         cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2869 
2870         /*
2871          * Before we can calculate the IDs that we should assign to this
2872          * processor, we need to understand how many cores and threads it has.
2873          */
2874         switch (cpi->cpi_vendor) {
2875         case X86_VENDOR_Intel:
2876                 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2877                     &cpi->cpi_ncore_per_chip);
2878                 break;
2879         case X86_VENDOR_AMD:
2880                 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2881                     &cpi->cpi_ncore_per_chip);
2882                 break;
2883         default:
2884                 /*
2885                  * If we have some other x86 compatible chip, it's not clear how
2886                  * they would behave. The most common case is virtualization
2887                  * today, though there are also 64-bit VIA chips. Assume that
2888                  * all we can get is the basic Leaf 1 HTT information.
2889                  */
2890                 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2891                         cpi->cpi_ncore_per_chip = 1;
2892                         cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2893                 }
2894                 break;
2895         }
2896 
2897         /*
2898          * Based on the calculated number of threads and cores, potentially
2899          * assign the HTT and CMT features.
2900          */
2901         if (cpi->cpi_ncore_per_chip > 1) {
2902                 add_x86_feature(featureset, X86FSET_CMP);
2903         }
2904 
2905         if (cpi->cpi_ncpu_per_chip > 1 &&
2906             cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2907                 add_x86_feature(featureset, X86FSET_HTT);
2908         }
2909 
2910         /*
2911          * Now that has been set up, we need to go through and calculate all of
2912          * the rest of the parameters that exist. If we think the CPU doesn't
2913          * have either SMT (HTT) or CMP, then we basically go through and fake
2914          * up information in some way. The most likely case for this is
2915          * virtualization where we have a lot of partial topology information.
2916          */
2917         if (!is_x86_feature(featureset, X86FSET_HTT) &&
2918             !is_x86_feature(featureset, X86FSET_CMP)) {
2919                 /*
2920                  * This is a single core, single-threaded processor.
2921                  */
2922                 cpi->cpi_procnodes_per_pkg = 1;
2923                 cpi->cpi_cores_per_compunit = 1;
2924                 cpi->cpi_compunitid = 0;
2925                 cpi->cpi_chipid = -1;
2926                 cpi->cpi_clogid = 0;
2927                 cpi->cpi_coreid = cpu->cpu_id;
2928                 cpi->cpi_pkgcoreid = 0;
2929                 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2930                         cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2931                 } else {
2932                         cpi->cpi_procnodeid = cpi->cpi_chipid;
2933                 }
2934         } else {
2935                 switch (cpi->cpi_vendor) {
2936                 case X86_VENDOR_Intel:
2937                         cpuid_intel_getids(cpu, featureset);
2938                         break;
2939                 case X86_VENDOR_AMD:
2940                         cpuid_amd_getids(cpu, featureset);
2941                         break;
2942                 default:
2943                         /*
2944                          * In this case, it's hard to say what we should do.
2945                          * We're going to model them to the OS as single core
2946                          * threads. We don't have a good identifier for them, so
2947                          * we're just going to use the cpu id all on a single
2948                          * chip.
2949                          *
2950                          * This case has historically been different from the
2951                          * case above where we don't have HTT or CMP. While they
2952                          * could be combined, we've opted to keep it separate to
2953                          * minimize the risk of topology changes in weird cases.
2954                          */
2955                         cpi->cpi_procnodes_per_pkg = 1;
2956                         cpi->cpi_cores_per_compunit = 1;
2957                         cpi->cpi_chipid = 0;
2958                         cpi->cpi_coreid = cpu->cpu_id;
2959                         cpi->cpi_clogid = cpu->cpu_id;
2960                         cpi->cpi_pkgcoreid = cpu->cpu_id;
2961                         cpi->cpi_procnodeid = cpi->cpi_chipid;
2962                         cpi->cpi_compunitid = cpi->cpi_coreid;
2963                         break;
2964                 }
2965         }
2966 }
2967 
2968 /*
2969  * Gather relevant CPU features from leaf 6 which covers thermal information. We
2970  * always gather leaf 6 if it's supported; however, we only look for features on
2971  * Intel systems as AMD does not currently define any of the features we look
2972  * for below.
2973  */
2974 static void
2975 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
2976 {
2977         struct cpuid_regs *cp;
2978         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2979 
2980         if (cpi->cpi_maxeax < 6) {
2981                 return;
2982         }
2983 
2984         cp = &cpi->cpi_std[6];
2985         cp->cp_eax = 6;
2986         cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
2987         (void) __cpuid_insn(cp);
2988         platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
2989 
2990         if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2991                 return;
2992         }
2993 
2994         if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
2995                 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
2996         }
2997 
2998         if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
2999                 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3000         }
3001 }
3002 
3003 void
3004 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3005 {
3006         uint32_t mask_ecx, mask_edx;
3007         struct cpuid_info *cpi;
3008         struct cpuid_regs *cp;
3009         int xcpuid;
3010 #if !defined(__xpv)
3011         extern int idle_cpu_prefer_mwait;
3012 #endif
3013 
3014         /*
3015          * Space statically allocated for BSP, ensure pointer is set
3016          */
3017         if (cpu->cpu_id == 0) {
3018                 if (cpu->cpu_m.mcpu_cpi == NULL)
3019                         cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3020         }
3021 
3022         add_x86_feature(featureset, X86FSET_CPUID);
3023 
3024         cpi = cpu->cpu_m.mcpu_cpi;
3025         ASSERT(cpi != NULL);
3026         cp = &cpi->cpi_std[0];
3027         cp->cp_eax = 0;
3028         cpi->cpi_maxeax = __cpuid_insn(cp);
3029         {
3030                 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3031                 *iptr++ = cp->cp_ebx;
3032                 *iptr++ = cp->cp_edx;
3033                 *iptr++ = cp->cp_ecx;
3034                 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3035         }
3036 
3037         cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3038         x86_vendor = cpi->cpi_vendor; /* for compatibility */
3039 
3040         /*
3041          * Limit the range in case of weird hardware
3042          */
3043         if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3044                 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3045         if (cpi->cpi_maxeax < 1)
3046                 goto pass1_done;
3047 
3048         cp = &cpi->cpi_std[1];
3049         cp->cp_eax = 1;
3050         (void) __cpuid_insn(cp);
3051 
3052         /*
3053          * Extract identifying constants for easy access.
3054          */
3055         cpi->cpi_model = CPI_MODEL(cpi);
3056         cpi->cpi_family = CPI_FAMILY(cpi);
3057 
3058         if (cpi->cpi_family == 0xf)
3059                 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3060 
3061         /*
3062          * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3063          * Intel, and presumably everyone else, uses model == 0xf, as
3064          * one would expect (max value means possible overflow).  Sigh.
3065          */
3066 
3067         switch (cpi->cpi_vendor) {
3068         case X86_VENDOR_Intel:
3069                 if (IS_EXTENDED_MODEL_INTEL(cpi))
3070                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3071                 break;
3072         case X86_VENDOR_AMD:
3073                 if (CPI_FAMILY(cpi) == 0xf)
3074                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3075                 break;
3076         default:
3077                 if (cpi->cpi_model == 0xf)
3078                         cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3079                 break;
3080         }
3081 
3082         cpi->cpi_step = CPI_STEP(cpi);
3083         cpi->cpi_brandid = CPI_BRANDID(cpi);
3084 
3085         /*
3086          * *default* assumptions:
3087          * - believe %edx feature word
3088          * - ignore %ecx feature word
3089          * - 32-bit virtual and physical addressing
3090          */
3091         mask_edx = 0xffffffff;
3092         mask_ecx = 0;
3093 
3094         cpi->cpi_pabits = cpi->cpi_vabits = 32;
3095 
3096         switch (cpi->cpi_vendor) {
3097         case X86_VENDOR_Intel:
3098                 if (cpi->cpi_family == 5)
3099                         x86_type = X86_TYPE_P5;
3100                 else if (IS_LEGACY_P6(cpi)) {
3101                         x86_type = X86_TYPE_P6;
3102                         pentiumpro_bug4046376 = 1;
3103                         /*
3104                          * Clear the SEP bit when it was set erroneously
3105                          */
3106                         if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3107                                 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3108                 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3109                         x86_type = X86_TYPE_P4;
3110                         /*
3111                          * We don't currently depend on any of the %ecx
3112                          * features until Prescott, so we'll only check
3113                          * this from P4 onwards.  We might want to revisit
3114                          * that idea later.
3115                          */
3116                         mask_ecx = 0xffffffff;
3117                 } else if (cpi->cpi_family > 0xf)
3118                         mask_ecx = 0xffffffff;
3119                 /*
3120                  * We don't support MONITOR/MWAIT if leaf 5 is not available
3121                  * to obtain the monitor linesize.
3122                  */
3123                 if (cpi->cpi_maxeax < 5)
3124                         mask_ecx &= ~CPUID_INTC_ECX_MON;
3125                 break;
3126         case X86_VENDOR_IntelClone:
3127         default:
3128                 break;
3129         case X86_VENDOR_AMD:
3130 #if defined(OPTERON_ERRATUM_108)
3131                 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3132                         cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3133                         cpi->cpi_model = 0xc;
3134                 } else
3135 #endif
3136                 if (cpi->cpi_family == 5) {
3137                         /*
3138                          * AMD K5 and K6
3139                          *
3140                          * These CPUs have an incomplete implementation
3141                          * of MCA/MCE which we mask away.
3142                          */
3143                         mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3144 
3145                         /*
3146                          * Model 0 uses the wrong (APIC) bit
3147                          * to indicate PGE.  Fix it here.
3148                          */
3149                         if (cpi->cpi_model == 0) {
3150                                 if (cp->cp_edx & 0x200) {
3151                                         cp->cp_edx &= ~0x200;
3152                                         cp->cp_edx |= CPUID_INTC_EDX_PGE;
3153                                 }
3154                         }
3155 
3156                         /*
3157                          * Early models had problems w/ MMX; disable.
3158                          */
3159                         if (cpi->cpi_model < 6)
3160                                 mask_edx &= ~CPUID_INTC_EDX_MMX;
3161                 }
3162 
3163                 /*
3164                  * For newer families, SSE3 and CX16, at least, are valid;
3165                  * enable all
3166                  */
3167                 if (cpi->cpi_family >= 0xf)
3168                         mask_ecx = 0xffffffff;
3169                 /*
3170                  * We don't support MONITOR/MWAIT if leaf 5 is not available
3171                  * to obtain the monitor linesize.
3172                  */
3173                 if (cpi->cpi_maxeax < 5)
3174                         mask_ecx &= ~CPUID_INTC_ECX_MON;
3175 
3176 #if !defined(__xpv)
3177                 /*
3178                  * AMD has not historically used MWAIT in the CPU's idle loop.
3179                  * Pre-family-10h Opterons do not have the MWAIT instruction. We
3180                  * know for certain that in at least family 17h, per AMD, mwait
3181                  * is preferred. Families in-between are less certain.
3182                  */
3183                 if (cpi->cpi_family < 0x17) {
3184                         idle_cpu_prefer_mwait = 0;
3185                 }
3186 #endif
3187 
3188                 break;
3189         case X86_VENDOR_TM:
3190                 /*
3191                  * workaround the NT workaround in CMS 4.1
3192                  */
3193                 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3194                     (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3195                         cp->cp_edx |= CPUID_INTC_EDX_CX8;
3196                 break;
3197         case X86_VENDOR_Centaur:
3198                 /*
3199                  * workaround the NT workarounds again
3200                  */
3201                 if (cpi->cpi_family == 6)
3202                         cp->cp_edx |= CPUID_INTC_EDX_CX8;
3203                 break;
3204         case X86_VENDOR_Cyrix:
3205                 /*
3206                  * We rely heavily on the probing in locore
3207                  * to actually figure out what parts, if any,
3208                  * of the Cyrix cpuid instruction to believe.
3209                  */
3210                 switch (x86_type) {
3211                 case X86_TYPE_CYRIX_486:
3212                         mask_edx = 0;
3213                         break;
3214                 case X86_TYPE_CYRIX_6x86:
3215                         mask_edx = 0;
3216                         break;
3217                 case X86_TYPE_CYRIX_6x86L:
3218                         mask_edx =
3219                             CPUID_INTC_EDX_DE |
3220                             CPUID_INTC_EDX_CX8;
3221                         break;
3222                 case X86_TYPE_CYRIX_6x86MX:
3223                         mask_edx =
3224                             CPUID_INTC_EDX_DE |
3225                             CPUID_INTC_EDX_MSR |
3226                             CPUID_INTC_EDX_CX8 |
3227                             CPUID_INTC_EDX_PGE |
3228                             CPUID_INTC_EDX_CMOV |
3229                             CPUID_INTC_EDX_MMX;
3230                         break;
3231                 case X86_TYPE_CYRIX_GXm:
3232                         mask_edx =
3233                             CPUID_INTC_EDX_MSR |
3234                             CPUID_INTC_EDX_CX8 |
3235                             CPUID_INTC_EDX_CMOV |
3236                             CPUID_INTC_EDX_MMX;
3237                         break;
3238                 case X86_TYPE_CYRIX_MediaGX:
3239                         break;
3240                 case X86_TYPE_CYRIX_MII:
3241                 case X86_TYPE_VIA_CYRIX_III:
3242                         mask_edx =
3243                             CPUID_INTC_EDX_DE |
3244                             CPUID_INTC_EDX_TSC |
3245                             CPUID_INTC_EDX_MSR |
3246                             CPUID_INTC_EDX_CX8 |
3247                             CPUID_INTC_EDX_PGE |
3248                             CPUID_INTC_EDX_CMOV |
3249                             CPUID_INTC_EDX_MMX;
3250                         break;
3251                 default:
3252                         break;
3253                 }
3254                 break;
3255         }
3256 
3257 #if defined(__xpv)
3258         /*
3259          * Do not support MONITOR/MWAIT under a hypervisor
3260          */
3261         mask_ecx &= ~CPUID_INTC_ECX_MON;
3262         /*
3263          * Do not support XSAVE under a hypervisor for now
3264          */
3265         xsave_force_disable = B_TRUE;
3266 
3267 #endif  /* __xpv */
3268 
3269         if (xsave_force_disable) {
3270                 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3271                 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3272                 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3273                 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3274         }
3275 
3276         /*
3277          * Now we've figured out the masks that determine
3278          * which bits we choose to believe, apply the masks
3279          * to the feature words, then map the kernel's view
3280          * of these feature words into its feature word.
3281          */
3282         cp->cp_edx &= mask_edx;
3283         cp->cp_ecx &= mask_ecx;
3284 
3285         /*
3286          * apply any platform restrictions (we don't call this
3287          * immediately after __cpuid_insn here, because we need the
3288          * workarounds applied above first)
3289          */
3290         platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3291 
3292         /*
3293          * In addition to ecx and edx, Intel and AMD are storing a bunch of
3294          * instruction set extensions in leaf 7's ebx, ecx, and edx.
3295          */
3296         if (cpi->cpi_maxeax >= 7) {
3297                 struct cpuid_regs *ecp;
3298                 ecp = &cpi->cpi_std[7];
3299                 ecp->cp_eax = 7;
3300                 ecp->cp_ecx = 0;
3301                 (void) __cpuid_insn(ecp);
3302 
3303                 /*
3304                  * If XSAVE has been disabled, just ignore all of the
3305                  * extended-save-area dependent flags here.
3306                  */
3307                 if (xsave_force_disable) {
3308                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3309                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3310                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3311                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3312                         ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3313                         ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3314                         ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3315                 }
3316 
3317                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3318                         add_x86_feature(featureset, X86FSET_SMEP);
3319 
3320                 /*
3321                  * We check disable_smap here in addition to in startup_smap()
3322                  * to ensure CPUs that aren't the boot CPU don't accidentally
3323                  * include it in the feature set and thus generate a mismatched
3324                  * x86 feature set across CPUs.
3325                  */
3326                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3327                     disable_smap == 0)
3328                         add_x86_feature(featureset, X86FSET_SMAP);
3329 
3330                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3331                         add_x86_feature(featureset, X86FSET_RDSEED);
3332 
3333                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3334                         add_x86_feature(featureset, X86FSET_ADX);
3335 
3336                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3337                         add_x86_feature(featureset, X86FSET_FSGSBASE);
3338 
3339                 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3340                         add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3341 
3342                 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3343                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3344                                 add_x86_feature(featureset, X86FSET_INVPCID);
3345 
3346                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3347                                 add_x86_feature(featureset, X86FSET_MPX);
3348 
3349                         if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3350                                 add_x86_feature(featureset, X86FSET_CLWB);
3351                 }
3352         }
3353 
3354         /*
3355          * fold in overrides from the "eeprom" mechanism
3356          */
3357         cp->cp_edx |= cpuid_feature_edx_include;
3358         cp->cp_edx &= ~cpuid_feature_edx_exclude;
3359 
3360         cp->cp_ecx |= cpuid_feature_ecx_include;
3361         cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3362 
3363         if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3364                 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3365         }
3366         if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3367                 add_x86_feature(featureset, X86FSET_TSC);
3368         }
3369         if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3370                 add_x86_feature(featureset, X86FSET_MSR);
3371         }
3372         if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3373                 add_x86_feature(featureset, X86FSET_MTRR);
3374         }
3375         if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3376                 add_x86_feature(featureset, X86FSET_PGE);
3377         }
3378         if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3379                 add_x86_feature(featureset, X86FSET_CMOV);
3380         }
3381         if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3382                 add_x86_feature(featureset, X86FSET_MMX);
3383         }
3384         if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3385             (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3386                 add_x86_feature(featureset, X86FSET_MCA);
3387         }
3388         if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3389                 add_x86_feature(featureset, X86FSET_PAE);
3390         }
3391         if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3392                 add_x86_feature(featureset, X86FSET_CX8);
3393         }
3394         if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3395                 add_x86_feature(featureset, X86FSET_CX16);
3396         }
3397         if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3398                 add_x86_feature(featureset, X86FSET_PAT);
3399         }
3400         if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3401                 add_x86_feature(featureset, X86FSET_SEP);
3402         }
3403         if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3404                 /*
3405                  * In our implementation, fxsave/fxrstor
3406                  * are prerequisites before we'll even
3407                  * try and do SSE things.
3408                  */
3409                 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3410                         add_x86_feature(featureset, X86FSET_SSE);
3411                 }
3412                 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3413                         add_x86_feature(featureset, X86FSET_SSE2);
3414                 }
3415                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3416                         add_x86_feature(featureset, X86FSET_SSE3);
3417                 }
3418                 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3419                         add_x86_feature(featureset, X86FSET_SSSE3);
3420                 }
3421                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3422                         add_x86_feature(featureset, X86FSET_SSE4_1);
3423                 }
3424                 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3425                         add_x86_feature(featureset, X86FSET_SSE4_2);
3426                 }
3427                 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3428                         add_x86_feature(featureset, X86FSET_AES);
3429                 }
3430                 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3431                         add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3432                 }
3433 
3434                 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3435                         add_x86_feature(featureset, X86FSET_SHA);
3436 
3437                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3438                         add_x86_feature(featureset, X86FSET_UMIP);
3439                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3440                         add_x86_feature(featureset, X86FSET_PKU);
3441                 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3442                         add_x86_feature(featureset, X86FSET_OSPKE);
3443 
3444                 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3445                         add_x86_feature(featureset, X86FSET_XSAVE);
3446 
3447                         /* We only test AVX & AVX512 when there is XSAVE */
3448 
3449                         if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3450                                 add_x86_feature(featureset,
3451                                     X86FSET_AVX);
3452 
3453                                 /*
3454                                  * Intel says we can't check these without also
3455                                  * checking AVX.
3456                                  */
3457                                 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3458                                         add_x86_feature(featureset,
3459                                             X86FSET_F16C);
3460 
3461                                 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3462                                         add_x86_feature(featureset,
3463                                             X86FSET_FMA);
3464 
3465                                 if (cpi->cpi_std[7].cp_ebx &
3466                                     CPUID_INTC_EBX_7_0_BMI1)
3467                                         add_x86_feature(featureset,
3468                                             X86FSET_BMI1);
3469 
3470                                 if (cpi->cpi_std[7].cp_ebx &
3471                                     CPUID_INTC_EBX_7_0_BMI2)
3472                                         add_x86_feature(featureset,
3473                                             X86FSET_BMI2);
3474 
3475                                 if (cpi->cpi_std[7].cp_ebx &
3476                                     CPUID_INTC_EBX_7_0_AVX2)
3477                                         add_x86_feature(featureset,
3478                                             X86FSET_AVX2);
3479                         }
3480 
3481                         if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3482                             (cpi->cpi_std[7].cp_ebx &
3483                             CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3484                                 add_x86_feature(featureset, X86FSET_AVX512F);
3485 
3486                                 if (cpi->cpi_std[7].cp_ebx &
3487                                     CPUID_INTC_EBX_7_0_AVX512DQ)
3488                                         add_x86_feature(featureset,
3489                                             X86FSET_AVX512DQ);
3490                                 if (cpi->cpi_std[7].cp_ebx &
3491                                     CPUID_INTC_EBX_7_0_AVX512IFMA)
3492                                         add_x86_feature(featureset,
3493                                             X86FSET_AVX512FMA);
3494                                 if (cpi->cpi_std[7].cp_ebx &
3495                                     CPUID_INTC_EBX_7_0_AVX512PF)
3496                                         add_x86_feature(featureset,
3497                                             X86FSET_AVX512PF);
3498                                 if (cpi->cpi_std[7].cp_ebx &
3499                                     CPUID_INTC_EBX_7_0_AVX512ER)
3500                                         add_x86_feature(featureset,
3501                                             X86FSET_AVX512ER);
3502                                 if (cpi->cpi_std[7].cp_ebx &
3503                                     CPUID_INTC_EBX_7_0_AVX512CD)
3504                                         add_x86_feature(featureset,
3505                                             X86FSET_AVX512CD);
3506                                 if (cpi->cpi_std[7].cp_ebx &
3507                                     CPUID_INTC_EBX_7_0_AVX512BW)
3508                                         add_x86_feature(featureset,
3509                                             X86FSET_AVX512BW);
3510                                 if (cpi->cpi_std[7].cp_ebx &
3511                                     CPUID_INTC_EBX_7_0_AVX512VL)
3512                                         add_x86_feature(featureset,
3513                                             X86FSET_AVX512VL);
3514 
3515                                 if (cpi->cpi_std[7].cp_ecx &
3516                                     CPUID_INTC_ECX_7_0_AVX512VBMI)
3517                                         add_x86_feature(featureset,
3518                                             X86FSET_AVX512VBMI);
3519                                 if (cpi->cpi_std[7].cp_ecx &
3520                                     CPUID_INTC_ECX_7_0_AVX512VNNI)
3521                                         add_x86_feature(featureset,
3522                                             X86FSET_AVX512VNNI);
3523                                 if (cpi->cpi_std[7].cp_ecx &
3524                                     CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3525                                         add_x86_feature(featureset,
3526                                             X86FSET_AVX512VPOPCDQ);
3527 
3528                                 if (cpi->cpi_std[7].cp_edx &
3529                                     CPUID_INTC_EDX_7_0_AVX5124NNIW)
3530                                         add_x86_feature(featureset,
3531                                             X86FSET_AVX512NNIW);
3532                                 if (cpi->cpi_std[7].cp_edx &
3533                                     CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3534                                         add_x86_feature(featureset,
3535                                             X86FSET_AVX512FMAPS);
3536                         }
3537                 }
3538         }
3539 
3540         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3541                 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3542                         add_x86_feature(featureset, X86FSET_PCID);
3543                 }
3544         }
3545 
3546         if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3547                 add_x86_feature(featureset, X86FSET_X2APIC);
3548         }
3549         if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3550                 add_x86_feature(featureset, X86FSET_DE);
3551         }
3552 #if !defined(__xpv)
3553         if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3554 
3555                 /*
3556                  * We require the CLFLUSH instruction for erratum workaround
3557                  * to use MONITOR/MWAIT.
3558                  */
3559                 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3560                         cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3561                         add_x86_feature(featureset, X86FSET_MWAIT);
3562                 } else {
3563                         extern int idle_cpu_assert_cflush_monitor;
3564 
3565                         /*
3566                          * All processors we are aware of which have
3567                          * MONITOR/MWAIT also have CLFLUSH.
3568                          */
3569                         if (idle_cpu_assert_cflush_monitor) {
3570                                 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3571                                     (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3572                         }
3573                 }
3574         }
3575 #endif  /* __xpv */
3576 
3577         if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3578                 add_x86_feature(featureset, X86FSET_VMX);
3579         }
3580 
3581         if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3582                 add_x86_feature(featureset, X86FSET_RDRAND);
3583 
3584         /*
3585          * Only need it first time, rest of the cpus would follow suit.
3586          * we only capture this for the bootcpu.
3587          */
3588         if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3589                 add_x86_feature(featureset, X86FSET_CLFSH);
3590                 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3591         }
3592         if (is_x86_feature(featureset, X86FSET_PAE))
3593                 cpi->cpi_pabits = 36;
3594 
3595         if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3596                 struct cpuid_regs r, *ecp;
3597 
3598                 ecp = &r;
3599                 ecp->cp_eax = 0xD;
3600                 ecp->cp_ecx = 1;
3601                 ecp->cp_edx = ecp->cp_ebx = 0;
3602                 (void) __cpuid_insn(ecp);
3603 
3604                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3605                         add_x86_feature(featureset, X86FSET_XSAVEOPT);
3606                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3607                         add_x86_feature(featureset, X86FSET_XSAVEC);
3608                 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3609                         add_x86_feature(featureset, X86FSET_XSAVES);
3610         }
3611 
3612         /*
3613          * Work on the "extended" feature information, doing
3614          * some basic initialization for cpuid_pass2()
3615          */
3616         xcpuid = 0;
3617         switch (cpi->cpi_vendor) {
3618         case X86_VENDOR_Intel:
3619                 /*
3620                  * On KVM we know we will have proper support for extended
3621                  * cpuid.
3622                  */
3623                 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3624                     (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3625                     (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3626                         xcpuid++;
3627                 break;
3628         case X86_VENDOR_AMD:
3629                 if (cpi->cpi_family > 5 ||
3630                     (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3631                         xcpuid++;
3632                 break;
3633         case X86_VENDOR_Cyrix:
3634                 /*
3635                  * Only these Cyrix CPUs are -known- to support
3636                  * extended cpuid operations.
3637                  */
3638                 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3639                     x86_type == X86_TYPE_CYRIX_GXm)
3640                         xcpuid++;
3641                 break;
3642         case X86_VENDOR_Centaur:
3643         case X86_VENDOR_TM:
3644         default:
3645                 xcpuid++;
3646                 break;
3647         }
3648 
3649         if (xcpuid) {
3650                 cp = &cpi->cpi_extd[0];
3651                 cp->cp_eax = CPUID_LEAF_EXT_0;
3652                 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3653         }
3654 
3655         if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3656 
3657                 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3658                         cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3659 
3660                 switch (cpi->cpi_vendor) {
3661                 case X86_VENDOR_Intel:
3662                 case X86_VENDOR_AMD:
3663                         if (cpi->cpi_xmaxeax < 0x80000001)
3664                                 break;
3665                         cp = &cpi->cpi_extd[1];
3666                         cp->cp_eax = 0x80000001;
3667                         (void) __cpuid_insn(cp);
3668 
3669                         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3670                             cpi->cpi_family == 5 &&
3671                             cpi->cpi_model == 6 &&
3672                             cpi->cpi_step == 6) {
3673                                 /*
3674                                  * K6 model 6 uses bit 10 to indicate SYSC
3675                                  * Later models use bit 11. Fix it here.
3676                                  */
3677                                 if (cp->cp_edx & 0x400) {
3678                                         cp->cp_edx &= ~0x400;
3679                                         cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3680                                 }
3681                         }
3682 
3683                         platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3684 
3685                         /*
3686                          * Compute the additions to the kernel's feature word.
3687                          */
3688                         if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3689                                 add_x86_feature(featureset, X86FSET_NX);
3690                         }
3691 
3692                         /*
3693                          * Regardless whether or not we boot 64-bit,
3694                          * we should have a way to identify whether
3695                          * the CPU is capable of running 64-bit.
3696                          */
3697                         if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3698                                 add_x86_feature(featureset, X86FSET_64);
3699                         }
3700 
3701                         /* 1 GB large page - enable only for 64 bit kernel */
3702                         if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3703                                 add_x86_feature(featureset, X86FSET_1GPG);
3704                         }
3705 
3706                         if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3707                             (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3708                             (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3709                                 add_x86_feature(featureset, X86FSET_SSE4A);
3710                         }
3711 
3712                         /*
3713                          * It's really tricky to support syscall/sysret in
3714                          * the i386 kernel; we rely on sysenter/sysexit
3715                          * instead.  In the amd64 kernel, things are -way-
3716                          * better.
3717                          */
3718                         if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3719                                 add_x86_feature(featureset, X86FSET_ASYSC);
3720                         }
3721 
3722                         /*
3723                          * While we're thinking about system calls, note
3724                          * that AMD processors don't support sysenter
3725                          * in long mode at all, so don't try to program them.
3726                          */
3727                         if (x86_vendor == X86_VENDOR_AMD) {
3728                                 remove_x86_feature(featureset, X86FSET_SEP);
3729                         }
3730 
3731                         if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3732                                 add_x86_feature(featureset, X86FSET_TSCP);
3733                         }
3734 
3735                         if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3736                                 add_x86_feature(featureset, X86FSET_SVM);
3737                         }
3738 
3739                         if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3740                                 add_x86_feature(featureset, X86FSET_TOPOEXT);
3741                         }
3742 
3743                         if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3744                                 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3745                         }
3746 
3747                         if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3748                                 add_x86_feature(featureset, X86FSET_XOP);
3749                         }
3750 
3751                         if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3752                                 add_x86_feature(featureset, X86FSET_FMA4);
3753                         }
3754 
3755                         if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3756                                 add_x86_feature(featureset, X86FSET_TBM);
3757                         }
3758 
3759                         if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3760                                 add_x86_feature(featureset, X86FSET_MONITORX);
3761                         }
3762                         break;
3763                 default:
3764                         break;
3765                 }
3766 
3767                 /*
3768                  * Get CPUID data about processor cores and hyperthreads.
3769                  */
3770                 switch (cpi->cpi_vendor) {
3771                 case X86_VENDOR_Intel:
3772                         if (cpi->cpi_maxeax >= 4) {
3773                                 cp = &cpi->cpi_std[4];
3774                                 cp->cp_eax = 4;
3775                                 cp->cp_ecx = 0;
3776                                 (void) __cpuid_insn(cp);
3777                                 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3778                         }
3779                         /*FALLTHROUGH*/
3780                 case X86_VENDOR_AMD:
3781                         if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3782                                 break;
3783                         cp = &cpi->cpi_extd[8];
3784                         cp->cp_eax = CPUID_LEAF_EXT_8;
3785                         (void) __cpuid_insn(cp);
3786                         platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3787                             cp);
3788 
3789                         /*
3790                          * AMD uses ebx for some extended functions.
3791                          */
3792                         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3793                                 /*
3794                                  * While we're here, check for the AMD "Error
3795                                  * Pointer Zero/Restore" feature. This can be
3796                                  * used to setup the FP save handlers
3797                                  * appropriately.
3798                                  */
3799                                 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3800                                         cpi->cpi_fp_amd_save = 0;
3801                                 } else {
3802                                         cpi->cpi_fp_amd_save = 1;
3803                                 }
3804 
3805                                 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3806                                         add_x86_feature(featureset,
3807                                             X86FSET_CLZERO);
3808                                 }
3809                         }
3810 
3811                         /*
3812                          * Virtual and physical address limits from
3813                          * cpuid override previously guessed values.
3814                          */
3815                         cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3816                         cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3817                         break;
3818                 default:
3819                         break;
3820                 }
3821 
3822                 /*
3823                  * Get CPUID data about TSC Invariance in Deep C-State.
3824                  */
3825                 switch (cpi->cpi_vendor) {
3826                 case X86_VENDOR_Intel:
3827                 case X86_VENDOR_AMD:
3828                         if (cpi->cpi_maxeax >= 7) {
3829                                 cp = &cpi->cpi_extd[7];
3830                                 cp->cp_eax = 0x80000007;
3831                                 cp->cp_ecx = 0;
3832                                 (void) __cpuid_insn(cp);
3833                         }
3834                         break;
3835                 default:
3836                         break;
3837                 }
3838         }
3839 
3840         cpuid_pass1_topology(cpu, featureset);
3841         cpuid_pass1_thermal(cpu, featureset);
3842 
3843         /*
3844          * Synthesize chip "revision" and socket type
3845          */
3846         cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3847             cpi->cpi_model, cpi->cpi_step);
3848         cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3849             cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3850         cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3851             cpi->cpi_model, cpi->cpi_step);
3852 
3853         if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3854                 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3855                     cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3856                         /* Special handling for AMD FP not necessary. */
3857                         cpi->cpi_fp_amd_save = 0;
3858                 } else {
3859                         cpi->cpi_fp_amd_save = 1;
3860                 }
3861         }
3862 
3863         /*
3864          * Check the processor leaves that are used for security features.
3865          */
3866         cpuid_scan_security(cpu, featureset);
3867 
3868 pass1_done:
3869         cpi->cpi_pass = 1;
3870 }
3871 
3872 /*
3873  * Make copies of the cpuid table entries we depend on, in
3874  * part for ease of parsing now, in part so that we have only
3875  * one place to correct any of it, in part for ease of
3876  * later export to userland, and in part so we can look at
3877  * this stuff in a crash dump.
3878  */
3879 
3880 /*ARGSUSED*/
3881 void
3882 cpuid_pass2(cpu_t *cpu)
3883 {
3884         uint_t n, nmax;
3885         int i;
3886         struct cpuid_regs *cp;
3887         uint8_t *dp;
3888         uint32_t *iptr;
3889         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3890 
3891         ASSERT(cpi->cpi_pass == 1);
3892 
3893         if (cpi->cpi_maxeax < 1)
3894                 goto pass2_done;
3895 
3896         if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3897                 nmax = NMAX_CPI_STD;
3898         /*
3899          * (We already handled n == 0 and n == 1 in pass 1)
3900          */
3901         for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3902                 /*
3903                  * leaves 6 and 7 were handled in pass 1
3904                  */
3905                 if (n == 6 || n == 7)
3906                         continue;
3907 
3908                 cp->cp_eax = n;
3909 
3910                 /*
3911                  * CPUID function 4 expects %ecx to be initialized
3912                  * with an index which indicates which cache to return
3913                  * information about. The OS is expected to call function 4
3914                  * with %ecx set to 0, 1, 2, ... until it returns with
3915                  * EAX[4:0] set to 0, which indicates there are no more
3916                  * caches.
3917                  *
3918                  * Here, populate cpi_std[4] with the information returned by
3919                  * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3920                  * when dynamic memory allocation becomes available.
3921                  *
3922                  * Note: we need to explicitly initialize %ecx here, since
3923                  * function 4 may have been previously invoked.
3924                  */
3925                 if (n == 4)
3926                         cp->cp_ecx = 0;
3927 
3928                 (void) __cpuid_insn(cp);
3929                 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3930                 switch (n) {
3931                 case 2:
3932                         /*
3933                          * "the lower 8 bits of the %eax register
3934                          * contain a value that identifies the number
3935                          * of times the cpuid [instruction] has to be
3936                          * executed to obtain a complete image of the
3937                          * processor's caching systems."
3938                          *
3939                          * How *do* they make this stuff up?
3940                          */
3941                         cpi->cpi_ncache = sizeof (*cp) *
3942                             BITX(cp->cp_eax, 7, 0);
3943                         if (cpi->cpi_ncache == 0)
3944                                 break;
3945                         cpi->cpi_ncache--;   /* skip count byte */
3946 
3947                         /*
3948                          * Well, for now, rather than attempt to implement
3949                          * this slightly dubious algorithm, we just look
3950                          * at the first 15 ..
3951                          */
3952                         if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3953                                 cpi->cpi_ncache = sizeof (*cp) - 1;
3954 
3955                         dp = cpi->cpi_cacheinfo;
3956                         if (BITX(cp->cp_eax, 31, 31) == 0) {
3957                                 uint8_t *p = (void *)&cp->cp_eax;
3958                                 for (i = 1; i < 4; i++)
3959                                         if (p[i] != 0)
3960                                                 *dp++ = p[i];
3961                         }
3962                         if (BITX(cp->cp_ebx, 31, 31) == 0) {
3963                                 uint8_t *p = (void *)&cp->cp_ebx;
3964                                 for (i = 0; i < 4; i++)
3965                                         if (p[i] != 0)
3966                                                 *dp++ = p[i];
3967                         }
3968                         if (BITX(cp->cp_ecx, 31, 31) == 0) {
3969                                 uint8_t *p = (void *)&cp->cp_ecx;
3970                                 for (i = 0; i < 4; i++)
3971                                         if (p[i] != 0)
3972                                                 *dp++ = p[i];
3973                         }
3974                         if (BITX(cp->cp_edx, 31, 31) == 0) {
3975                                 uint8_t *p = (void *)&cp->cp_edx;
3976                                 for (i = 0; i < 4; i++)
3977                                         if (p[i] != 0)
3978                                                 *dp++ = p[i];
3979                         }
3980                         break;
3981 
3982                 case 3: /* Processor serial number, if PSN supported */
3983                         break;
3984 
3985                 case 4: /* Deterministic cache parameters */
3986                         break;
3987 
3988                 case 5: /* Monitor/Mwait parameters */
3989                 {
3990                         size_t mwait_size;
3991 
3992                         /*
3993                          * check cpi_mwait.support which was set in cpuid_pass1
3994                          */
3995                         if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
3996                                 break;
3997 
3998                         /*
3999                          * Protect ourself from insane mwait line size.
4000                          * Workaround for incomplete hardware emulator(s).
4001                          */
4002                         mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4003                         if (mwait_size < sizeof (uint32_t) ||
4004                             !ISP2(mwait_size)) {
4005 #if DEBUG
4006                                 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4007                                     "size %ld", cpu->cpu_id, (long)mwait_size);
4008 #endif
4009                                 break;
4010                         }
4011 
4012                         cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4013                         cpi->cpi_mwait.mon_max = mwait_size;
4014                         if (MWAIT_EXTENSION(cpi)) {
4015                                 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4016                                 if (MWAIT_INT_ENABLE(cpi))
4017                                         cpi->cpi_mwait.support |=
4018                                             MWAIT_ECX_INT_ENABLE;
4019                         }
4020                         break;
4021                 }
4022                 default:
4023                         break;
4024                 }
4025         }
4026 
4027         /*
4028          * XSAVE enumeration
4029          */
4030         if (cpi->cpi_maxeax >= 0xD) {
4031                 struct cpuid_regs regs;
4032                 boolean_t cpuid_d_valid = B_TRUE;
4033 
4034                 cp = &regs;
4035                 cp->cp_eax = 0xD;
4036                 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4037 
4038                 (void) __cpuid_insn(cp);
4039 
4040                 /*
4041                  * Sanity checks for debug
4042                  */
4043                 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4044                     (cp->cp_eax & XFEATURE_SSE) == 0) {
4045                         cpuid_d_valid = B_FALSE;
4046                 }
4047 
4048                 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4049                 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4050                 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4051 
4052                 /*
4053                  * If the hw supports AVX, get the size and offset in the save
4054                  * area for the ymm state.
4055                  */
4056                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4057                         cp->cp_eax = 0xD;
4058                         cp->cp_ecx = 2;
4059                         cp->cp_edx = cp->cp_ebx = 0;
4060 
4061                         (void) __cpuid_insn(cp);
4062 
4063                         if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4064                             cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4065                                 cpuid_d_valid = B_FALSE;
4066                         }
4067 
4068                         cpi->cpi_xsave.ymm_size = cp->cp_eax;
4069                         cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4070                 }
4071 
4072                 /*
4073                  * If the hw supports MPX, get the size and offset in the
4074                  * save area for BNDREGS and BNDCSR.
4075                  */
4076                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4077                         cp->cp_eax = 0xD;
4078                         cp->cp_ecx = 3;
4079                         cp->cp_edx = cp->cp_ebx = 0;
4080 
4081                         (void) __cpuid_insn(cp);
4082 
4083                         cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4084                         cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4085 
4086                         cp->cp_eax = 0xD;
4087                         cp->cp_ecx = 4;
4088                         cp->cp_edx = cp->cp_ebx = 0;
4089 
4090                         (void) __cpuid_insn(cp);
4091 
4092                         cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4093                         cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4094                 }
4095 
4096                 /*
4097                  * If the hw supports AVX512, get the size and offset in the
4098                  * save area for the opmask registers and zmm state.
4099                  */
4100                 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4101                         cp->cp_eax = 0xD;
4102                         cp->cp_ecx = 5;
4103                         cp->cp_edx = cp->cp_ebx = 0;
4104 
4105                         (void) __cpuid_insn(cp);
4106 
4107                         cpi->cpi_xsave.opmask_size = cp->cp_eax;
4108                         cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4109 
4110                         cp->cp_eax = 0xD;
4111                         cp->cp_ecx = 6;
4112                         cp->cp_edx = cp->cp_ebx = 0;
4113 
4114                         (void) __cpuid_insn(cp);
4115 
4116                         cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4117                         cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4118 
4119                         cp->cp_eax = 0xD;
4120                         cp->cp_ecx = 7;
4121                         cp->cp_edx = cp->cp_ebx = 0;
4122 
4123                         (void) __cpuid_insn(cp);
4124 
4125                         cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4126                         cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4127                 }
4128 
4129                 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4130                         xsave_state_size = 0;
4131                 } else if (cpuid_d_valid) {
4132                         xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4133                 } else {
4134                         /* Broken CPUID 0xD, probably in HVM */
4135                         cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4136                             "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4137                             ", ymm_size = %d, ymm_offset = %d\n",
4138                             cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4139                             cpi->cpi_xsave.xsav_hw_features_high,
4140                             (int)cpi->cpi_xsave.xsav_max_size,
4141                             (int)cpi->cpi_xsave.ymm_size,
4142                             (int)cpi->cpi_xsave.ymm_offset);
4143 
4144                         if (xsave_state_size != 0) {
4145                                 /*
4146                                  * This must be a non-boot CPU. We cannot
4147                                  * continue, because boot cpu has already
4148                                  * enabled XSAVE.
4149                                  */
4150                                 ASSERT(cpu->cpu_id != 0);
4151                                 cmn_err(CE_PANIC, "cpu%d: we have already "
4152                                     "enabled XSAVE on boot cpu, cannot "
4153                                     "continue.", cpu->cpu_id);
4154                         } else {
4155                                 /*
4156                                  * If we reached here on the boot CPU, it's also
4157                                  * almost certain that we'll reach here on the
4158                                  * non-boot CPUs. When we're here on a boot CPU
4159                                  * we should disable the feature, on a non-boot
4160                                  * CPU we need to confirm that we have.
4161                                  */
4162                                 if (cpu->cpu_id == 0) {
4163                                         remove_x86_feature(x86_featureset,
4164                                             X86FSET_XSAVE);
4165                                         remove_x86_feature(x86_featureset,
4166                                             X86FSET_AVX);
4167                                         remove_x86_feature(x86_featureset,
4168                                             X86FSET_F16C);
4169                                         remove_x86_feature(x86_featureset,
4170                                             X86FSET_BMI1);
4171                                         remove_x86_feature(x86_featureset,
4172                                             X86FSET_BMI2);
4173                                         remove_x86_feature(x86_featureset,
4174                                             X86FSET_FMA);
4175                                         remove_x86_feature(x86_featureset,
4176                                             X86FSET_AVX2);
4177                                         remove_x86_feature(x86_featureset,
4178                                             X86FSET_MPX);
4179                                         remove_x86_feature(x86_featureset,
4180                                             X86FSET_AVX512F);
4181                                         remove_x86_feature(x86_featureset,
4182                                             X86FSET_AVX512DQ);
4183                                         remove_x86_feature(x86_featureset,
4184                                             X86FSET_AVX512PF);
4185                                         remove_x86_feature(x86_featureset,
4186                                             X86FSET_AVX512ER);
4187                                         remove_x86_feature(x86_featureset,
4188                                             X86FSET_AVX512CD);
4189                                         remove_x86_feature(x86_featureset,
4190                                             X86FSET_AVX512BW);
4191                                         remove_x86_feature(x86_featureset,
4192                                             X86FSET_AVX512VL);
4193                                         remove_x86_feature(x86_featureset,
4194                                             X86FSET_AVX512FMA);
4195                                         remove_x86_feature(x86_featureset,
4196                                             X86FSET_AVX512VBMI);
4197                                         remove_x86_feature(x86_featureset,
4198                                             X86FSET_AVX512VNNI);
4199                                         remove_x86_feature(x86_featureset,
4200                                             X86FSET_AVX512VPOPCDQ);
4201                                         remove_x86_feature(x86_featureset,
4202                                             X86FSET_AVX512NNIW);
4203                                         remove_x86_feature(x86_featureset,
4204                                             X86FSET_AVX512FMAPS);
4205 
4206                                         CPI_FEATURES_ECX(cpi) &=
4207                                             ~CPUID_INTC_ECX_XSAVE;
4208                                         CPI_FEATURES_ECX(cpi) &=
4209                                             ~CPUID_INTC_ECX_AVX;
4210                                         CPI_FEATURES_ECX(cpi) &=
4211                                             ~CPUID_INTC_ECX_F16C;
4212                                         CPI_FEATURES_ECX(cpi) &=
4213                                             ~CPUID_INTC_ECX_FMA;
4214                                         CPI_FEATURES_7_0_EBX(cpi) &=
4215                                             ~CPUID_INTC_EBX_7_0_BMI1;
4216                                         CPI_FEATURES_7_0_EBX(cpi) &=
4217                                             ~CPUID_INTC_EBX_7_0_BMI2;
4218                                         CPI_FEATURES_7_0_EBX(cpi) &=
4219                                             ~CPUID_INTC_EBX_7_0_AVX2;
4220                                         CPI_FEATURES_7_0_EBX(cpi) &=
4221                                             ~CPUID_INTC_EBX_7_0_MPX;
4222                                         CPI_FEATURES_7_0_EBX(cpi) &=
4223                                             ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4224 
4225                                         CPI_FEATURES_7_0_ECX(cpi) &=
4226                                             ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4227 
4228                                         CPI_FEATURES_7_0_EDX(cpi) &=
4229                                             ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4230 
4231                                         xsave_force_disable = B_TRUE;
4232                                 } else {
4233                                         VERIFY(is_x86_feature(x86_featureset,
4234                                             X86FSET_XSAVE) == B_FALSE);
4235                                 }
4236                         }
4237                 }
4238         }
4239 
4240 
4241         if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4242                 goto pass2_done;
4243 
4244         if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4245                 nmax = NMAX_CPI_EXTD;
4246         /*
4247          * Copy the extended properties, fixing them as we go.
4248          * (We already handled n == 0 and n == 1 in pass 1)
4249          */
4250         iptr = (void *)cpi->cpi_brandstr;
4251         for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4252                 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4253                 (void) __cpuid_insn(cp);
4254                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4255                     cp);
4256                 switch (n) {
4257                 case 2:
4258                 case 3:
4259                 case 4:
4260                         /*
4261                          * Extract the brand string
4262                          */
4263                         *iptr++ = cp->cp_eax;
4264                         *iptr++ = cp->cp_ebx;
4265                         *iptr++ = cp->cp_ecx;
4266                         *iptr++ = cp->cp_edx;
4267                         break;
4268                 case 5:
4269                         switch (cpi->cpi_vendor) {
4270                         case X86_VENDOR_AMD:
4271                                 /*
4272                                  * The Athlon and Duron were the first
4273                                  * parts to report the sizes of the
4274                                  * TLB for large pages. Before then,
4275                                  * we don't trust the data.
4276                                  */
4277                                 if (cpi->cpi_family < 6 ||
4278                                     (cpi->cpi_family == 6 &&
4279                                     cpi->cpi_model < 1))
4280                                         cp->cp_eax = 0;
4281                                 break;
4282                         default:
4283                                 break;
4284                         }
4285                         break;
4286                 case 6:
4287                         switch (cpi->cpi_vendor) {
4288                         case X86_VENDOR_AMD:
4289                                 /*
4290                                  * The Athlon and Duron were the first
4291                                  * AMD parts with L2 TLB's.
4292                                  * Before then, don't trust the data.
4293                                  */
4294                                 if (cpi->cpi_family < 6 ||
4295                                     cpi->cpi_family == 6 &&
4296                                     cpi->cpi_model < 1)
4297                                         cp->cp_eax = cp->cp_ebx = 0;
4298                                 /*
4299                                  * AMD Duron rev A0 reports L2
4300                                  * cache size incorrectly as 1K
4301                                  * when it is really 64K
4302                                  */
4303                                 if (cpi->cpi_family == 6 &&
4304                                     cpi->cpi_model == 3 &&
4305                                     cpi->cpi_step == 0) {
4306                                         cp->cp_ecx &= 0xffff;
4307                                         cp->cp_ecx |= 0x400000;
4308                                 }
4309                                 break;
4310                         case X86_VENDOR_Cyrix:  /* VIA C3 */
4311                                 /*
4312                                  * VIA C3 processors are a bit messed
4313                                  * up w.r.t. encoding cache sizes in %ecx
4314                                  */
4315                                 if (cpi->cpi_family != 6)
4316                                         break;
4317                                 /*
4318                                  * model 7 and 8 were incorrectly encoded
4319                                  *
4320                                  * xxx is model 8 really broken?
4321                                  */
4322                                 if (cpi->cpi_model == 7 ||
4323                                     cpi->cpi_model == 8)
4324                                         cp->cp_ecx =
4325                                             BITX(cp->cp_ecx, 31, 24) << 16 |
4326                                             BITX(cp->cp_ecx, 23, 16) << 12 |
4327                                             BITX(cp->cp_ecx, 15, 8) << 8 |
4328                                             BITX(cp->cp_ecx, 7, 0);
4329                                 /*
4330                                  * model 9 stepping 1 has wrong associativity
4331                                  */
4332                                 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4333                                         cp->cp_ecx |= 8 << 12;
4334                                 break;
4335                         case X86_VENDOR_Intel:
4336                                 /*
4337                                  * Extended L2 Cache features function.
4338                                  * First appeared on Prescott.
4339                                  */
4340                         default:
4341                                 break;
4342                         }
4343                         break;
4344                 default:
4345                         break;
4346                 }
4347         }
4348 
4349 pass2_done:
4350         cpi->cpi_pass = 2;
4351 }
4352 
4353 static const char *
4354 intel_cpubrand(const struct cpuid_info *cpi)
4355 {
4356         int i;
4357 
4358         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4359             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4360                 return ("i486");
4361 
4362         switch (cpi->cpi_family) {
4363         case 5:
4364                 return ("Intel Pentium(r)");
4365         case 6:
4366                 switch (cpi->cpi_model) {
4367                         uint_t celeron, xeon;
4368                         const struct cpuid_regs *cp;
4369                 case 0:
4370                 case 1:
4371                 case 2:
4372                         return ("Intel Pentium(r) Pro");
4373                 case 3:
4374                 case 4:
4375                         return ("Intel Pentium(r) II");
4376                 case 6:
4377                         return ("Intel Celeron(r)");
4378                 case 5:
4379                 case 7:
4380                         celeron = xeon = 0;
4381                         cp = &cpi->cpi_std[2];   /* cache info */
4382 
4383                         for (i = 1; i < 4; i++) {
4384                                 uint_t tmp;
4385 
4386                                 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4387                                 if (tmp == 0x40)
4388                                         celeron++;
4389                                 if (tmp >= 0x44 && tmp <= 0x45)
4390                                         xeon++;
4391                         }
4392 
4393                         for (i = 0; i < 2; i++) {
4394                                 uint_t tmp;
4395 
4396                                 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4397                                 if (tmp == 0x40)
4398                                         celeron++;
4399                                 else if (tmp >= 0x44 && tmp <= 0x45)
4400                                         xeon++;
4401                         }
4402 
4403                         for (i = 0; i < 4; i++) {
4404                                 uint_t tmp;
4405 
4406                                 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4407                                 if (tmp == 0x40)
4408                                         celeron++;
4409                                 else if (tmp >= 0x44 && tmp <= 0x45)
4410                                         xeon++;
4411                         }
4412 
4413                         for (i = 0; i < 4; i++) {
4414                                 uint_t tmp;
4415 
4416                                 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4417                                 if (tmp == 0x40)
4418                                         celeron++;
4419                                 else if (tmp >= 0x44 && tmp <= 0x45)
4420                                         xeon++;
4421                         }
4422 
4423                         if (celeron)
4424                                 return ("Intel Celeron(r)");
4425                         if (xeon)
4426                                 return (cpi->cpi_model == 5 ?
4427                                     "Intel Pentium(r) II Xeon(tm)" :
4428                                     "Intel Pentium(r) III Xeon(tm)");
4429                         return (cpi->cpi_model == 5 ?
4430                             "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4431                             "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4432                 default:
4433                         break;
4434                 }
4435         default:
4436                 break;
4437         }
4438 
4439         /* BrandID is present if the field is nonzero */
4440         if (cpi->cpi_brandid != 0) {
4441                 static const struct {
4442                         uint_t bt_bid;
4443                         const char *bt_str;
4444                 } brand_tbl[] = {
4445                         { 0x1,  "Intel(r) Celeron(r)" },
4446                         { 0x2,  "Intel(r) Pentium(r) III" },
4447                         { 0x3,  "Intel(r) Pentium(r) III Xeon(tm)" },
4448                         { 0x4,  "Intel(r) Pentium(r) III" },
4449                         { 0x6,  "Mobile Intel(r) Pentium(r) III" },
4450                         { 0x7,  "Mobile Intel(r) Celeron(r)" },
4451                         { 0x8,  "Intel(r) Pentium(r) 4" },
4452                         { 0x9,  "Intel(r) Pentium(r) 4" },
4453                         { 0xa,  "Intel(r) Celeron(r)" },
4454                         { 0xb,  "Intel(r) Xeon(tm)" },
4455                         { 0xc,  "Intel(r) Xeon(tm) MP" },
4456                         { 0xe,  "Mobile Intel(r) Pentium(r) 4" },
4457                         { 0xf,  "Mobile Intel(r) Celeron(r)" },
4458                         { 0x11, "Mobile Genuine Intel(r)" },
4459                         { 0x12, "Intel(r) Celeron(r) M" },
4460                         { 0x13, "Mobile Intel(r) Celeron(r)" },
4461                         { 0x14, "Intel(r) Celeron(r)" },
4462                         { 0x15, "Mobile Genuine Intel(r)" },
4463                         { 0x16, "Intel(r) Pentium(r) M" },
4464                         { 0x17, "Mobile Intel(r) Celeron(r)" }
4465                 };
4466                 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4467                 uint_t sgn;
4468 
4469                 sgn = (cpi->cpi_family << 8) |
4470                     (cpi->cpi_model << 4) | cpi->cpi_step;
4471 
4472                 for (i = 0; i < btblmax; i++)
4473                         if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4474                                 break;
4475                 if (i < btblmax) {
4476                         if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4477                                 return ("Intel(r) Celeron(r)");
4478                         if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4479                                 return ("Intel(r) Xeon(tm) MP");
4480                         if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4481                                 return ("Intel(r) Xeon(tm)");
4482                         return (brand_tbl[i].bt_str);
4483                 }
4484         }
4485 
4486         return (NULL);
4487 }
4488 
4489 static const char *
4490 amd_cpubrand(const struct cpuid_info *cpi)
4491 {
4492         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4493             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4494                 return ("i486 compatible");
4495 
4496         switch (cpi->cpi_family) {
4497         case 5:
4498                 switch (cpi->cpi_model) {
4499                 case 0:
4500                 case 1:
4501                 case 2:
4502                 case 3:
4503                 case 4:
4504                 case 5:
4505                         return ("AMD-K5(r)");
4506                 case 6:
4507                 case 7:
4508                         return ("AMD-K6(r)");
4509                 case 8:
4510                         return ("AMD-K6(r)-2");
4511                 case 9:
4512                         return ("AMD-K6(r)-III");
4513                 default:
4514                         return ("AMD (family 5)");
4515                 }
4516         case 6:
4517                 switch (cpi->cpi_model) {
4518                 case 1:
4519                         return ("AMD-K7(tm)");
4520                 case 0:
4521                 case 2:
4522                 case 4:
4523                         return ("AMD Athlon(tm)");
4524                 case 3:
4525                 case 7:
4526                         return ("AMD Duron(tm)");
4527                 case 6:
4528                 case 8:
4529                 case 10:
4530                         /*
4531                          * Use the L2 cache size to distinguish
4532                          */
4533                         return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4534                             "AMD Athlon(tm)" : "AMD Duron(tm)");
4535                 default:
4536                         return ("AMD (family 6)");
4537                 }
4538         default:
4539                 break;
4540         }
4541 
4542         if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4543             cpi->cpi_brandid != 0) {
4544                 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4545                 case 3:
4546                         return ("AMD Opteron(tm) UP 1xx");
4547                 case 4:
4548                         return ("AMD Opteron(tm) DP 2xx");
4549                 case 5:
4550                         return ("AMD Opteron(tm) MP 8xx");
4551                 default:
4552                         return ("AMD Opteron(tm)");
4553                 }
4554         }
4555 
4556         return (NULL);
4557 }
4558 
4559 static const char *
4560 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4561 {
4562         if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4563             cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4564             type == X86_TYPE_CYRIX_486)
4565                 return ("i486 compatible");
4566 
4567         switch (type) {
4568         case X86_TYPE_CYRIX_6x86:
4569                 return ("Cyrix 6x86");
4570         case X86_TYPE_CYRIX_6x86L:
4571                 return ("Cyrix 6x86L");
4572         case X86_TYPE_CYRIX_6x86MX:
4573                 return ("Cyrix 6x86MX");
4574         case X86_TYPE_CYRIX_GXm:
4575                 return ("Cyrix GXm");
4576         case X86_TYPE_CYRIX_MediaGX:
4577                 return ("Cyrix MediaGX");
4578         case X86_TYPE_CYRIX_MII:
4579                 return ("Cyrix M2");
4580         case X86_TYPE_VIA_CYRIX_III:
4581                 return ("VIA Cyrix M3");
4582         default:
4583                 /*
4584                  * Have another wild guess ..
4585                  */
4586                 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4587                         return ("Cyrix 5x86");
4588                 else if (cpi->cpi_family == 5) {
4589                         switch (cpi->cpi_model) {
4590                         case 2:
4591                                 return ("Cyrix 6x86");  /* Cyrix M1 */
4592                         case 4:
4593                                 return ("Cyrix MediaGX");
4594                         default:
4595                                 break;
4596                         }
4597                 } else if (cpi->cpi_family == 6) {
4598                         switch (cpi->cpi_model) {
4599                         case 0:
4600                                 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4601                         case 5:
4602                         case 6:
4603                         case 7:
4604                         case 8:
4605                         case 9:
4606                                 return ("VIA C3");
4607                         default:
4608                                 break;
4609                         }
4610                 }
4611                 break;
4612         }
4613         return (NULL);
4614 }
4615 
4616 /*
4617  * This only gets called in the case that the CPU extended
4618  * feature brand string (0x80000002, 0x80000003, 0x80000004)
4619  * aren't available, or contain null bytes for some reason.
4620  */
4621 static void
4622 fabricate_brandstr(struct cpuid_info *cpi)
4623 {
4624         const char *brand = NULL;
4625 
4626         switch (cpi->cpi_vendor) {
4627         case X86_VENDOR_Intel:
4628                 brand = intel_cpubrand(cpi);
4629                 break;
4630         case X86_VENDOR_AMD:
4631                 brand = amd_cpubrand(cpi);
4632                 break;
4633         case X86_VENDOR_Cyrix:
4634                 brand = cyrix_cpubrand(cpi, x86_type);
4635                 break;
4636         case X86_VENDOR_NexGen:
4637                 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4638                         brand = "NexGen Nx586";
4639                 break;
4640         case X86_VENDOR_Centaur:
4641                 if (cpi->cpi_family == 5)
4642                         switch (cpi->cpi_model) {
4643                         case 4:
4644                                 brand = "Centaur C6";
4645                                 break;
4646                         case 8:
4647                                 brand = "Centaur C2";
4648                                 break;
4649                         case 9:
4650                                 brand = "Centaur C3";
4651                                 break;
4652                         default:
4653                                 break;
4654                         }
4655                 break;
4656         case X86_VENDOR_Rise:
4657                 if (cpi->cpi_family == 5 &&
4658                     (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4659                         brand = "Rise mP6";
4660                 break;
4661         case X86_VENDOR_SiS:
4662                 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4663                         brand = "SiS 55x";
4664                 break;
4665         case X86_VENDOR_TM:
4666                 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4667                         brand = "Transmeta Crusoe TM3x00 or TM5x00";
4668                 break;
4669         case X86_VENDOR_NSC:
4670         case X86_VENDOR_UMC:
4671         default:
4672                 break;
4673         }
4674         if (brand) {
4675                 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4676                 return;
4677         }
4678 
4679         /*
4680          * If all else fails ...
4681          */
4682         (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4683             "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4684             cpi->cpi_model, cpi->cpi_step);
4685 }
4686 
4687 /*
4688  * This routine is called just after kernel memory allocation
4689  * becomes available on cpu0, and as part of mp_startup() on
4690  * the other cpus.
4691  *
4692  * Fixup the brand string, and collect any information from cpuid
4693  * that requires dynamically allocated storage to represent.
4694  */
4695 /*ARGSUSED*/
4696 void
4697 cpuid_pass3(cpu_t *cpu)
4698 {
4699         int     i, max, shft, level, size;
4700         struct cpuid_regs regs;
4701         struct cpuid_regs *cp;
4702         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4703 
4704         ASSERT(cpi->cpi_pass == 2);
4705 
4706         /*
4707          * Deterministic cache parameters
4708          *
4709          * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4710          * values that are present are currently defined to be the same. This
4711          * means we can use the same logic to parse it as long as we use the
4712          * appropriate leaf to get the data. If you're updating this, make sure
4713          * you're careful about which vendor supports which aspect.
4714          *
4715          * Take this opportunity to detect the number of threads sharing the
4716          * last level cache, and construct a corresponding cache id. The
4717          * respective cpuid_info members are initialized to the default case of
4718          * "no last level cache sharing".
4719          */
4720         cpi->cpi_ncpu_shr_last_cache = 1;
4721         cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4722 
4723         if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4724             (cpi->cpi_vendor == X86_VENDOR_AMD &&
4725             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4726             is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4727                 uint32_t leaf;
4728 
4729                 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4730                         leaf = 4;
4731                 } else {
4732                         leaf = CPUID_LEAF_EXT_1d;
4733                 }
4734 
4735                 /*
4736                  * Find the # of elements (size) returned by the leaf and along
4737                  * the way detect last level cache sharing details.
4738                  */
4739                 bzero(&regs, sizeof (regs));
4740                 cp = &regs;
4741                 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4742                         cp->cp_eax = leaf;
4743                         cp->cp_ecx = i;
4744 
4745                         (void) __cpuid_insn(cp);
4746 
4747                         if (CPI_CACHE_TYPE(cp) == 0)
4748                                 break;
4749                         level = CPI_CACHE_LVL(cp);
4750                         if (level > max) {
4751                                 max = level;
4752                                 cpi->cpi_ncpu_shr_last_cache =
4753                                     CPI_NTHR_SHR_CACHE(cp) + 1;
4754                         }
4755                 }
4756                 cpi->cpi_cache_leaf_size = size = i;
4757 
4758                 /*
4759                  * Allocate the cpi_cache_leaves array. The first element
4760                  * references the regs for the corresponding leaf with %ecx set
4761                  * to 0. This was gathered in cpuid_pass2().
4762                  */
4763                 if (size > 0) {
4764                         cpi->cpi_cache_leaves =
4765                             kmem_alloc(size * sizeof (cp), KM_SLEEP);
4766                         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4767                                 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4768                         } else {
4769                                 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4770                         }
4771 
4772                         /*
4773                          * Allocate storage to hold the additional regs
4774                          * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4775                          *
4776                          * The regs for the leaf, %ecx == 0 has already
4777                          * been allocated as indicated above.
4778                          */
4779                         for (i = 1; i < size; i++) {
4780                                 cp = cpi->cpi_cache_leaves[i] =
4781                                     kmem_zalloc(sizeof (regs), KM_SLEEP);
4782                                 cp->cp_eax = leaf;
4783                                 cp->cp_ecx = i;
4784 
4785                                 (void) __cpuid_insn(cp);
4786                         }
4787                 }
4788                 /*
4789                  * Determine the number of bits needed to represent
4790                  * the number of CPUs sharing the last level cache.
4791                  *
4792                  * Shift off that number of bits from the APIC id to
4793                  * derive the cache id.
4794                  */
4795                 shft = 0;
4796                 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4797                         shft++;
4798                 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4799         }
4800 
4801         /*
4802          * Now fixup the brand string
4803          */
4804         if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4805                 fabricate_brandstr(cpi);
4806         } else {
4807 
4808                 /*
4809                  * If we successfully extracted a brand string from the cpuid
4810                  * instruction, clean it up by removing leading spaces and
4811                  * similar junk.
4812                  */
4813                 if (cpi->cpi_brandstr[0]) {
4814                         size_t maxlen = sizeof (cpi->cpi_brandstr);
4815                         char *src, *dst;
4816 
4817                         dst = src = (char *)cpi->cpi_brandstr;
4818                         src[maxlen - 1] = '\0';
4819                         /*
4820                          * strip leading spaces
4821                          */
4822                         while (*src == ' ')
4823                                 src++;
4824                         /*
4825                          * Remove any 'Genuine' or "Authentic" prefixes
4826                          */
4827                         if (strncmp(src, "Genuine ", 8) == 0)
4828                                 src += 8;
4829                         if (strncmp(src, "Authentic ", 10) == 0)
4830                                 src += 10;
4831 
4832                         /*
4833                          * Now do an in-place copy.
4834                          * Map (R) to (r) and (TM) to (tm).
4835                          * The era of teletypes is long gone, and there's
4836                          * -really- no need to shout.
4837                          */
4838                         while (*src != '\0') {
4839                                 if (src[0] == '(') {
4840                                         if (strncmp(src + 1, "R)", 2) == 0) {
4841                                                 (void) strncpy(dst, "(r)", 3);
4842                                                 src += 3;
4843                                                 dst += 3;
4844                                                 continue;
4845                                         }
4846                                         if (strncmp(src + 1, "TM)", 3) == 0) {
4847                                                 (void) strncpy(dst, "(tm)", 4);
4848                                                 src += 4;
4849                                                 dst += 4;
4850                                                 continue;
4851                                         }
4852                                 }
4853                                 *dst++ = *src++;
4854                         }
4855                         *dst = '\0';
4856 
4857                         /*
4858                          * Finally, remove any trailing spaces
4859                          */
4860                         while (--dst > cpi->cpi_brandstr)
4861                                 if (*dst == ' ')
4862                                         *dst = '\0';
4863                                 else
4864                                         break;
4865                 } else
4866                         fabricate_brandstr(cpi);
4867         }
4868         cpi->cpi_pass = 3;
4869 }
4870 
4871 /*
4872  * This routine is called out of bind_hwcap() much later in the life
4873  * of the kernel (post_startup()).  The job of this routine is to resolve
4874  * the hardware feature support and kernel support for those features into
4875  * what we're actually going to tell applications via the aux vector.
4876  */
4877 void
4878 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4879 {
4880         struct cpuid_info *cpi;
4881         uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4882 
4883         if (cpu == NULL)
4884                 cpu = CPU;
4885         cpi = cpu->cpu_m.mcpu_cpi;
4886 
4887         ASSERT(cpi->cpi_pass == 3);
4888 
4889         if (cpi->cpi_maxeax >= 1) {
4890                 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4891                 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4892                 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4893 
4894                 *edx = CPI_FEATURES_EDX(cpi);
4895                 *ecx = CPI_FEATURES_ECX(cpi);
4896                 *ebx = CPI_FEATURES_7_0_EBX(cpi);
4897 
4898                 /*
4899                  * [these require explicit kernel support]
4900                  */
4901                 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4902                         *edx &= ~CPUID_INTC_EDX_SEP;
4903 
4904                 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4905                         *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4906                 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4907                         *edx &= ~CPUID_INTC_EDX_SSE2;
4908 
4909                 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4910                         *edx &= ~CPUID_INTC_EDX_HTT;
4911 
4912                 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4913                         *ecx &= ~CPUID_INTC_ECX_SSE3;
4914 
4915                 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4916                         *ecx &= ~CPUID_INTC_ECX_SSSE3;
4917                 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4918                         *ecx &= ~CPUID_INTC_ECX_SSE4_1;
4919                 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4920                         *ecx &= ~CPUID_INTC_ECX_SSE4_2;
4921                 if (!is_x86_feature(x86_featureset, X86FSET_AES))
4922                         *ecx &= ~CPUID_INTC_ECX_AES;
4923                 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4924                         *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4925                 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4926                         *ecx &= ~(CPUID_INTC_ECX_XSAVE |
4927                             CPUID_INTC_ECX_OSXSAVE);
4928                 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4929                         *ecx &= ~CPUID_INTC_ECX_AVX;
4930                 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4931                         *ecx &= ~CPUID_INTC_ECX_F16C;
4932                 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4933                         *ecx &= ~CPUID_INTC_ECX_FMA;
4934                 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4935                         *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4936                 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4937                         *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4938                 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4939                         *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4940                 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4941                         *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4942                 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4943                         *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4944 
4945                 /*
4946                  * [no explicit support required beyond x87 fp context]
4947                  */
4948                 if (!fpu_exists)
4949                         *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4950 
4951                 /*
4952                  * Now map the supported feature vector to things that we
4953                  * think userland will care about.
4954                  */
4955                 if (*edx & CPUID_INTC_EDX_SEP)
4956                         hwcap_flags |= AV_386_SEP;
4957                 if (*edx & CPUID_INTC_EDX_SSE)
4958                         hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4959                 if (*edx & CPUID_INTC_EDX_SSE2)
4960                         hwcap_flags |= AV_386_SSE2;
4961                 if (*ecx & CPUID_INTC_ECX_SSE3)
4962                         hwcap_flags |= AV_386_SSE3;
4963                 if (*ecx & CPUID_INTC_ECX_SSSE3)
4964                         hwcap_flags |= AV_386_SSSE3;
4965                 if (*ecx & CPUID_INTC_ECX_SSE4_1)
4966                         hwcap_flags |= AV_386_SSE4_1;
4967                 if (*ecx & CPUID_INTC_ECX_SSE4_2)
4968                         hwcap_flags |= AV_386_SSE4_2;
4969                 if (*ecx & CPUID_INTC_ECX_MOVBE)
4970                         hwcap_flags |= AV_386_MOVBE;
4971                 if (*ecx & CPUID_INTC_ECX_AES)
4972                         hwcap_flags |= AV_386_AES;
4973                 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
4974                         hwcap_flags |= AV_386_PCLMULQDQ;
4975                 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
4976                     (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
4977                         hwcap_flags |= AV_386_XSAVE;
4978 
4979                         if (*ecx & CPUID_INTC_ECX_AVX) {
4980                                 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
4981                                 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
4982 
4983                                 hwcap_flags |= AV_386_AVX;
4984                                 if (*ecx & CPUID_INTC_ECX_F16C)
4985                                         hwcap_flags_2 |= AV_386_2_F16C;
4986                                 if (*ecx & CPUID_INTC_ECX_FMA)
4987                                         hwcap_flags_2 |= AV_386_2_FMA;
4988 
4989                                 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
4990                                         hwcap_flags_2 |= AV_386_2_BMI1;
4991                                 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
4992                                         hwcap_flags_2 |= AV_386_2_BMI2;
4993                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
4994                                         hwcap_flags_2 |= AV_386_2_AVX2;
4995                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
4996                                         hwcap_flags_2 |= AV_386_2_AVX512F;
4997                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
4998                                         hwcap_flags_2 |= AV_386_2_AVX512DQ;
4999                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5000                                         hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5001                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5002                                         hwcap_flags_2 |= AV_386_2_AVX512PF;
5003                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5004                                         hwcap_flags_2 |= AV_386_2_AVX512ER;
5005                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5006                                         hwcap_flags_2 |= AV_386_2_AVX512CD;
5007                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5008                                         hwcap_flags_2 |= AV_386_2_AVX512BW;
5009                                 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5010                                         hwcap_flags_2 |= AV_386_2_AVX512VL;
5011 
5012                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5013                                         hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5014                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5015                                         hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5016                                 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5017                                         hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5018 
5019                                 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5020                                         hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5021                                 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5022                                         hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5023                         }
5024                 }
5025                 if (*ecx & CPUID_INTC_ECX_VMX)
5026                         hwcap_flags |= AV_386_VMX;
5027                 if (*ecx & CPUID_INTC_ECX_POPCNT)
5028                         hwcap_flags |= AV_386_POPCNT;
5029                 if (*edx & CPUID_INTC_EDX_FPU)
5030                         hwcap_flags |= AV_386_FPU;
5031                 if (*edx & CPUID_INTC_EDX_MMX)
5032                         hwcap_flags |= AV_386_MMX;
5033 
5034                 if (*edx & CPUID_INTC_EDX_TSC)
5035                         hwcap_flags |= AV_386_TSC;
5036                 if (*edx & CPUID_INTC_EDX_CX8)
5037                         hwcap_flags |= AV_386_CX8;
5038                 if (*edx & CPUID_INTC_EDX_CMOV)
5039                         hwcap_flags |= AV_386_CMOV;
5040                 if (*ecx & CPUID_INTC_ECX_CX16)
5041                         hwcap_flags |= AV_386_CX16;
5042 
5043                 if (*ecx & CPUID_INTC_ECX_RDRAND)
5044                         hwcap_flags_2 |= AV_386_2_RDRAND;
5045                 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5046                         hwcap_flags_2 |= AV_386_2_ADX;
5047                 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5048                         hwcap_flags_2 |= AV_386_2_RDSEED;
5049                 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5050                         hwcap_flags_2 |= AV_386_2_SHA;
5051                 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5052                         hwcap_flags_2 |= AV_386_2_FSGSBASE;
5053                 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5054                         hwcap_flags_2 |= AV_386_2_CLWB;
5055                 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5056                         hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5057 
5058         }
5059         /*
5060          * Check a few miscilaneous features.
5061          */
5062         if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5063                 hwcap_flags_2 |= AV_386_2_CLZERO;
5064 
5065         if (cpi->cpi_xmaxeax < 0x80000001)
5066                 goto pass4_done;
5067 
5068         switch (cpi->cpi_vendor) {
5069                 struct cpuid_regs cp;
5070                 uint32_t *edx, *ecx;
5071 
5072         case X86_VENDOR_Intel:
5073                 /*
5074                  * Seems like Intel duplicated what we necessary
5075                  * here to make the initial crop of 64-bit OS's work.
5076                  * Hopefully, those are the only "extended" bits
5077                  * they'll add.
5078                  */
5079                 /*FALLTHROUGH*/
5080 
5081         case X86_VENDOR_AMD:
5082                 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5083                 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5084 
5085                 *edx = CPI_FEATURES_XTD_EDX(cpi);
5086                 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5087 
5088                 /*
5089                  * [these features require explicit kernel support]
5090                  */
5091                 switch (cpi->cpi_vendor) {
5092                 case X86_VENDOR_Intel:
5093                         if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5094                                 *edx &= ~CPUID_AMD_EDX_TSCP;
5095                         break;
5096 
5097                 case X86_VENDOR_AMD:
5098                         if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5099                                 *edx &= ~CPUID_AMD_EDX_TSCP;
5100                         if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5101                                 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5102                         break;
5103 
5104                 default:
5105                         break;
5106                 }
5107 
5108                 /*
5109                  * [no explicit support required beyond
5110                  * x87 fp context and exception handlers]
5111                  */
5112                 if (!fpu_exists)
5113                         *edx &= ~(CPUID_AMD_EDX_MMXamd |
5114                             CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5115 
5116                 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5117                         *edx &= ~CPUID_AMD_EDX_NX;
5118 #if !defined(__amd64)
5119                 *edx &= ~CPUID_AMD_EDX_LM;
5120 #endif
5121                 /*
5122                  * Now map the supported feature vector to
5123                  * things that we think userland will care about.
5124                  */
5125 #if defined(__amd64)
5126                 if (*edx & CPUID_AMD_EDX_SYSC)
5127                         hwcap_flags |= AV_386_AMD_SYSC;
5128 #endif
5129                 if (*edx & CPUID_AMD_EDX_MMXamd)
5130                         hwcap_flags |= AV_386_AMD_MMX;
5131                 if (*edx & CPUID_AMD_EDX_3DNow)
5132                         hwcap_flags |= AV_386_AMD_3DNow;
5133                 if (*edx & CPUID_AMD_EDX_3DNowx)
5134                         hwcap_flags |= AV_386_AMD_3DNowx;
5135                 if (*ecx & CPUID_AMD_ECX_SVM)
5136                         hwcap_flags |= AV_386_AMD_SVM;
5137 
5138                 switch (cpi->cpi_vendor) {
5139                 case X86_VENDOR_AMD:
5140                         if (*edx & CPUID_AMD_EDX_TSCP)
5141                                 hwcap_flags |= AV_386_TSCP;
5142                         if (*ecx & CPUID_AMD_ECX_AHF64)
5143                                 hwcap_flags |= AV_386_AHF;
5144                         if (*ecx & CPUID_AMD_ECX_SSE4A)
5145                                 hwcap_flags |= AV_386_AMD_SSE4A;
5146                         if (*ecx & CPUID_AMD_ECX_LZCNT)
5147                                 hwcap_flags |= AV_386_AMD_LZCNT;
5148                         if (*ecx & CPUID_AMD_ECX_MONITORX)
5149                                 hwcap_flags_2 |= AV_386_2_MONITORX;
5150                         break;
5151 
5152                 case X86_VENDOR_Intel:
5153                         if (*edx & CPUID_AMD_EDX_TSCP)
5154                                 hwcap_flags |= AV_386_TSCP;
5155                         if (*ecx & CPUID_AMD_ECX_LZCNT)
5156                                 hwcap_flags |= AV_386_AMD_LZCNT;
5157                         /*
5158                          * Aarrgh.
5159                          * Intel uses a different bit in the same word.
5160                          */
5161                         if (*ecx & CPUID_INTC_ECX_AHF64)
5162                                 hwcap_flags |= AV_386_AHF;
5163                         break;
5164 
5165                 default:
5166                         break;
5167                 }
5168                 break;
5169 
5170         case X86_VENDOR_TM:
5171                 cp.cp_eax = 0x80860001;
5172                 (void) __cpuid_insn(&cp);
5173                 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5174                 break;
5175 
5176         default:
5177                 break;
5178         }
5179 
5180 pass4_done:
5181         cpi->cpi_pass = 4;
5182         if (hwcap_out != NULL) {
5183                 hwcap_out[0] = hwcap_flags;
5184                 hwcap_out[1] = hwcap_flags_2;
5185         }
5186 }
5187 
5188 
5189 /*
5190  * Simulate the cpuid instruction using the data we previously
5191  * captured about this CPU.  We try our best to return the truth
5192  * about the hardware, independently of kernel support.
5193  */
5194 uint32_t
5195 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5196 {
5197         struct cpuid_info *cpi;
5198         struct cpuid_regs *xcp;
5199 
5200         if (cpu == NULL)
5201                 cpu = CPU;
5202         cpi = cpu->cpu_m.mcpu_cpi;
5203 
5204         ASSERT(cpuid_checkpass(cpu, 3));
5205 
5206         /*
5207          * CPUID data is cached in two separate places: cpi_std for standard
5208          * CPUID leaves , and cpi_extd for extended CPUID leaves.
5209          */
5210         if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5211                 xcp = &cpi->cpi_std[cp->cp_eax];
5212         } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5213             cp->cp_eax <= cpi->cpi_xmaxeax &&
5214             cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5215                 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5216         } else {
5217                 /*
5218                  * The caller is asking for data from an input parameter which
5219                  * the kernel has not cached.  In this case we go fetch from
5220                  * the hardware and return the data directly to the user.
5221                  */
5222                 return (__cpuid_insn(cp));
5223         }
5224 
5225         cp->cp_eax = xcp->cp_eax;
5226         cp->cp_ebx = xcp->cp_ebx;
5227         cp->cp_ecx = xcp->cp_ecx;
5228         cp->cp_edx = xcp->cp_edx;
5229         return (cp->cp_eax);
5230 }
5231 
5232 int
5233 cpuid_checkpass(cpu_t *cpu, int pass)
5234 {
5235         return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5236             cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5237 }
5238 
5239 int
5240 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5241 {
5242         ASSERT(cpuid_checkpass(cpu, 3));
5243 
5244         return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5245 }
5246 
5247 int
5248 cpuid_is_cmt(cpu_t *cpu)
5249 {
5250         if (cpu == NULL)
5251                 cpu = CPU;
5252 
5253         ASSERT(cpuid_checkpass(cpu, 1));
5254 
5255         return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5256 }
5257 
5258 /*
5259  * AMD and Intel both implement the 64-bit variant of the syscall
5260  * instruction (syscallq), so if there's -any- support for syscall,
5261  * cpuid currently says "yes, we support this".
5262  *
5263  * However, Intel decided to -not- implement the 32-bit variant of the
5264  * syscall instruction, so we provide a predicate to allow our caller
5265  * to test that subtlety here.
5266  *
5267  * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5268  *      even in the case where the hardware would in fact support it.
5269  */
5270 /*ARGSUSED*/
5271 int
5272 cpuid_syscall32_insn(cpu_t *cpu)
5273 {
5274         ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5275 
5276 #if !defined(__xpv)
5277         if (cpu == NULL)
5278                 cpu = CPU;
5279 
5280         /*CSTYLED*/
5281         {
5282                 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5283 
5284                 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5285                     cpi->cpi_xmaxeax >= 0x80000001 &&
5286                     (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5287                         return (1);
5288         }
5289 #endif
5290         return (0);
5291 }
5292 
5293 int
5294 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5295 {
5296         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5297 
5298         static const char fmt[] =
5299             "x86 (%s %X family %d model %d step %d clock %d MHz)";
5300         static const char fmt_ht[] =
5301             "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5302 
5303         ASSERT(cpuid_checkpass(cpu, 1));
5304 
5305         if (cpuid_is_cmt(cpu))
5306                 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5307                     cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5308                     cpi->cpi_family, cpi->cpi_model,
5309                     cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5310         return (snprintf(s, n, fmt,
5311             cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5312             cpi->cpi_family, cpi->cpi_model,
5313             cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5314 }
5315 
5316 const char *
5317 cpuid_getvendorstr(cpu_t *cpu)
5318 {
5319         ASSERT(cpuid_checkpass(cpu, 1));
5320         return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5321 }
5322 
5323 uint_t
5324 cpuid_getvendor(cpu_t *cpu)
5325 {
5326         ASSERT(cpuid_checkpass(cpu, 1));
5327         return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5328 }
5329 
5330 uint_t
5331 cpuid_getfamily(cpu_t *cpu)
5332 {
5333         ASSERT(cpuid_checkpass(cpu, 1));
5334         return (cpu->cpu_m.mcpu_cpi->cpi_family);
5335 }
5336 
5337 uint_t
5338 cpuid_getmodel(cpu_t *cpu)
5339 {
5340         ASSERT(cpuid_checkpass(cpu, 1));
5341         return (cpu->cpu_m.mcpu_cpi->cpi_model);
5342 }
5343 
5344 uint_t
5345 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5346 {
5347         ASSERT(cpuid_checkpass(cpu, 1));
5348         return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5349 }
5350 
5351 uint_t
5352 cpuid_get_ncore_per_chip(cpu_t *cpu)
5353 {
5354         ASSERT(cpuid_checkpass(cpu, 1));
5355         return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5356 }
5357 
5358 uint_t
5359 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5360 {
5361         ASSERT(cpuid_checkpass(cpu, 2));
5362         return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5363 }
5364 
5365 id_t
5366 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5367 {
5368         ASSERT(cpuid_checkpass(cpu, 2));
5369         return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5370 }
5371 
5372 uint_t
5373 cpuid_getstep(cpu_t *cpu)
5374 {
5375         ASSERT(cpuid_checkpass(cpu, 1));
5376         return (cpu->cpu_m.mcpu_cpi->cpi_step);
5377 }
5378 
5379 uint_t
5380 cpuid_getsig(struct cpu *cpu)
5381 {
5382         ASSERT(cpuid_checkpass(cpu, 1));
5383         return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5384 }
5385 
5386 uint32_t
5387 cpuid_getchiprev(struct cpu *cpu)
5388 {
5389         ASSERT(cpuid_checkpass(cpu, 1));
5390         return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5391 }
5392 
5393 const char *
5394 cpuid_getchiprevstr(struct cpu *cpu)
5395 {
5396         ASSERT(cpuid_checkpass(cpu, 1));
5397         return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5398 }
5399 
5400 uint32_t
5401 cpuid_getsockettype(struct cpu *cpu)
5402 {
5403         ASSERT(cpuid_checkpass(cpu, 1));
5404         return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5405 }
5406 
5407 const char *
5408 cpuid_getsocketstr(cpu_t *cpu)
5409 {
5410         static const char *socketstr = NULL;
5411         struct cpuid_info *cpi;
5412 
5413         ASSERT(cpuid_checkpass(cpu, 1));
5414         cpi = cpu->cpu_m.mcpu_cpi;
5415 
5416         /* Assume that socket types are the same across the system */
5417         if (socketstr == NULL)
5418                 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5419                     cpi->cpi_model, cpi->cpi_step);
5420 
5421 
5422         return (socketstr);
5423 }
5424 
5425 int
5426 cpuid_get_chipid(cpu_t *cpu)
5427 {
5428         ASSERT(cpuid_checkpass(cpu, 1));
5429 
5430         if (cpuid_is_cmt(cpu))
5431                 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5432         return (cpu->cpu_id);
5433 }
5434 
5435 id_t
5436 cpuid_get_coreid(cpu_t *cpu)
5437 {
5438         ASSERT(cpuid_checkpass(cpu, 1));
5439         return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5440 }
5441 
5442 int
5443 cpuid_get_pkgcoreid(cpu_t *cpu)
5444 {
5445         ASSERT(cpuid_checkpass(cpu, 1));
5446         return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5447 }
5448 
5449 int
5450 cpuid_get_clogid(cpu_t *cpu)
5451 {
5452         ASSERT(cpuid_checkpass(cpu, 1));
5453         return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5454 }
5455 
5456 int
5457 cpuid_get_cacheid(cpu_t *cpu)
5458 {
5459         ASSERT(cpuid_checkpass(cpu, 1));
5460         return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5461 }
5462 
5463 uint_t
5464 cpuid_get_procnodeid(cpu_t *cpu)
5465 {
5466         ASSERT(cpuid_checkpass(cpu, 1));
5467         return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5468 }
5469 
5470 uint_t
5471 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5472 {
5473         ASSERT(cpuid_checkpass(cpu, 1));
5474         return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5475 }
5476 
5477 uint_t
5478 cpuid_get_compunitid(cpu_t *cpu)
5479 {
5480         ASSERT(cpuid_checkpass(cpu, 1));
5481         return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5482 }
5483 
5484 uint_t
5485 cpuid_get_cores_per_compunit(cpu_t *cpu)
5486 {
5487         ASSERT(cpuid_checkpass(cpu, 1));
5488         return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5489 }
5490 
5491 /*ARGSUSED*/
5492 int
5493 cpuid_have_cr8access(cpu_t *cpu)
5494 {
5495 #if defined(__amd64)
5496         return (1);
5497 #else
5498         struct cpuid_info *cpi;
5499 
5500         ASSERT(cpu != NULL);
5501         cpi = cpu->cpu_m.mcpu_cpi;
5502         if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5503             (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5504                 return (1);
5505         return (0);
5506 #endif
5507 }
5508 
5509 uint32_t
5510 cpuid_get_apicid(cpu_t *cpu)
5511 {
5512         ASSERT(cpuid_checkpass(cpu, 1));
5513         if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5514                 return (UINT32_MAX);
5515         } else {
5516                 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5517         }
5518 }
5519 
5520 void
5521 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5522 {
5523         struct cpuid_info *cpi;
5524 
5525         if (cpu == NULL)
5526                 cpu = CPU;
5527         cpi = cpu->cpu_m.mcpu_cpi;
5528 
5529         ASSERT(cpuid_checkpass(cpu, 1));
5530 
5531         if (pabits)
5532                 *pabits = cpi->cpi_pabits;
5533         if (vabits)
5534                 *vabits = cpi->cpi_vabits;
5535 }
5536 
5537 size_t
5538 cpuid_get_xsave_size()
5539 {
5540         return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5541             sizeof (struct xsave_state)));
5542 }
5543 
5544 /*
5545  * Return true if the CPUs on this system require 'pointer clearing' for the
5546  * floating point error pointer exception handling. In the past, this has been
5547  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5548  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5549  * feature bit and is reflected in the cpi_fp_amd_save member.
5550  */
5551 boolean_t
5552 cpuid_need_fp_excp_handling()
5553 {
5554         return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5555             cpuid_info0.cpi_fp_amd_save != 0);
5556 }
5557 
5558 /*
5559  * Returns the number of data TLB entries for a corresponding
5560  * pagesize.  If it can't be computed, or isn't known, the
5561  * routine returns zero.  If you ask about an architecturally
5562  * impossible pagesize, the routine will panic (so that the
5563  * hat implementor knows that things are inconsistent.)
5564  */
5565 uint_t
5566 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5567 {
5568         struct cpuid_info *cpi;
5569         uint_t dtlb_nent = 0;
5570 
5571         if (cpu == NULL)
5572                 cpu = CPU;
5573         cpi = cpu->cpu_m.mcpu_cpi;
5574 
5575         ASSERT(cpuid_checkpass(cpu, 1));
5576 
5577         /*
5578          * Check the L2 TLB info
5579          */
5580         if (cpi->cpi_xmaxeax >= 0x80000006) {
5581                 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5582 
5583                 switch (pagesize) {
5584 
5585                 case 4 * 1024:
5586                         /*
5587                          * All zero in the top 16 bits of the register
5588                          * indicates a unified TLB. Size is in low 16 bits.
5589                          */
5590                         if ((cp->cp_ebx & 0xffff0000) == 0)
5591                                 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5592                         else
5593                                 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5594                         break;
5595 
5596                 case 2 * 1024 * 1024:
5597                         if ((cp->cp_eax & 0xffff0000) == 0)
5598                                 dtlb_nent = cp->cp_eax & 0x0000ffff;
5599                         else
5600                                 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5601                         break;
5602 
5603                 default:
5604                         panic("unknown L2 pagesize");
5605                         /*NOTREACHED*/
5606                 }
5607         }
5608 
5609         if (dtlb_nent != 0)
5610                 return (dtlb_nent);
5611 
5612         /*
5613          * No L2 TLB support for this size, try L1.
5614          */
5615         if (cpi->cpi_xmaxeax >= 0x80000005) {
5616                 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5617 
5618                 switch (pagesize) {
5619                 case 4 * 1024:
5620                         dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5621                         break;
5622                 case 2 * 1024 * 1024:
5623                         dtlb_nent = BITX(cp->cp_eax, 23, 16);
5624                         break;
5625                 default:
5626                         panic("unknown L1 d-TLB pagesize");
5627                         /*NOTREACHED*/
5628                 }
5629         }
5630 
5631         return (dtlb_nent);
5632 }
5633 
5634 /*
5635  * Return 0 if the erratum is not present or not applicable, positive
5636  * if it is, and negative if the status of the erratum is unknown.
5637  *
5638  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5639  * Processors" #25759, Rev 3.57, August 2005
5640  */
5641 int
5642 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5643 {
5644         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5645         uint_t eax;
5646 
5647         /*
5648          * Bail out if this CPU isn't an AMD CPU, or if it's
5649          * a legacy (32-bit) AMD CPU.
5650          */
5651         if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5652             cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5653             cpi->cpi_family == 6) {
5654                 return (0);
5655         }
5656 
5657         eax = cpi->cpi_std[1].cp_eax;
5658 
5659 #define SH_B0(eax)      (eax == 0xf40 || eax == 0xf50)
5660 #define SH_B3(eax)      (eax == 0xf51)
5661 #define B(eax)          (SH_B0(eax) || SH_B3(eax))
5662 
5663 #define SH_C0(eax)      (eax == 0xf48 || eax == 0xf58)
5664 
5665 #define SH_CG(eax)      (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5666 #define DH_CG(eax)      (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5667 #define CH_CG(eax)      (eax == 0xf82 || eax == 0xfb2)
5668 #define CG(eax)         (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5669 
5670 #define SH_D0(eax)      (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5671 #define DH_D0(eax)      (eax == 0x10fc0 || eax == 0x10ff0)
5672 #define CH_D0(eax)      (eax == 0x10f80 || eax == 0x10fb0)
5673 #define D0(eax)         (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5674 
5675 #define SH_E0(eax)      (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5676 #define JH_E1(eax)      (eax == 0x20f10)        /* JH8_E0 had 0x20f30 */
5677 #define DH_E3(eax)      (eax == 0x20fc0 || eax == 0x20ff0)
5678 #define SH_E4(eax)      (eax == 0x20f51 || eax == 0x20f71)
5679 #define BH_E4(eax)      (eax == 0x20fb1)
5680 #define SH_E5(eax)      (eax == 0x20f42)
5681 #define DH_E6(eax)      (eax == 0x20ff2 || eax == 0x20fc2)
5682 #define JH_E6(eax)      (eax == 0x20f12 || eax == 0x20f32)
5683 #define EX(eax)         (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5684                             SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5685                             DH_E6(eax) || JH_E6(eax))
5686 
5687 #define DR_AX(eax)      (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5688 #define DR_B0(eax)      (eax == 0x100f20)
5689 #define DR_B1(eax)      (eax == 0x100f21)
5690 #define DR_BA(eax)      (eax == 0x100f2a)
5691 #define DR_B2(eax)      (eax == 0x100f22)
5692 #define DR_B3(eax)      (eax == 0x100f23)
5693 #define RB_C0(eax)      (eax == 0x100f40)
5694 
5695         switch (erratum) {
5696         case 1:
5697                 return (cpi->cpi_family < 0x10);
5698         case 51:        /* what does the asterisk mean? */
5699                 return (B(eax) || SH_C0(eax) || CG(eax));
5700         case 52:
5701                 return (B(eax));
5702         case 57:
5703                 return (cpi->cpi_family <= 0x11);
5704         case 58:
5705                 return (B(eax));
5706         case 60:
5707                 return (cpi->cpi_family <= 0x11);
5708         case 61:
5709         case 62:
5710         case 63:
5711         case 64:
5712         case 65:
5713         case 66:
5714         case 68:
5715         case 69:
5716         case 70:
5717         case 71:
5718                 return (B(eax));
5719         case 72:
5720                 return (SH_B0(eax));
5721         case 74:
5722                 return (B(eax));
5723         case 75:
5724                 return (cpi->cpi_family < 0x10);
5725         case 76:
5726                 return (B(eax));
5727         case 77:
5728                 return (cpi->cpi_family <= 0x11);
5729         case 78:
5730                 return (B(eax) || SH_C0(eax));
5731         case 79:
5732                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5733         case 80:
5734         case 81:
5735         case 82:
5736                 return (B(eax));
5737         case 83:
5738                 return (B(eax) || SH_C0(eax) || CG(eax));
5739         case 85:
5740                 return (cpi->cpi_family < 0x10);
5741         case 86:
5742                 return (SH_C0(eax) || CG(eax));
5743         case 88:
5744 #if !defined(__amd64)
5745                 return (0);
5746 #else
5747                 return (B(eax) || SH_C0(eax));
5748 #endif
5749         case 89:
5750                 return (cpi->cpi_family < 0x10);
5751         case 90:
5752                 return (B(eax) || SH_C0(eax) || CG(eax));
5753         case 91:
5754         case 92:
5755                 return (B(eax) || SH_C0(eax));
5756         case 93:
5757                 return (SH_C0(eax));
5758         case 94:
5759                 return (B(eax) || SH_C0(eax) || CG(eax));
5760         case 95:
5761 #if !defined(__amd64)
5762                 return (0);
5763 #else
5764                 return (B(eax) || SH_C0(eax));
5765 #endif
5766         case 96:
5767                 return (B(eax) || SH_C0(eax) || CG(eax));
5768         case 97:
5769         case 98:
5770                 return (SH_C0(eax) || CG(eax));
5771         case 99:
5772                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5773         case 100:
5774                 return (B(eax) || SH_C0(eax));
5775         case 101:
5776         case 103:
5777                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5778         case 104:
5779                 return (SH_C0(eax) || CG(eax) || D0(eax));
5780         case 105:
5781         case 106:
5782         case 107:
5783                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5784         case 108:
5785                 return (DH_CG(eax));
5786         case 109:
5787                 return (SH_C0(eax) || CG(eax) || D0(eax));
5788         case 110:
5789                 return (D0(eax) || EX(eax));
5790         case 111:
5791                 return (CG(eax));
5792         case 112:
5793                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5794         case 113:
5795                 return (eax == 0x20fc0);
5796         case 114:
5797                 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5798         case 115:
5799                 return (SH_E0(eax) || JH_E1(eax));
5800         case 116:
5801                 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5802         case 117:
5803                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5804         case 118:
5805                 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5806                     JH_E6(eax));
5807         case 121:
5808                 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5809         case 122:
5810                 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5811         case 123:
5812                 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5813         case 131:
5814                 return (cpi->cpi_family < 0x10);
5815         case 6336786:
5816 
5817                 /*
5818                  * Test for AdvPowerMgmtInfo.TscPStateInvariant
5819                  * if this is a K8 family or newer processor. We're testing for
5820                  * this 'erratum' to determine whether or not we have a constant
5821                  * TSC.
5822                  *
5823                  * Our current fix for this is to disable the C1-Clock ramping.
5824                  * However, this doesn't work on newer processor families nor
5825                  * does it work when virtualized as those devices don't exist.
5826                  */
5827                 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5828                         return (0);
5829                 }
5830 
5831                 if (CPI_FAMILY(cpi) == 0xf) {
5832                         struct cpuid_regs regs;
5833                         regs.cp_eax = 0x80000007;
5834                         (void) __cpuid_insn(&regs);
5835                         return (!(regs.cp_edx & 0x100));
5836                 }
5837                 return (0);
5838         case 6323525:
5839                 /*
5840                  * This erratum (K8 #147) is not present on family 10 and newer.
5841                  */
5842                 if (cpi->cpi_family >= 0x10) {
5843                         return (0);
5844                 }
5845                 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5846                     (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5847 
5848         case 6671130:
5849                 /*
5850                  * check for processors (pre-Shanghai) that do not provide
5851                  * optimal management of 1gb ptes in its tlb.
5852                  */
5853                 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5854 
5855         case 298:
5856                 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5857                     DR_B2(eax) || RB_C0(eax));
5858 
5859         case 721:
5860 #if defined(__amd64)
5861                 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5862 #else
5863                 return (0);
5864 #endif
5865 
5866         default:
5867                 return (-1);
5868 
5869         }
5870 }
5871 
5872 /*
5873  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5874  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5875  */
5876 int
5877 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5878 {
5879         struct cpuid_info       *cpi;
5880         uint_t                  osvwid;
5881         static int              osvwfeature = -1;
5882         uint64_t                osvwlength;
5883 
5884 
5885         cpi = cpu->cpu_m.mcpu_cpi;
5886 
5887         /* confirm OSVW supported */
5888         if (osvwfeature == -1) {
5889                 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5890         } else {
5891                 /* assert that osvw feature setting is consistent on all cpus */
5892                 ASSERT(osvwfeature ==
5893                     (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5894         }
5895         if (!osvwfeature)
5896                 return (-1);
5897 
5898         osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5899 
5900         switch (erratum) {
5901         case 298:       /* osvwid is 0 */
5902                 osvwid = 0;
5903                 if (osvwlength <= (uint64_t)osvwid) {
5904                         /* osvwid 0 is unknown */
5905                         return (-1);
5906                 }
5907 
5908                 /*
5909                  * Check the OSVW STATUS MSR to determine the state
5910                  * of the erratum where:
5911                  *   0 - fixed by HW
5912                  *   1 - BIOS has applied the workaround when BIOS
5913                  *   workaround is available. (Or for other errata,
5914                  *   OS workaround is required.)
5915                  * For a value of 1, caller will confirm that the
5916                  * erratum 298 workaround has indeed been applied by BIOS.
5917                  *
5918                  * A 1 may be set in cpus that have a HW fix
5919                  * in a mixed cpu system. Regarding erratum 298:
5920                  *   In a multiprocessor platform, the workaround above
5921                  *   should be applied to all processors regardless of
5922                  *   silicon revision when an affected processor is
5923                  *   present.
5924                  */
5925 
5926                 return (rdmsr(MSR_AMD_OSVW_STATUS +
5927                     (osvwid / OSVW_ID_CNT_PER_MSR)) &
5928                     (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5929 
5930         default:
5931                 return (-1);
5932         }
5933 }
5934 
5935 static const char assoc_str[] = "associativity";
5936 static const char line_str[] = "line-size";
5937 static const char size_str[] = "size";
5938 
5939 static void
5940 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5941     uint32_t val)
5942 {
5943         char buf[128];
5944 
5945         /*
5946          * ndi_prop_update_int() is used because it is desirable for
5947          * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
5948          */
5949         if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5950                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5951 }
5952 
5953 /*
5954  * Intel-style cache/tlb description
5955  *
5956  * Standard cpuid level 2 gives a randomly ordered
5957  * selection of tags that index into a table that describes
5958  * cache and tlb properties.
5959  */
5960 
5961 static const char l1_icache_str[] = "l1-icache";
5962 static const char l1_dcache_str[] = "l1-dcache";
5963 static const char l2_cache_str[] = "l2-cache";
5964 static const char l3_cache_str[] = "l3-cache";
5965 static const char itlb4k_str[] = "itlb-4K";
5966 static const char dtlb4k_str[] = "dtlb-4K";
5967 static const char itlb2M_str[] = "itlb-2M";
5968 static const char itlb4M_str[] = "itlb-4M";
5969 static const char dtlb4M_str[] = "dtlb-4M";
5970 static const char dtlb24_str[] = "dtlb0-2M-4M";
5971 static const char itlb424_str[] = "itlb-4K-2M-4M";
5972 static const char itlb24_str[] = "itlb-2M-4M";
5973 static const char dtlb44_str[] = "dtlb-4K-4M";
5974 static const char sl1_dcache_str[] = "sectored-l1-dcache";
5975 static const char sl2_cache_str[] = "sectored-l2-cache";
5976 static const char itrace_str[] = "itrace-cache";
5977 static const char sl3_cache_str[] = "sectored-l3-cache";
5978 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
5979 
5980 static const struct cachetab {
5981         uint8_t         ct_code;
5982         uint8_t         ct_assoc;
5983         uint16_t        ct_line_size;
5984         size_t          ct_size;
5985         const char      *ct_label;
5986 } intel_ctab[] = {
5987         /*
5988          * maintain descending order!
5989          *
5990          * Codes ignored - Reason
5991          * ----------------------
5992          * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
5993          * f0H/f1H - Currently we do not interpret prefetch size by design
5994          */
5995         { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
5996         { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
5997         { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
5998         { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
5999         { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6000         { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6001         { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6002         { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6003         { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6004         { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6005         { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6006         { 0xd0, 4, 64, 512*1024, l3_cache_str},
6007         { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6008         { 0xc0, 4, 0, 8, dtlb44_str },
6009         { 0xba, 4, 0, 64, dtlb4k_str },
6010         { 0xb4, 4, 0, 256, dtlb4k_str },
6011         { 0xb3, 4, 0, 128, dtlb4k_str },
6012         { 0xb2, 4, 0, 64, itlb4k_str },
6013         { 0xb0, 4, 0, 128, itlb4k_str },
6014         { 0x87, 8, 64, 1024*1024, l2_cache_str},
6015         { 0x86, 4, 64, 512*1024, l2_cache_str},
6016         { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6017         { 0x84, 8, 32, 1024*1024, l2_cache_str},
6018         { 0x83, 8, 32, 512*1024, l2_cache_str},
6019         { 0x82, 8, 32, 256*1024, l2_cache_str},
6020         { 0x80, 8, 64, 512*1024, l2_cache_str},
6021         { 0x7f, 2, 64, 512*1024, l2_cache_str},
6022         { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6023         { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6024         { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6025         { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6026         { 0x79, 8, 64, 128*1024, sl2_cache_str},
6027         { 0x78, 8, 64, 1024*1024, l2_cache_str},
6028         { 0x73, 8, 0, 64*1024, itrace_str},
6029         { 0x72, 8, 0, 32*1024, itrace_str},
6030         { 0x71, 8, 0, 16*1024, itrace_str},
6031         { 0x70, 8, 0, 12*1024, itrace_str},
6032         { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6033         { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6034         { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6035         { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6036         { 0x5d, 0, 0, 256, dtlb44_str},
6037         { 0x5c, 0, 0, 128, dtlb44_str},
6038         { 0x5b, 0, 0, 64, dtlb44_str},
6039         { 0x5a, 4, 0, 32, dtlb24_str},
6040         { 0x59, 0, 0, 16, dtlb4k_str},
6041         { 0x57, 4, 0, 16, dtlb4k_str},
6042         { 0x56, 4, 0, 16, dtlb4M_str},
6043         { 0x55, 0, 0, 7, itlb24_str},
6044         { 0x52, 0, 0, 256, itlb424_str},
6045         { 0x51, 0, 0, 128, itlb424_str},
6046         { 0x50, 0, 0, 64, itlb424_str},
6047         { 0x4f, 0, 0, 32, itlb4k_str},
6048         { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6049         { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6050         { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6051         { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6052         { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6053         { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6054         { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6055         { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6056         { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6057         { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6058         { 0x44, 4, 32, 1024*1024, l2_cache_str},
6059         { 0x43, 4, 32, 512*1024, l2_cache_str},
6060         { 0x42, 4, 32, 256*1024, l2_cache_str},
6061         { 0x41, 4, 32, 128*1024, l2_cache_str},
6062         { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6063         { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6064         { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6065         { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6066         { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6067         { 0x39, 4, 64, 128*1024, sl2_cache_str},
6068         { 0x30, 8, 64, 32*1024, l1_icache_str},
6069         { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6070         { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6071         { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6072         { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6073         { 0x22, 4, 64, 512*1024, sl3_cache_str},
6074         { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6075         { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6076         { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6077         { 0x0b, 4, 0, 4, itlb4M_str},
6078         { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6079         { 0x08, 4, 32, 16*1024, l1_icache_str},
6080         { 0x06, 4, 32, 8*1024, l1_icache_str},
6081         { 0x05, 4, 0, 32, dtlb4M_str},
6082         { 0x04, 4, 0, 8, dtlb4M_str},
6083         { 0x03, 4, 0, 64, dtlb4k_str},
6084         { 0x02, 4, 0, 2, itlb4M_str},
6085         { 0x01, 4, 0, 32, itlb4k_str},
6086         { 0 }
6087 };
6088 
6089 static const struct cachetab cyrix_ctab[] = {
6090         { 0x70, 4, 0, 32, "tlb-4K" },
6091         { 0x80, 4, 16, 16*1024, "l1-cache" },
6092         { 0 }
6093 };
6094 
6095 /*
6096  * Search a cache table for a matching entry
6097  */
6098 static const struct cachetab *
6099 find_cacheent(const struct cachetab *ct, uint_t code)
6100 {
6101         if (code != 0) {
6102                 for (; ct->ct_code != 0; ct++)
6103                         if (ct->ct_code <= code)
6104                                 break;
6105                 if (ct->ct_code == code)
6106                         return (ct);
6107         }
6108         return (NULL);
6109 }
6110 
6111 /*
6112  * Populate cachetab entry with L2 or L3 cache-information using
6113  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6114  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6115  * information is found.
6116  */
6117 static int
6118 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6119 {
6120         uint32_t level, i;
6121         int ret = 0;
6122 
6123         for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6124                 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6125 
6126                 if (level == 2 || level == 3) {
6127                         ct->ct_assoc =
6128                             CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6129                         ct->ct_line_size =
6130                             CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6131                         ct->ct_size = ct->ct_assoc *
6132                             (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6133                             ct->ct_line_size *
6134                             (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6135 
6136                         if (level == 2) {
6137                                 ct->ct_label = l2_cache_str;
6138                         } else if (level == 3) {
6139                                 ct->ct_label = l3_cache_str;
6140                         }
6141                         ret = 1;
6142                 }
6143         }
6144 
6145         return (ret);
6146 }
6147 
6148 /*
6149  * Walk the cacheinfo descriptor, applying 'func' to every valid element
6150  * The walk is terminated if the walker returns non-zero.
6151  */
6152 static void
6153 intel_walk_cacheinfo(struct cpuid_info *cpi,
6154     void *arg, int (*func)(void *, const struct cachetab *))
6155 {
6156         const struct cachetab *ct;
6157         struct cachetab des_49_ct, des_b1_ct;
6158         uint8_t *dp;
6159         int i;
6160 
6161         if ((dp = cpi->cpi_cacheinfo) == NULL)
6162                 return;
6163         for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6164                 /*
6165                  * For overloaded descriptor 0x49 we use cpuid function 4
6166                  * if supported by the current processor, to create
6167                  * cache information.
6168                  * For overloaded descriptor 0xb1 we use X86_PAE flag
6169                  * to disambiguate the cache information.
6170                  */
6171                 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6172                     intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6173                                 ct = &des_49_ct;
6174                 } else if (*dp == 0xb1) {
6175                         des_b1_ct.ct_code = 0xb1;
6176                         des_b1_ct.ct_assoc = 4;
6177                         des_b1_ct.ct_line_size = 0;
6178                         if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6179                                 des_b1_ct.ct_size = 8;
6180                                 des_b1_ct.ct_label = itlb2M_str;
6181                         } else {
6182                                 des_b1_ct.ct_size = 4;
6183                                 des_b1_ct.ct_label = itlb4M_str;
6184                         }
6185                         ct = &des_b1_ct;
6186                 } else {
6187                         if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6188                                 continue;
6189                         }
6190                 }
6191 
6192                 if (func(arg, ct) != 0) {
6193                         break;
6194                 }
6195         }
6196 }
6197 
6198 /*
6199  * (Like the Intel one, except for Cyrix CPUs)
6200  */
6201 static void
6202 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6203     void *arg, int (*func)(void *, const struct cachetab *))
6204 {
6205         const struct cachetab *ct;
6206         uint8_t *dp;
6207         int i;
6208 
6209         if ((dp = cpi->cpi_cacheinfo) == NULL)
6210                 return;
6211         for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6212                 /*
6213                  * Search Cyrix-specific descriptor table first ..
6214                  */
6215                 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6216                         if (func(arg, ct) != 0)
6217                                 break;
6218                         continue;
6219                 }
6220                 /*
6221                  * .. else fall back to the Intel one
6222                  */
6223                 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6224                         if (func(arg, ct) != 0)
6225                                 break;
6226                         continue;
6227                 }
6228         }
6229 }
6230 
6231 /*
6232  * A cacheinfo walker that adds associativity, line-size, and size properties
6233  * to the devinfo node it is passed as an argument.
6234  */
6235 static int
6236 add_cacheent_props(void *arg, const struct cachetab *ct)
6237 {
6238         dev_info_t *devi = arg;
6239 
6240         add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6241         if (ct->ct_line_size != 0)
6242                 add_cache_prop(devi, ct->ct_label, line_str,
6243                     ct->ct_line_size);
6244         add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6245         return (0);
6246 }
6247 
6248 
6249 static const char fully_assoc[] = "fully-associative?";
6250 
6251 /*
6252  * AMD style cache/tlb description
6253  *
6254  * Extended functions 5 and 6 directly describe properties of
6255  * tlbs and various cache levels.
6256  */
6257 static void
6258 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6259 {
6260         switch (assoc) {
6261         case 0: /* reserved; ignore */
6262                 break;
6263         default:
6264                 add_cache_prop(devi, label, assoc_str, assoc);
6265                 break;
6266         case 0xff:
6267                 add_cache_prop(devi, label, fully_assoc, 1);
6268                 break;
6269         }
6270 }
6271 
6272 static void
6273 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6274 {
6275         if (size == 0)
6276                 return;
6277         add_cache_prop(devi, label, size_str, size);
6278         add_amd_assoc(devi, label, assoc);
6279 }
6280 
6281 static void
6282 add_amd_cache(dev_info_t *devi, const char *label,
6283     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6284 {
6285         if (size == 0 || line_size == 0)
6286                 return;
6287         add_amd_assoc(devi, label, assoc);
6288         /*
6289          * Most AMD parts have a sectored cache. Multiple cache lines are
6290          * associated with each tag. A sector consists of all cache lines
6291          * associated with a tag. For example, the AMD K6-III has a sector
6292          * size of 2 cache lines per tag.
6293          */
6294         if (lines_per_tag != 0)
6295                 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6296         add_cache_prop(devi, label, line_str, line_size);
6297         add_cache_prop(devi, label, size_str, size * 1024);
6298 }
6299 
6300 static void
6301 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6302 {
6303         switch (assoc) {
6304         case 0: /* off */
6305                 break;
6306         case 1:
6307         case 2:
6308         case 4:
6309                 add_cache_prop(devi, label, assoc_str, assoc);
6310                 break;
6311         case 6:
6312                 add_cache_prop(devi, label, assoc_str, 8);
6313                 break;
6314         case 8:
6315                 add_cache_prop(devi, label, assoc_str, 16);
6316                 break;
6317         case 0xf:
6318                 add_cache_prop(devi, label, fully_assoc, 1);
6319                 break;
6320         default: /* reserved; ignore */
6321                 break;
6322         }
6323 }
6324 
6325 static void
6326 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6327 {
6328         if (size == 0 || assoc == 0)
6329                 return;
6330         add_amd_l2_assoc(devi, label, assoc);
6331         add_cache_prop(devi, label, size_str, size);
6332 }
6333 
6334 static void
6335 add_amd_l2_cache(dev_info_t *devi, const char *label,
6336     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6337 {
6338         if (size == 0 || assoc == 0 || line_size == 0)
6339                 return;
6340         add_amd_l2_assoc(devi, label, assoc);
6341         if (lines_per_tag != 0)
6342                 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6343         add_cache_prop(devi, label, line_str, line_size);
6344         add_cache_prop(devi, label, size_str, size * 1024);
6345 }
6346 
6347 static void
6348 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6349 {
6350         struct cpuid_regs *cp;
6351 
6352         if (cpi->cpi_xmaxeax < 0x80000005)
6353                 return;
6354         cp = &cpi->cpi_extd[5];
6355 
6356         /*
6357          * 4M/2M L1 TLB configuration
6358          *
6359          * We report the size for 2M pages because AMD uses two
6360          * TLB entries for one 4M page.
6361          */
6362         add_amd_tlb(devi, "dtlb-2M",
6363             BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6364         add_amd_tlb(devi, "itlb-2M",
6365             BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6366 
6367         /*
6368          * 4K L1 TLB configuration
6369          */
6370 
6371         switch (cpi->cpi_vendor) {
6372                 uint_t nentries;
6373         case X86_VENDOR_TM:
6374                 if (cpi->cpi_family >= 5) {
6375                         /*
6376                          * Crusoe processors have 256 TLB entries, but
6377                          * cpuid data format constrains them to only
6378                          * reporting 255 of them.
6379                          */
6380                         if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6381                                 nentries = 256;
6382                         /*
6383                          * Crusoe processors also have a unified TLB
6384                          */
6385                         add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6386                             nentries);
6387                         break;
6388                 }
6389                 /*FALLTHROUGH*/
6390         default:
6391                 add_amd_tlb(devi, itlb4k_str,
6392                     BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6393                 add_amd_tlb(devi, dtlb4k_str,
6394                     BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6395                 break;
6396         }
6397 
6398         /*
6399          * data L1 cache configuration
6400          */
6401 
6402         add_amd_cache(devi, l1_dcache_str,
6403             BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6404             BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6405 
6406         /*
6407          * code L1 cache configuration
6408          */
6409 
6410         add_amd_cache(devi, l1_icache_str,
6411             BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6412             BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6413 
6414         if (cpi->cpi_xmaxeax < 0x80000006)
6415                 return;
6416         cp = &cpi->cpi_extd[6];
6417 
6418         /* Check for a unified L2 TLB for large pages */
6419 
6420         if (BITX(cp->cp_eax, 31, 16) == 0)
6421                 add_amd_l2_tlb(devi, "l2-tlb-2M",
6422                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6423         else {
6424                 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6425                     BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6426                 add_amd_l2_tlb(devi, "l2-itlb-2M",
6427                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6428         }
6429 
6430         /* Check for a unified L2 TLB for 4K pages */
6431 
6432         if (BITX(cp->cp_ebx, 31, 16) == 0) {
6433                 add_amd_l2_tlb(devi, "l2-tlb-4K",
6434                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6435         } else {
6436                 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6437                     BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6438                 add_amd_l2_tlb(devi, "l2-itlb-4K",
6439                     BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6440         }
6441 
6442         add_amd_l2_cache(devi, l2_cache_str,
6443             BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6444             BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6445 }
6446 
6447 /*
6448  * There are two basic ways that the x86 world describes it cache
6449  * and tlb architecture - Intel's way and AMD's way.
6450  *
6451  * Return which flavor of cache architecture we should use
6452  */
6453 static int
6454 x86_which_cacheinfo(struct cpuid_info *cpi)
6455 {
6456         switch (cpi->cpi_vendor) {
6457         case X86_VENDOR_Intel:
6458                 if (cpi->cpi_maxeax >= 2)
6459                         return (X86_VENDOR_Intel);
6460                 break;
6461         case X86_VENDOR_AMD:
6462                 /*
6463                  * The K5 model 1 was the first part from AMD that reported
6464                  * cache sizes via extended cpuid functions.
6465                  */
6466                 if (cpi->cpi_family > 5 ||
6467                     (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6468                         return (X86_VENDOR_AMD);
6469                 break;
6470         case X86_VENDOR_TM:
6471                 if (cpi->cpi_family >= 5)
6472                         return (X86_VENDOR_AMD);
6473                 /*FALLTHROUGH*/
6474         default:
6475                 /*
6476                  * If they have extended CPU data for 0x80000005
6477                  * then we assume they have AMD-format cache
6478                  * information.
6479                  *
6480                  * If not, and the vendor happens to be Cyrix,
6481                  * then try our-Cyrix specific handler.
6482                  *
6483                  * If we're not Cyrix, then assume we're using Intel's
6484                  * table-driven format instead.
6485                  */
6486                 if (cpi->cpi_xmaxeax >= 0x80000005)
6487                         return (X86_VENDOR_AMD);
6488                 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6489                         return (X86_VENDOR_Cyrix);
6490                 else if (cpi->cpi_maxeax >= 2)
6491                         return (X86_VENDOR_Intel);
6492                 break;
6493         }
6494         return (-1);
6495 }
6496 
6497 void
6498 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6499     struct cpuid_info *cpi)
6500 {
6501         dev_info_t *cpu_devi;
6502         int create;
6503 
6504         cpu_devi = (dev_info_t *)dip;
6505 
6506         /* device_type */
6507         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6508             "device_type", "cpu");
6509 
6510         /* reg */
6511         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6512             "reg", cpu_id);
6513 
6514         /* cpu-mhz, and clock-frequency */
6515         if (cpu_freq > 0) {
6516                 long long mul;
6517 
6518                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6519                     "cpu-mhz", cpu_freq);
6520                 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6521                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6522                             "clock-frequency", (int)mul);
6523         }
6524 
6525         if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6526                 return;
6527         }
6528 
6529         /* vendor-id */
6530         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6531             "vendor-id", cpi->cpi_vendorstr);
6532 
6533         if (cpi->cpi_maxeax == 0) {
6534                 return;
6535         }
6536 
6537         /*
6538          * family, model, and step
6539          */
6540         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6541             "family", CPI_FAMILY(cpi));
6542         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6543             "cpu-model", CPI_MODEL(cpi));
6544         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6545             "stepping-id", CPI_STEP(cpi));
6546 
6547         /* type */
6548         switch (cpi->cpi_vendor) {
6549         case X86_VENDOR_Intel:
6550                 create = 1;
6551                 break;
6552         default:
6553                 create = 0;
6554                 break;
6555         }
6556         if (create)
6557                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6558                     "type", CPI_TYPE(cpi));
6559 
6560         /* ext-family */
6561         switch (cpi->cpi_vendor) {
6562         case X86_VENDOR_Intel:
6563         case X86_VENDOR_AMD:
6564                 create = cpi->cpi_family >= 0xf;
6565                 break;
6566         default:
6567                 create = 0;
6568                 break;
6569         }
6570         if (create)
6571                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6572                     "ext-family", CPI_FAMILY_XTD(cpi));
6573 
6574         /* ext-model */
6575         switch (cpi->cpi_vendor) {
6576         case X86_VENDOR_Intel:
6577                 create = IS_EXTENDED_MODEL_INTEL(cpi);
6578                 break;
6579         case X86_VENDOR_AMD:
6580                 create = CPI_FAMILY(cpi) == 0xf;
6581                 break;
6582         default:
6583                 create = 0;
6584                 break;
6585         }
6586         if (create)
6587                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6588                     "ext-model", CPI_MODEL_XTD(cpi));
6589 
6590         /* generation */
6591         switch (cpi->cpi_vendor) {
6592         case X86_VENDOR_AMD:
6593                 /*
6594                  * AMD K5 model 1 was the first part to support this
6595                  */
6596                 create = cpi->cpi_xmaxeax >= 0x80000001;
6597                 break;
6598         default:
6599                 create = 0;
6600                 break;
6601         }
6602         if (create)
6603                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6604                     "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6605 
6606         /* brand-id */
6607         switch (cpi->cpi_vendor) {
6608         case X86_VENDOR_Intel:
6609                 /*
6610                  * brand id first appeared on Pentium III Xeon model 8,
6611                  * and Celeron model 8 processors and Opteron
6612                  */
6613                 create = cpi->cpi_family > 6 ||
6614                     (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6615                 break;
6616         case X86_VENDOR_AMD:
6617                 create = cpi->cpi_family >= 0xf;
6618                 break;
6619         default:
6620                 create = 0;
6621                 break;
6622         }
6623         if (create && cpi->cpi_brandid != 0) {
6624                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6625                     "brand-id", cpi->cpi_brandid);
6626         }
6627 
6628         /* chunks, and apic-id */
6629         switch (cpi->cpi_vendor) {
6630                 /*
6631                  * first available on Pentium IV and Opteron (K8)
6632                  */
6633         case X86_VENDOR_Intel:
6634                 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6635                 break;
6636         case X86_VENDOR_AMD:
6637                 create = cpi->cpi_family >= 0xf;
6638                 break;
6639         default:
6640                 create = 0;
6641                 break;
6642         }
6643         if (create) {
6644                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6645                     "chunks", CPI_CHUNKS(cpi));
6646                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6647                     "apic-id", cpi->cpi_apicid);
6648                 if (cpi->cpi_chipid >= 0) {
6649                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6650                             "chip#", cpi->cpi_chipid);
6651                         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6652                             "clog#", cpi->cpi_clogid);
6653                 }
6654         }
6655 
6656         /* cpuid-features */
6657         (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6658             "cpuid-features", CPI_FEATURES_EDX(cpi));
6659 
6660 
6661         /* cpuid-features-ecx */
6662         switch (cpi->cpi_vendor) {
6663         case X86_VENDOR_Intel:
6664                 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6665                 break;
6666         case X86_VENDOR_AMD:
6667                 create = cpi->cpi_family >= 0xf;
6668                 break;
6669         default:
6670                 create = 0;
6671                 break;
6672         }
6673         if (create)
6674                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6675                     "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6676 
6677         /* ext-cpuid-features */
6678         switch (cpi->cpi_vendor) {
6679         case X86_VENDOR_Intel:
6680         case X86_VENDOR_AMD:
6681         case X86_VENDOR_Cyrix:
6682         case X86_VENDOR_TM:
6683         case X86_VENDOR_Centaur:
6684                 create = cpi->cpi_xmaxeax >= 0x80000001;
6685                 break;
6686         default:
6687                 create = 0;
6688                 break;
6689         }
6690         if (create) {
6691                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6692                     "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6693                 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6694                     "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6695         }
6696 
6697         /*
6698          * Brand String first appeared in Intel Pentium IV, AMD K5
6699          * model 1, and Cyrix GXm.  On earlier models we try and
6700          * simulate something similar .. so this string should always
6701          * same -something- about the processor, however lame.
6702          */
6703         (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6704             "brand-string", cpi->cpi_brandstr);
6705 
6706         /*
6707          * Finally, cache and tlb information
6708          */
6709         switch (x86_which_cacheinfo(cpi)) {
6710         case X86_VENDOR_Intel:
6711                 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6712                 break;
6713         case X86_VENDOR_Cyrix:
6714                 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6715                 break;
6716         case X86_VENDOR_AMD:
6717                 amd_cache_info(cpi, cpu_devi);
6718                 break;
6719         default:
6720                 break;
6721         }
6722 }
6723 
6724 struct l2info {
6725         int *l2i_csz;
6726         int *l2i_lsz;
6727         int *l2i_assoc;
6728         int l2i_ret;
6729 };
6730 
6731 /*
6732  * A cacheinfo walker that fetches the size, line-size and associativity
6733  * of the L2 cache
6734  */
6735 static int
6736 intel_l2cinfo(void *arg, const struct cachetab *ct)
6737 {
6738         struct l2info *l2i = arg;
6739         int *ip;
6740 
6741         if (ct->ct_label != l2_cache_str &&
6742             ct->ct_label != sl2_cache_str)
6743                 return (0);     /* not an L2 -- keep walking */
6744 
6745         if ((ip = l2i->l2i_csz) != NULL)
6746                 *ip = ct->ct_size;
6747         if ((ip = l2i->l2i_lsz) != NULL)
6748                 *ip = ct->ct_line_size;
6749         if ((ip = l2i->l2i_assoc) != NULL)
6750                 *ip = ct->ct_assoc;
6751         l2i->l2i_ret = ct->ct_size;
6752         return (1);             /* was an L2 -- terminate walk */
6753 }
6754 
6755 /*
6756  * AMD L2/L3 Cache and TLB Associativity Field Definition:
6757  *
6758  *      Unlike the associativity for the L1 cache and tlb where the 8 bit
6759  *      value is the associativity, the associativity for the L2 cache and
6760  *      tlb is encoded in the following table. The 4 bit L2 value serves as
6761  *      an index into the amd_afd[] array to determine the associativity.
6762  *      -1 is undefined. 0 is fully associative.
6763  */
6764 
6765 static int amd_afd[] =
6766         {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
6767 
6768 static void
6769 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6770 {
6771         struct cpuid_regs *cp;
6772         uint_t size, assoc;
6773         int i;
6774         int *ip;
6775 
6776         if (cpi->cpi_xmaxeax < 0x80000006)
6777                 return;
6778         cp = &cpi->cpi_extd[6];
6779 
6780         if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6781             (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6782                 uint_t cachesz = size * 1024;
6783                 assoc = amd_afd[i];
6784 
6785                 ASSERT(assoc != -1);
6786 
6787                 if ((ip = l2i->l2i_csz) != NULL)
6788                         *ip = cachesz;
6789                 if ((ip = l2i->l2i_lsz) != NULL)
6790                         *ip = BITX(cp->cp_ecx, 7, 0);
6791                 if ((ip = l2i->l2i_assoc) != NULL)
6792                         *ip = assoc;
6793                 l2i->l2i_ret = cachesz;
6794         }
6795 }
6796 
6797 int
6798 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6799 {
6800         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6801         struct l2info __l2info, *l2i = &__l2info;
6802 
6803         l2i->l2i_csz = csz;
6804         l2i->l2i_lsz = lsz;
6805         l2i->l2i_assoc = assoc;
6806         l2i->l2i_ret = -1;
6807 
6808         switch (x86_which_cacheinfo(cpi)) {
6809         case X86_VENDOR_Intel:
6810                 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6811                 break;
6812         case X86_VENDOR_Cyrix:
6813                 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6814                 break;
6815         case X86_VENDOR_AMD:
6816                 amd_l2cacheinfo(cpi, l2i);
6817                 break;
6818         default:
6819                 break;
6820         }
6821         return (l2i->l2i_ret);
6822 }
6823 
6824 #if !defined(__xpv)
6825 
6826 uint32_t *
6827 cpuid_mwait_alloc(cpu_t *cpu)
6828 {
6829         uint32_t        *ret;
6830         size_t          mwait_size;
6831 
6832         ASSERT(cpuid_checkpass(CPU, 2));
6833 
6834         mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6835         if (mwait_size == 0)
6836                 return (NULL);
6837 
6838         /*
6839          * kmem_alloc() returns cache line size aligned data for mwait_size
6840          * allocations.  mwait_size is currently cache line sized.  Neither
6841          * of these implementation details are guarantied to be true in the
6842          * future.
6843          *
6844          * First try allocating mwait_size as kmem_alloc() currently returns
6845          * correctly aligned memory.  If kmem_alloc() does not return
6846          * mwait_size aligned memory, then use mwait_size ROUNDUP.
6847          *
6848          * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6849          * decide to free this memory.
6850          */
6851         ret = kmem_zalloc(mwait_size, KM_SLEEP);
6852         if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6853                 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6854                 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6855                 *ret = MWAIT_RUNNING;
6856                 return (ret);
6857         } else {
6858                 kmem_free(ret, mwait_size);
6859                 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6860                 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6861                 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6862                 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6863                 *ret = MWAIT_RUNNING;
6864                 return (ret);
6865         }
6866 }
6867 
6868 void
6869 cpuid_mwait_free(cpu_t *cpu)
6870 {
6871         if (cpu->cpu_m.mcpu_cpi == NULL) {
6872                 return;
6873         }
6874 
6875         if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6876             cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6877                 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6878                     cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6879         }
6880 
6881         cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6882         cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6883 }
6884 
6885 void
6886 patch_tsc_read(int flag)
6887 {
6888         size_t cnt;
6889 
6890         switch (flag) {
6891         case TSC_NONE:
6892                 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6893                 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6894                 break;
6895         case TSC_RDTSC_MFENCE:
6896                 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6897                 (void) memcpy((void *)tsc_read,
6898                     (void *)&_tsc_mfence_start, cnt);
6899                 break;
6900         case TSC_RDTSC_LFENCE:
6901                 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6902                 (void) memcpy((void *)tsc_read,
6903                     (void *)&_tsc_lfence_start, cnt);
6904                 break;
6905         case TSC_TSCP:
6906                 cnt = &_tscp_end - &_tscp_start;
6907                 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6908                 break;
6909         default:
6910                 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6911                 cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
6912                 break;
6913         }
6914         tsc_type = flag;
6915 }
6916 
6917 int
6918 cpuid_deep_cstates_supported(void)
6919 {
6920         struct cpuid_info *cpi;
6921         struct cpuid_regs regs;
6922 
6923         ASSERT(cpuid_checkpass(CPU, 1));
6924 
6925         cpi = CPU->cpu_m.mcpu_cpi;
6926 
6927         if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6928                 return (0);
6929 
6930         switch (cpi->cpi_vendor) {
6931         case X86_VENDOR_Intel:
6932                 if (cpi->cpi_xmaxeax < 0x80000007)
6933                         return (0);
6934 
6935                 /*
6936                  * TSC run at a constant rate in all ACPI C-states?
6937                  */
6938                 regs.cp_eax = 0x80000007;
6939                 (void) __cpuid_insn(&regs);
6940                 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6941 
6942         default:
6943                 return (0);
6944         }
6945 }
6946 
6947 #endif  /* !__xpv */
6948 
6949 void
6950 post_startup_cpu_fixups(void)
6951 {
6952 #ifndef __xpv
6953         /*
6954          * Some AMD processors support C1E state. Entering this state will
6955          * cause the local APIC timer to stop, which we can't deal with at
6956          * this time.
6957          */
6958         if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6959                 on_trap_data_t otd;
6960                 uint64_t reg;
6961 
6962                 if (!on_trap(&otd, OT_DATA_ACCESS)) {
6963                         reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6964                         /* Disable C1E state if it is enabled by BIOS */
6965                         if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
6966                             AMD_ACTONCMPHALT_MASK) {
6967                                 reg &= ~(AMD_ACTONCMPHALT_MASK <<
6968                                     AMD_ACTONCMPHALT_SHIFT);
6969                                 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
6970                         }
6971                 }
6972                 no_trap();
6973         }
6974 #endif  /* !__xpv */
6975 }
6976 
6977 void
6978 enable_pcid(void)
6979 {
6980         if (x86_use_pcid == -1)
6981                 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
6982 
6983         if (x86_use_invpcid == -1) {
6984                 x86_use_invpcid = is_x86_feature(x86_featureset,
6985                     X86FSET_INVPCID);
6986         }
6987 
6988         if (!x86_use_pcid)
6989                 return;
6990 
6991         /*
6992          * Intel say that on setting PCIDE, it immediately starts using the PCID
6993          * bits; better make sure there's nothing there.
6994          */
6995         ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
6996 
6997         setcr4(getcr4() | CR4_PCIDE);
6998 }
6999 
7000 /*
7001  * Setup necessary registers to enable XSAVE feature on this processor.
7002  * This function needs to be called early enough, so that no xsave/xrstor
7003  * ops will execute on the processor before the MSRs are properly set up.
7004  *
7005  * Current implementation has the following assumption:
7006  * - cpuid_pass1() is done, so that X86 features are known.
7007  * - fpu_probe() is done, so that fp_save_mech is chosen.
7008  */
7009 void
7010 xsave_setup_msr(cpu_t *cpu)
7011 {
7012         ASSERT(fp_save_mech == FP_XSAVE);
7013         ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7014 
7015         /* Enable OSXSAVE in CR4. */
7016         setcr4(getcr4() | CR4_OSXSAVE);
7017         /*
7018          * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7019          * correct value.
7020          */
7021         cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7022         setup_xfem();
7023 }
7024 
7025 /*
7026  * Starting with the Westmere processor the local
7027  * APIC timer will continue running in all C-states,
7028  * including the deepest C-states.
7029  */
7030 int
7031 cpuid_arat_supported(void)
7032 {
7033         struct cpuid_info *cpi;
7034         struct cpuid_regs regs;
7035 
7036         ASSERT(cpuid_checkpass(CPU, 1));
7037         ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7038 
7039         cpi = CPU->cpu_m.mcpu_cpi;
7040 
7041         switch (cpi->cpi_vendor) {
7042         case X86_VENDOR_Intel:
7043                 /*
7044                  * Always-running Local APIC Timer is
7045                  * indicated by CPUID.6.EAX[2].
7046                  */
7047                 if (cpi->cpi_maxeax >= 6) {
7048                         regs.cp_eax = 6;
7049                         (void) cpuid_insn(NULL, &regs);
7050                         return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7051                 } else {
7052                         return (0);
7053                 }
7054         default:
7055                 return (0);
7056         }
7057 }
7058 
7059 /*
7060  * Check support for Intel ENERGY_PERF_BIAS feature
7061  */
7062 int
7063 cpuid_iepb_supported(struct cpu *cp)
7064 {
7065         struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7066         struct cpuid_regs regs;
7067 
7068         ASSERT(cpuid_checkpass(cp, 1));
7069 
7070         if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7071             !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7072                 return (0);
7073         }
7074 
7075         /*
7076          * Intel ENERGY_PERF_BIAS MSR is indicated by
7077          * capability bit CPUID.6.ECX.3
7078          */
7079         if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7080                 return (0);
7081 
7082         regs.cp_eax = 0x6;
7083         (void) cpuid_insn(NULL, &regs);
7084         return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7085 }
7086 
7087 /*
7088  * Check support for TSC deadline timer
7089  *
7090  * TSC deadline timer provides a superior software programming
7091  * model over local APIC timer that eliminates "time drifts".
7092  * Instead of specifying a relative time, software specifies an
7093  * absolute time as the target at which the processor should
7094  * generate a timer event.
7095  */
7096 int
7097 cpuid_deadline_tsc_supported(void)
7098 {
7099         struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7100         struct cpuid_regs regs;
7101 
7102         ASSERT(cpuid_checkpass(CPU, 1));
7103         ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7104 
7105         switch (cpi->cpi_vendor) {
7106         case X86_VENDOR_Intel:
7107                 if (cpi->cpi_maxeax >= 1) {
7108                         regs.cp_eax = 1;
7109                         (void) cpuid_insn(NULL, &regs);
7110                         return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7111                 } else {
7112                         return (0);
7113                 }
7114         default:
7115                 return (0);
7116         }
7117 }
7118 
7119 #if defined(__amd64) && !defined(__xpv)
7120 /*
7121  * Patch in versions of bcopy for high performance Intel Nhm processors
7122  * and later...
7123  */
7124 void
7125 patch_memops(uint_t vendor)
7126 {
7127         size_t cnt, i;
7128         caddr_t to, from;
7129 
7130         if ((vendor == X86_VENDOR_Intel) &&
7131             is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7132                 cnt = &bcopy_patch_end - &bcopy_patch_start;
7133                 to = &bcopy_ck_size;
7134                 from = &bcopy_patch_start;
7135                 for (i = 0; i < cnt; i++) {
7136                         *to++ = *from++;
7137                 }
7138         }
7139 }
7140 #endif  /* __amd64 && !__xpv */
7141 
7142 /*
7143  * We're being asked to tell the system how many bits are required to represent
7144  * the various thread and strand IDs. While it's tempting to derive this based
7145  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7146  * correct. Instead, this needs to be based on the number of bits that the APIC
7147  * allows for these different configurations. We only update these to a larger
7148  * value if we find one.
7149  */
7150 void
7151 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7152 {
7153         struct cpuid_info *cpi;
7154 
7155         VERIFY(cpuid_checkpass(CPU, 1));
7156         cpi = cpu->cpu_m.mcpu_cpi;
7157 
7158         if (cpi->cpi_ncore_bits > *core_nbits) {
7159                 *core_nbits = cpi->cpi_ncore_bits;
7160         }
7161 
7162         if (cpi->cpi_nthread_bits > *strand_nbits) {
7163                 *strand_nbits = cpi->cpi_nthread_bits;
7164         }
7165 }
7166 
7167 void
7168 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7169 {
7170         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7171         struct cpuid_regs cp;
7172 
7173         /*
7174          * Reread the CPUID portions that we need for various security
7175          * information.
7176          */
7177         if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7178                 /*
7179                  * Check if we now have leaf 7 available to us.
7180                  */
7181                 if (cpi->cpi_maxeax < 7) {
7182                         bzero(&cp, sizeof (cp));
7183                         cp.cp_eax = 0;
7184                         cpi->cpi_maxeax = __cpuid_insn(&cp);
7185                         if (cpi->cpi_maxeax < 7)
7186                                 return;
7187                 }
7188 
7189                 bzero(&cp, sizeof (cp));
7190                 cp.cp_eax = 7;
7191                 cp.cp_ecx = 0;
7192                 (void) __cpuid_insn(&cp);
7193                 cpi->cpi_std[7] = cp;
7194         } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7195                 /* No xcpuid support */
7196                 if (cpi->cpi_family < 5 ||
7197                     (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7198                         return;
7199 
7200                 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7201                         bzero(&cp, sizeof (cp));
7202                         cp.cp_eax = CPUID_LEAF_EXT_0;
7203                         cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7204                         if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7205                                 return;
7206                         }
7207                 }
7208 
7209                 bzero(&cp, sizeof (cp));
7210                 cp.cp_eax = CPUID_LEAF_EXT_8;
7211                 (void) __cpuid_insn(&cp);
7212                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7213                 cpi->cpi_extd[8] = cp;
7214         } else {
7215                 /*
7216                  * Nothing to do here. Return an empty set which has already
7217                  * been zeroed for us.
7218                  */
7219                 return;
7220         }
7221         cpuid_scan_security(cpu, fset);
7222 }
7223 
7224 /* ARGSUSED */
7225 static int
7226 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7227 {
7228         uchar_t *fset;
7229         boolean_t first_pass = (boolean_t)arg1;
7230 
7231         fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7232         if (first_pass && CPU->cpu_id != 0)
7233                 return (0);
7234         if (!first_pass && CPU->cpu_id == 0)
7235                 return (0);
7236         cpuid_pass_ucode(CPU, fset);
7237 
7238         return (0);
7239 }
7240 
7241 /*
7242  * After a microcode update where the version has changed, then we need to
7243  * rescan CPUID. To do this we check every CPU to make sure that they have the
7244  * same microcode. Then we perform a cross call to all such CPUs. It's the
7245  * caller's job to make sure that no one else can end up doing an update while
7246  * this is going on.
7247  *
7248  * We assume that the system is microcode capable if we're called.
7249  */
7250 void
7251 cpuid_post_ucodeadm(void)
7252 {
7253         uint32_t rev;
7254         int i;
7255         struct cpu *cpu;
7256         cpuset_t cpuset;
7257         void *argdata;
7258         uchar_t *f0;
7259 
7260         argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7261 
7262         mutex_enter(&cpu_lock);
7263         cpu = cpu_get(0);
7264         rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7265         CPUSET_ONLY(cpuset, 0);
7266         for (i = 1; i < max_ncpus; i++) {
7267                 if ((cpu = cpu_get(i)) == NULL)
7268                         continue;
7269 
7270                 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7271                         panic("post microcode update CPU %d has differing "
7272                             "microcode revision (%u) from CPU 0 (%u)",
7273                             i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7274                 }
7275                 CPUSET_ADD(cpuset, i);
7276         }
7277 
7278         /*
7279          * We do the cross calls in two passes. The first pass is only for the
7280          * boot CPU. The second pass is for all of the other CPUs. This allows
7281          * the boot CPU to go through and change behavior related to patching or
7282          * whether or not Enhanced IBRS needs to be enabled and then allow all
7283          * other CPUs to follow suit.
7284          */
7285         kpreempt_disable();
7286         xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7287             cpuid_post_ucodeadm_xc);
7288         xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7289             cpuid_post_ucodeadm_xc);
7290         kpreempt_enable();
7291 
7292         /*
7293          * OK, now look at each CPU and see if their feature sets are equal.
7294          */
7295         f0 = argdata;
7296         for (i = 1; i < max_ncpus; i++) {
7297                 uchar_t *fset;
7298                 if (!CPU_IN_SET(cpuset, i))
7299                         continue;
7300 
7301                 fset = (uchar_t *)((uintptr_t)argdata +
7302                     sizeof (x86_featureset) * i);
7303 
7304                 if (!compare_x86_featureset(f0, fset)) {
7305                         panic("Post microcode update CPU %d has "
7306                             "differing security feature (%p) set from CPU 0 "
7307                             "(%p), not appending to feature set", i,
7308                             (void *)fset, (void *)f0);
7309                 }
7310         }
7311 
7312         mutex_exit(&cpu_lock);
7313 
7314         for (i = 0; i < NUM_X86_FEATURES; i++) {
7315                 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7316                     x86_feature_names[i]);
7317                 if (is_x86_feature(f0, i)) {
7318                         add_x86_feature(x86_featureset, i);
7319                 }
7320         }
7321         kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7322 }