11787 Kernel needs to be built with retpolines
11788 Kernel needs to generally use RSB stuffing
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: John Levon <john.levon@joyent.com>

          --- old/usr/src/uts/i86pc/os/cpuid.c
          +++ new/usr/src/uts/i86pc/os/cpuid.c
↓ open down ↓ 889 lines elided ↑ open up ↑
 890  890   * support available.
 891  891   *
 892  892   * The final form is through a series of accessor functions that all have the
 893  893   * form cpuid_get*. This is used by a number of different subsystems in the
 894  894   * kernel to determine more detailed information about what we're running on,
 895  895   * topology information, etc. Some of these subsystems include processor groups
 896  896   * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
 897  897   * microcode, and performance monitoring. These functions all ASSERT that the
 898  898   * CPU they're being called on has reached a certain cpuid pass. If the passes
 899  899   * are rearranged, then this needs to be adjusted.
      900 + *
      901 + * -----------------------------------------------
      902 + * Speculative Execution CPU Side Channel Security
      903 + * -----------------------------------------------
      904 + *
      905 + * With the advent of the Spectre and Meltdown attacks, which exploit
      906 + * speculative execution in the CPU to create side channels, there are a number
      907 + * of different attacks and corresponding issues that the operating system needs
      908 + * to mitigate. The following is a common, though not exhaustive, list of the
      909 + * issues that we know about and have either already addressed or still need to
      910 + * address in the system:
      911 + *
      912 + *   - Spectre v1
      913 + *   - Spectre v2
      914 + *   - Meltdown (Spectre v3)
      915 + *   - Rogue Register Read (Spectre v3a)
      916 + *   - Speculative Store Bypass (Spectre v4)
      917 + *   - ret2spec, SpectreRSB
      918 + *   - L1 Terminal Fault (L1TF)
      919 + *   - Microarchitectural Data Sampling (MDS)
      920 + *
      921 + * Each of these requires different sets of mitigations and has different attack
      922 + * surfaces. For the most part, this discussion is about protecting the kernel
      923 + * from non-kernel executing environments such as user processes and hardware
      924 + * virtual machines. Unfortunately, there are a number of user vs. user
      925 + * scenarios that exist with these. The rest of this section will describe the
      926 + * overall approach that the system has taken to address these as well as their
      927 + * shortcomings. Note that not all of the above have been handled today.
      928 + *
      929 + * SPECTRE FAMILY (Spectre v2, ret2spec, SpectreRSB)
      930 + *
      931 + * The second variant of the spectre attack focuses on performing branch target
      932 + * injection. This generally impacts indirect call instructions in the system.
      933 + * There are three different ways to mitigate this issue that are commonly
      934 + * described today:
      935 + *
      936 + *  1. Using Indirect Branch Restricted Speculation (IBRS).
      937 + *  2. Using Retpolines and RSB Stuffing
      938 + *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
      939 + *
      940 + * IBRS uses a feature added to microcode to restrict speculation, among other
      941 + * things. This form of mitigation has not been used as it has been generally
      942 + * seen as too expensive and requires reactivation upon various transitions in
      943 + * the system.
      944 + *
      945 + * As a less impactful alternative to IBRS, retpolines were developed by
      946 + * Google. These basically require one to replace indirect calls with a specific
      947 + * trampoline that will cause speculation to fail and break the attack.
      948 + * Retpolines require compiler support. We always build with retpolines in the
      949 + * external thunk mode. This means that a traditional indirect call is replaced
      950 + * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
      951 + * of this is that all indirect function calls are performed through a register.
      952 + *
      953 + * We have to use a common external location for the thunk rather than inlining
      954 + * it into the callsite, so that we have a single place to patch these functions.
      955 + * As it turns out, we actually have three different forms of retpolines that
      956 + * exist in the system:
      957 + *
      958 + *  1. A full retpoline
      959 + *  2. An AMD-specific optimized retpoline
      960 + *  3. A no-op version
      961 + *
      962 + * The first one is used in the general case. The second one is used if we can
      963 + * determine that we're on an AMD system and we can successfully toggle the
      964 + * lfence serializing MSR that exists on the platform. Basically with this
      965 + * present, an lfence is sufficient and we don't need to do anywhere near as
      966 + * complicated a dance to successfully use retpolines.
      967 + *
      968 + * The third form described above is the most curious. It turns out that
      969 + * retpolines work by relying on how speculation is performed on a 'ret'
      970 + * instruction. Intel has continued to optimize this
      971 + * process (which is partly why we need to have return stack buffer stuffing,
      972 + * but more on that in a bit) and in processors starting with Cascade Lake
      973 + * on the server side, it's dangerous to rely on retpolines. Instead, a new
      974 + * mechanism has been introduced called Enhanced IBRS (EIBRS).
      975 + *
      976 + * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
      977 + * physical core. However, if this is the case, we don't want to use retpolines
      978 + * any more. Therefore if EIBRS is present, we end up turning each retpoline
      979 + * function (called a thunk) into a jmp instruction. This means that we're still
      980 + * paying the cost of an extra jump to the external thunk, but it gives us
      981 + * flexibility and the ability to have a single kernel image that works across a
      982 + * wide variety of systems and hardware features.
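
To make the three forms concrete, the following is a minimal sketch of what the
%rax flavor of each thunk amounts to, written as file-scope GNU C asm. The
_sketch names are illustrative assumptions; the kernel's actual thunks
(__x86_indirect_thunk_<reg> and friends) are provided elsewhere:

	/*
	 * Sketch only: the general retpoline traps speculation in the
	 * pause/lfence loop, while the architectural path overwrites the saved
	 * return address with the real branch target and returns through it.
	 */
	__asm__(
	    "retpoline_rax_gen_sketch:\n"
	    "	call	2f\n"		/* RSB entry now points at the trap */
	    "1:	pause\n"
	    "	lfence\n"
	    "	jmp	1b\n"		/* contain any speculation here */
	    "2:	movq	%rax, (%rsp)\n"	/* replace return address with target */
	    "	ret\n"
	    "retpoline_rax_amd_sketch:\n"	/* lfence is dispatch serializing */
	    "	lfence\n"
	    "	jmp	*%rax\n"
	    "retpoline_rax_jmp_sketch:\n"	/* EIBRS or mitigation disabled */
	    "	jmp	*%rax\n");
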
      983 + *
      984 + * Unfortunately, this alone is insufficient. First, Skylake systems have
      985 + * additional speculation for the Return Stack Buffer (RSB), which is used to
      986 + * return from call instructions and which retpolines take advantage of. However,
      987 + * this problem is not just limited to Skylake and is actually more pernicious.
      988 + * The SpectreRSB paper introduces several more problems that can arise when
      989 + * dealing with this. The RSB can be poisoned just like the indirect branch
      990 + * predictor. This means that one needs to clear the RSB when transitioning
      991 + * between two different privilege domains. Some examples include:
      992 + *
      993 + *  - Switching between two different user processes
      994 + *  - Going between user land and the kernel
      995 + *  - Returning to the kernel from a hardware virtual machine
      996 + *
      997 + * Mitigating this involves combining a couple of different things. The first is
      998 + * SMEP (supervisor mode execution protection) which was introduced in Ivy
      999 + * Bridge. When an RSB entry refers to a user address and we're executing in the
     1000 + * kernel, speculation through it will be stopped when SMEP is enabled. This
     1001 + * protects against a number of the different cases that we would normally be
     1002 + * worried about such as when we enter the kernel from user land.
     1003 + *
     1004 + * To protect against additional manipulation of the RSB from other contexts,
     1005 + * such as a non-root VMX context attacking the kernel, we first look to enhanced
     1006 + * IBRS. When EIBRS is present and enabled, then there is nothing else that we
     1007 + * need to do to protect the kernel at this time.
     1008 + *
     1009 + * On CPUs without EIBRS we need to manually overwrite the contents of the
     1010 + * return stack buffer. We do this through the x86_rsb_stuff() function.
     1011 + * Currently this is employed on context switch. The x86_rsb_stuff() function is
     1012 + * disabled when enhanced IBRS is present because Intel claims on such systems
     1013 + * it will be ineffective. Stuffing the RSB in context switch helps prevent user
     1014 + * to user attacks via the RSB.
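
As a rough sketch of what stuffing the RSB involves (assumptions: a 32-entry
RSB and illustrative _sketch naming; the kernel's real x86_rsb_stuff() is
assembly and is also the patch target of cpuid_patch_rsb() further down):

	/*
	 * Sketch only: each call deposits the address of a benign speculation
	 * trap into the RSB; after 16 iterations of two calls each, the 32
	 * return addresses pushed onto the real stack are discarded.
	 */
	__asm__(
	    "x86_rsb_stuff_sketch:\n"
	    "	movl	$16, %ecx\n"
	    "1:	call	3f\n"
	    "2:	pause\n"		/* RSB entry for the first call */
	    "	lfence\n"
	    "	jmp	2b\n"
	    "3:	call	5f\n"
	    "4:	pause\n"		/* RSB entry for the second call */
	    "	lfence\n"
	    "	jmp	4b\n"
	    "5:	decl	%ecx\n"
	    "	jnz	1b\n"
	    "	addq	$256, %rsp\n"	/* drop the 32 x 8-byte return addresses */
	    "	ret\n");
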
     1015 + *
     1016 + * If SMEP is not present, then we would have to stuff the RSB every time we
     1017 + * transitioned from user mode to the kernel, which isn't very practical right
     1018 + * now.
     1019 + *
     1020 + * To fully protect against user to user and vmx to vmx attacks from these classes
     1021 + * of issues, we would also need to allow them to opt into performing an Indirect
     1022 + * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
     1023 + *
     1024 + * By default, the system will enable RSB stuffing and the required variant of
     1025 + * retpolines and store that information in the x86_spectrev2_mitigation value.
     1026 + * This will be evaluated after a microcode update as well, though it is
     1027 + * expected that microcode updates will not take away features. This may mean
     1028 + * that a late loaded microcode may not end up in the optimal configuration
     1029 + * (though this should be rare).
     1030 + *
     1031 + * Currently we do not build kmdb with retpolines or perform any additional side
     1032 + * channel security mitigations for it. One complication with kmdb is that it
     1033 + * requires its own retpoline thunks and it would need to adjust itself based on
     1034 + * what the kernel does. The threat model of kmdb is more limited and therefore
     1035 + * it may make more sense to investigate using prediction barriers as the whole
     1036 + * system is only executing a single instruction at a time while in kmdb.
     1037 + *
     1038 + * SPECTRE FAMILY (v1, v4)
     1039 + *
     1040 + * The v1 and v4 variants of spectre are not currently mitigated in the
     1041 + * system and require other classes of changes to occur in the code.
     1042 + *
     1043 + * MELTDOWN
     1044 + *
     1045 + * Meltdown, or spectre v3, allowed a user process to read any data in their
     1046 + * address space regardless of whether or not the page tables in question
     1047 + * granted the user the ability to read them. The solution to meltdown
     1048 + * is kernel page table isolation. In this world, there are two page tables that
     1049 + * are used for a process, one in user land and one in the kernel. To implement
     1050 + * this we use per-CPU page tables and switch between the user and kernel
     1051 + * variants when entering and exiting the kernel.  For more information about
     1052 + * this process and how the trampolines work, please see the big theory
     1053 + * statements and additional comments in:
     1054 + *
     1055 + *  - uts/i86pc/ml/kpti_trampolines.s
     1056 + *  - uts/i86pc/vm/hat_i86.c
     1057 + *
     1058 + * While Meltdown only impacted Intel systems and there are also Intel systems
     1059 + * that have Meltdown fixed (called Rogue Data Cache Load), we always have
     1060 + * kernel page table isolation enabled. While this may at first seem weird, an
     1061 + * important thing to remember is that you can't speculatively read an address
     1062 + * if it's never in your page table at all. Having user processes without kernel
     1063 + * pages present provides us with an important layer of defense in the kernel
     1064 + * against any other side channel attacks that exist and have yet to be
     1065 + * discovered. As such, kernel page table isolation (KPTI) is always enabled by
     1066 + * default, no matter the x86 system.
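
As a heavily simplified sketch of the mechanism (assumption: a bare %cr3 write
stands in for the real trampolines, which also juggle stacks, PCIDs, and more;
see the files listed above for the real thing):

	/*
	 * Sketch only: entering the kernel switches to the kernel's copy of the
	 * page tables; returning to user land switches back to the user copy,
	 * which simply has no kernel mappings left to speculate against.
	 */
	static inline void
	kpti_load_cr3_sketch(uint64_t cr3)
	{
		__asm__ __volatile__("movq %0, %%cr3" : : "r" (cr3) : "memory");
	}
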
     1067 + *
     1068 + * L1 TERMINAL FAULT
     1069 + *
     1070 + * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
     1071 + * execution uses page table entries. Effectively, it is two different problems.
     1072 + * The first is that it ignores the not present bit in the page table entries
     1073 + * when performing speculative execution. This means that something can
     1074 + * speculatively read the listed physical address if it's present in the L1
     1075 + * cache under certain conditions (see Intel's documentation for the full set of
     1076 + * conditions). Secondly, this can be used to bypass hardware virtualization
     1077 + * extended page tables (EPT) that are part of Intel's hardware virtual machine
     1078 + * instructions.
     1079 + *
     1080 + * For the non-hardware virtualized case, this is relatively easy to deal with.
     1081 + * We must make sure that all non-present page table entries have a physical
     1082 + * address of zero. This means that one could at most speculatively read the
     1083 + * first 4k of physical memory; however, we never use that first page in the
     1084 + * operating system and always skip putting it in our memory map, even if
     1085 + * firmware tells us we can use it. While other systems try to put extra
     1086 + * metadata in the address and reserved bits, which led to this being problematic in those cases, we do not.
     1087 + *
     1088 + * For hardware virtual machines things are more complicated. Because they can
     1089 + * construct their own page tables, it isn't hard for them to perform this
     1090 + * attack against any physical address. The one wrinkle is that this physical
     1091 + * address must be in the L1 data cache. Thus Intel added an MSR that we can use
     1092 + * to flush the L1 data cache. We wrap this up in the function
     1093 + * spec_uarch_flush(). This function is also used in the mitigation of
     1094 + * microarchitectural data sampling (MDS) discussed later on. Kernel based
     1095 + * hypervisors such as KVM or bhyve are responsible for performing this before
     1096 + * entering the guest.
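
A minimal sketch of the MSR-based flush that spec_uarch_flush() points to in
the L1TF case (the _SKETCH constants follow the architectural IA32_FLUSH_CMD
definitions and are assumptions, not necessarily the kernel's own names):

	#define	MSR_IA32_FLUSH_CMD_SKETCH	0x10b
	#define	IA32_FLUSH_CMD_L1D_SKETCH	0x1

	/*
	 * Sketch only: writing the L1D bit of IA32_FLUSH_CMD causes the
	 * processor to write back and invalidate the L1 data cache.
	 */
	static void
	spec_uarch_flush_msr_sketch(void)
	{
		wrmsr(MSR_IA32_FLUSH_CMD_SKETCH, IA32_FLUSH_CMD_L1D_SKETCH);
	}
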
     1097 + *
     1098 + * Because this attack takes place in the L1 cache, there's another wrinkle
     1099 + * here. The L1 cache is shared between all logical CPUs in a core in most Intel
     1100 + * designs. This means that when a thread enters a hardware virtualized context
     1101 + * and flushes the L1 data cache, the other thread on the processor may then go
     1102 + * ahead and put new data in it that can be potentially attacked. While one
     1103 + * solution is to disable SMT on the system, another option that is available is
     1104 + * to use a feature for hardware virtualization called 'SMT exclusion'. This
     1105 + * goes through and makes sure that if an HVM is being scheduled on one thread,
     1106 + * then whatever is on the other thread is from the same hardware virtual machine.
     1107 + * If an interrupt comes in or the guest exits to the broader system, then the
     1108 + * other SMT thread will be kicked out.
     1109 + *
     1110 + * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
     1111 + * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
     1112 + * perform L1TF related mitigations.
     1113 + *
     1114 + * MICROARCHITECTURAL DATA SAMPLING
     1115 + *
     1116 + * Microarchitectural data sampling (MDS) is a combination of four discrete, but
     1117 + * related, vulnerabilities that affect various parts of the CPU's
     1118 + * microarchitectural implementation around load, store, and fill buffers.
     1119 + * Specifically it is made up of the following subcomponents:
     1120 + *
     1121 + *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
     1122 + *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
     1123 + *  3. Microarchitectural Load Port Data Sampling (MLPDS)
     1124 + *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
     1125 + *
     1126 + * To begin addressing these, Intel has introduced another feature in microcode
     1127 + * called MD_CLEAR. This changes the verw instruction so that executing it with a
     1128 + * valid selector operand also flushes the state of the affected parts. The L1TF
     1129 + * L1D flush mechanism is likewise updated when this microcode is present to
     1130 + * flush this state.
     1131 + *
     1132 + * Primarily we need to flush this state whenever we transition from the kernel
     1133 + * to a less privileged context such as user mode or an HVM guest. MSBDS is a
     1134 + * little bit different. Here the structures are statically sized when a logical
     1135 + * CPU is in use and resized when it goes to sleep. Therefore, we also need to
     1136 + * flush the microarchitectural state before the CPU goes idle by calling hlt,
     1137 + * mwait, or another ACPI method. To perform these flushes, we call
     1138 + * x86_md_clear() at all of these transition points.
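
A minimal sketch of the verw-based flush (assumption: any valid, writable
segment selector works once the MD_CLEAR microcode is loaded; the kernel's
actual x86_md_clear() is implemented in assembly):

	/*
	 * Sketch only: with the MD_CLEAR microcode, verw flushes the store,
	 * load port, and fill buffers as a side effect of its legacy behavior.
	 */
	static void
	x86_md_clear_verw_sketch(void)
	{
		uint16_t sel;

		/* Use the current %ds as a convenient valid selector. */
		__asm__ __volatile__("movw %%ds, %0" : "=r" (sel));
		__asm__ __volatile__("verw %0" : : "m" (sel) : "cc", "memory");
	}
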
     1139 + *
     1140 + * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
     1141 + * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
     1142 + * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
     1143 + * a no-op.
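
Putting those pieces together, the selection logic amounts to the following
sketch, reusing the illustrative _sketch helpers above; the need_l1d/need_mds
decisions stand in for the feature tests made in cpuid_update_l1d_flush():

	/*
	 * Sketch only: pick the strongest flush routine that is still required.
	 */
	static void
	spec_uarch_flush_select_sketch(boolean_t need_l1d, boolean_t need_mds)
	{
		if (need_l1d) {
			/* L1TF: the L1D flush; with MD_CLEAR it covers MDS too. */
			spec_uarch_flush = spec_uarch_flush_msr_sketch;
		} else if (need_mds) {
			/* Only MDS needs handling: the verw flush suffices. */
			spec_uarch_flush = x86_md_clear_verw_sketch;
		} else {
			/* RDCL_NO and MDS_NO: nothing to flush. */
			spec_uarch_flush = spec_uarch_flush_noop;
		}
	}
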
     1144 + *
     1145 + * Unfortunately, with this issue hyperthreading rears its ugly head. In
     1146 + * particular, everything we've discussed above is only valid for a single
     1147 + * thread executing on a core. In the case where you have hyper-threading
     1148 + * present, this attack can be performed between threads. The theoretical fix
     1149 + * for this is to ensure that both threads are always in the same security
     1150 + * domain. This means that they are executing in the same ring and mutually
     1151 + * trust each other. Practically speaking, this would mean that a system call
     1152 + * would have to issue an inter-processor interrupt (IPI) to the other thread.
     1153 + * Rather than implement this, we recommend that one disable hyper-threading
     1154 + * through the use of psradm -aS.
     1155 + *
     1156 + * SUMMARY
     1157 + *
     1158 + * The following table attempts to summarize the mitigations for various issues
     1159 + * and what's done in various places:
     1160 + *
     1161 + *  - Spectre v1: Not currently mitigated
     1162 + *  - Spectre v2: Retpolines/RSB Stuffing, or EIBRS if HW supports it
     1163 + *  - Meltdown: Kernel Page Table Isolation
     1164 + *  - Spectre v3a: Updated CPU microcode
     1165 + *  - Spectre v4: Not currently mitigated
     1166 + *  - SpectreRSB: SMEP and RSB Stuffing
     1167 + *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
     1168 + *  - MDS: x86_md_clear, requires microcode, disabling hyper-threading
     1169 + *
     1170 + * The following table indicates the x86 feature set bits that indicate that a
     1171 + * given problem has been solved or a notable feature is present:
     1172 + *
     1173 + *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
     1174 + *  - MDS_NO: All forms of MDS
 900 1175   */
 901 1176  
 902 1177  #include <sys/types.h>
 903 1178  #include <sys/archsystm.h>
 904 1179  #include <sys/x86_archext.h>
 905 1180  #include <sys/kmem.h>
 906 1181  #include <sys/systm.h>
 907 1182  #include <sys/cmn_err.h>
 908 1183  #include <sys/sunddi.h>
 909 1184  #include <sys/sunndi.h>
↓ open down ↓ 4 lines elided ↑ open up ↑
 914 1189  #include <sys/fp.h>
 915 1190  #include <sys/controlregs.h>
 916 1191  #include <sys/bitmap.h>
 917 1192  #include <sys/auxv_386.h>
 918 1193  #include <sys/memnode.h>
 919 1194  #include <sys/pci_cfgspace.h>
 920 1195  #include <sys/comm_page.h>
 921 1196  #include <sys/mach_mmu.h>
 922 1197  #include <sys/ucode.h>
 923 1198  #include <sys/tsc.h>
     1199 +#include <sys/kobj.h>
     1200 +#include <sys/asm_misc.h>
 924 1201  
 925 1202  #ifdef __xpv
 926 1203  #include <sys/hypervisor.h>
 927 1204  #else
 928 1205  #include <sys/ontrap.h>
 929 1206  #endif
 930 1207  
 931 1208  uint_t x86_vendor = X86_VENDOR_IntelClone;
 932 1209  uint_t x86_type = X86_TYPE_OTHER;
 933 1210  uint_t x86_clflush_size = 0;
 934 1211  
 935 1212  #if defined(__xpv)
 936 1213  int x86_use_pcid = 0;
 937 1214  int x86_use_invpcid = 0;
 938 1215  #else
 939 1216  int x86_use_pcid = -1;
 940 1217  int x86_use_invpcid = -1;
 941 1218  #endif
 942 1219  
     1220 +typedef enum {
     1221 +        X86_SPECTREV2_RETPOLINE,
     1222 +        X86_SPECTREV2_RETPOLINE_AMD,
     1223 +        X86_SPECTREV2_ENHANCED_IBRS,
     1224 +        X86_SPECTREV2_DISABLED
     1225 +} x86_spectrev2_mitigation_t;
     1226 +
     1227 +uint_t x86_disable_spectrev2 = 0;
     1228 +static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
     1229 +    X86_SPECTREV2_RETPOLINE;
     1230 +
 943 1231  uint_t pentiumpro_bug4046376;
 944 1232  
 945 1233  uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
 946 1234  
 947 1235  static char *x86_feature_names[NUM_X86_FEATURES] = {
 948 1236          "lgpg",
 949 1237          "tsc",
 950 1238          "msr",
 951 1239          "mtrr",
 952 1240          "pge",
↓ open down ↓ 1210 lines elided ↑ open up ↑
2163 2451   *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2164 2452   *   however, it only flushes the MDS related micro-architectural state on the
 2165 2453   *   current hyperthread; it does not do anything for the twin.
2166 2454   *
2167 2455   * - x86_md_clear which will flush the MDS related state. This is done when we
2168 2456   *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2169 2457   *   (RDCL_NO is set).
2170 2458   */
2171 2459  void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2172 2460  
2173      -void (*x86_md_clear)(void) = x86_md_clear_noop;
2174      -
2175 2461  static void
2176 2462  cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2177 2463  {
2178 2464          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2179 2465  
2180 2466          /*
2181 2467           * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2182 2468           * has been fixed in hardware, it doesn't cover everything related to
2183 2469           * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2184 2470           * need to mitigate this.
2185 2471           */
2186 2472          if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2187 2473              is_x86_feature(featureset, X86FSET_MDS_NO)) {
2188      -                x86_md_clear = x86_md_clear_noop;
2189      -                membar_producer();
2190 2474                  return;
2191 2475          }
2192 2476  
2193 2477          if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2194      -                x86_md_clear = x86_md_clear_verw;
     2478 +                const uint8_t nop = NOP_INSTR;
     2479 +                uint8_t *md = (uint8_t *)x86_md_clear;
     2480 +
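          +                /*
          +                 * x86_md_clear() begins with a single-byte ret and is therefore
          +                 * a no-op by default; patching a nop over that ret exposes the
          +                 * verw-based flush sequence that follows it.
          +                 */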
     2481 +                *md = nop;
2195 2482          }
2196 2483  
2197 2484          membar_producer();
2198 2485  }
2199 2486  
2200 2487  static void
2201 2488  cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2202 2489  {
2203 2490          boolean_t need_l1d, need_mds;
2204 2491          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
↓ open down ↓ 43 lines elided ↑ open up ↑
2248 2535                  spec_uarch_flush = x86_md_clear;
2249 2536          } else {
2250 2537                  /*
2251 2538                   * We have no hardware mitigations available to us.
2252 2539                   */
2253 2540                  spec_uarch_flush = spec_uarch_flush_noop;
2254 2541          }
2255 2542          membar_producer();
2256 2543  }
2257 2544  
     2545 +/*
     2546 + * We default to enabling RSB mitigations; patch a ret over x86_rsb_stuff() to
          + * make it a no-op when enhanced IBRS is in use or the mitigation is disabled.
     2547 + */
2258 2548  static void
     2549 +cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
     2550 +{
     2551 +        const uint8_t ret = RET_INSTR;
     2552 +        uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
     2553 +
     2554 +        switch (mit) {
     2555 +        case X86_SPECTREV2_ENHANCED_IBRS:
     2556 +        case X86_SPECTREV2_DISABLED:
     2557 +                *stuff = ret;
     2558 +                break;
     2559 +        default:
     2560 +                break;
     2561 +        }
     2562 +}
     2563 +
     2564 +static void
     2565 +cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
     2566 +{
     2567 +        const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
     2568 +            "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
     2569 +            "_r14", "_r15" };
     2570 +        const uint_t nthunks = ARRAY_SIZE(thunks);
     2571 +        const char *type;
     2572 +        uint_t i;
     2573 +
     2574 +        if (mit == x86_spectrev2_mitigation)
     2575 +                return;
     2576 +
     2577 +        switch (mit) {
     2578 +        case X86_SPECTREV2_RETPOLINE:
     2579 +                type = "gen";
     2580 +                break;
     2581 +        case X86_SPECTREV2_RETPOLINE_AMD:
     2582 +                type = "amd";
     2583 +                break;
     2584 +        case X86_SPECTREV2_ENHANCED_IBRS:
     2585 +        case X86_SPECTREV2_DISABLED:
     2586 +                type = "jmp";
     2587 +                break;
     2588 +        default:
     2589 +                panic("asked to update retpoline state with unknown state!");
     2590 +        }
     2591 +
     2592 +        for (i = 0; i < nthunks; i++) {
     2593 +                uintptr_t source, dest;
     2594 +                int ssize, dsize;
     2595 +                char sourcebuf[64], destbuf[64];
     2596 +                size_t len;
     2597 +
     2598 +                (void) snprintf(destbuf, sizeof (destbuf),
     2599 +                    "__x86_indirect_thunk%s", thunks[i]);
     2600 +                (void) snprintf(sourcebuf, sizeof (sourcebuf),
     2601 +                    "__x86_indirect_thunk_%s%s", type, thunks[i]);
     2602 +
     2603 +                source = kobj_getelfsym(sourcebuf, NULL, &ssize);
     2604 +                dest = kobj_getelfsym(destbuf, NULL, &dsize);
     2605 +                VERIFY3U(source, !=, 0);
     2606 +                VERIFY3U(dest, !=, 0);
     2607 +                VERIFY3S(dsize, >=, ssize);
     2608 +                bcopy((void *)source, (void *)dest, ssize);
     2609 +        }
     2610 +}
     2611 +
     2612 +static void
     2613 +cpuid_enable_enhanced_ibrs(void)
     2614 +{
     2615 +        uint64_t val;
     2616 +
     2617 +        val = rdmsr(MSR_IA32_SPEC_CTRL);
     2618 +        val |= IA32_SPEC_CTRL_IBRS;
     2619 +        wrmsr(MSR_IA32_SPEC_CTRL, val);
     2620 +}
     2621 +
     2622 +#ifndef __xpv
     2623 +/*
     2624 + * Determine whether or not we can use the AMD optimized retpoline
     2625 + * functionality. We use this when we know we're on an AMD system and we can
     2626 + * successfully verify that lfence is dispatch serializing.
     2627 + */
     2628 +static boolean_t
     2629 +cpuid_use_amd_retpoline(struct cpuid_info *cpi)
     2630 +{
     2631 +        uint64_t val;
     2632 +        on_trap_data_t otd;
     2633 +
     2634 +        if (cpi->cpi_vendor != X86_VENDOR_AMD)
     2635 +                return (B_FALSE);
     2636 +
     2637 +        /*
     2638 +         * We need to determine whether or not lfence is serializing. It always
     2639 +         * is on families 0xf and 0x11. On others, it's controlled by
     2640 +         * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
     2641 +         * crazy old family, don't try and do anything.
     2642 +         */
     2643 +        if (cpi->cpi_family < 0xf)
     2644 +                return (B_FALSE);
     2645 +        if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
     2646 +                return (B_TRUE);
     2647 +
     2648 +        /*
     2649 +         * While it may be tempting to use get_hwenv(), there are no promises
     2650 +         * that a hypervisor will actually declare themselves to be so in a
     2651 +         * friendly way. As such, try to read and set the MSR. If we can then
     2652 +         * read back the value we set (it wasn't just set to zero), then we go
     2653 +         * for it.
     2654 +         */
     2655 +        if (!on_trap(&otd, OT_DATA_ACCESS)) {
     2656 +                val = rdmsr(MSR_AMD_DECODE_CONFIG);
     2657 +                val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
     2658 +                wrmsr(MSR_AMD_DECODE_CONFIG, val);
     2659 +                val = rdmsr(MSR_AMD_DECODE_CONFIG);
     2660 +        } else {
     2661 +                val = 0;
     2662 +        }
     2663 +        no_trap();
     2664 +
     2665 +        if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
     2666 +                return (B_TRUE);
     2667 +        return (B_FALSE);
     2668 +}
     2669 +#endif  /* !__xpv */
     2670 +
     2671 +static void
2259 2672  cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2260 2673  {
2261 2674          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
     2675 +        x86_spectrev2_mitigation_t v2mit;
2262 2676  
2263 2677          if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2264 2678              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2265 2679                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2266 2680                          add_x86_feature(featureset, X86FSET_IBPB);
2267 2681                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2268 2682                          add_x86_feature(featureset, X86FSET_IBRS);
2269 2683                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2270 2684                          add_x86_feature(featureset, X86FSET_STIBP);
2271      -                if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)
2272      -                        add_x86_feature(featureset, X86FSET_IBRS_ALL);
2273 2685                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2274 2686                          add_x86_feature(featureset, X86FSET_STIBP_ALL);
2275      -                if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS)
2276      -                        add_x86_feature(featureset, X86FSET_RSBA);
2277 2687                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2278 2688                          add_x86_feature(featureset, X86FSET_SSBD);
2279 2689                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2280 2690                          add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2281 2691                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2282 2692                          add_x86_feature(featureset, X86FSET_SSB_NO);
     2693 +                /*
     2694 +                 * Don't enable enhanced IBRS unless we're told that we should
     2695 +                 * prefer it and it has the same semantics as Intel. This is
     2696 +                 * split into two bits rather than a single one.
     2697 +                 */
     2698 +                if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
     2699 +                    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
     2700 +                        add_x86_feature(featureset, X86FSET_IBRS_ALL);
     2701 +                }
     2702 +
2283 2703          } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2284 2704              cpi->cpi_maxeax >= 7) {
2285 2705                  struct cpuid_regs *ecp;
2286 2706                  ecp = &cpi->cpi_std[7];
2287 2707  
2288 2708                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2289 2709                          add_x86_feature(featureset, X86FSET_MD_CLEAR);
2290 2710                  }
2291 2711  
2292 2712                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
↓ open down ↓ 49 lines elided ↑ open up ↑
2342 2762                  }
2343 2763  #endif  /* !__xpv */
2344 2764  
2345 2765                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2346 2766                          add_x86_feature(featureset, X86FSET_SSBD);
2347 2767  
2348 2768                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2349 2769                          add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2350 2770          }
2351 2771  
2352      -        if (cpu->cpu_id != 0)
     2772 +        if (cpu->cpu_id != 0) {
     2773 +                if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
     2774 +                        cpuid_enable_enhanced_ibrs();
     2775 +                }
2353 2776                  return;
     2777 +        }
2354 2778  
2355 2779          /*
     2780 +         * Go through and initialize various security mechanisms that we should
     2781 +         * only do on a single CPU. This includes Spectre V2, L1TF, and MDS.
     2782 +         */
     2783 +
     2784 +        /*
     2785 +         * By default we've come in with retpolines enabled. Check whether we
     2786 + * should disable them or enable enhanced IBRS. RSB stuffing is enabled by
     2787 + * default, but disabled when enhanced IBRS is used or mitigations are off.
     2788 +         */
     2789 +        if (x86_disable_spectrev2 != 0) {
     2790 +                v2mit = X86_SPECTREV2_DISABLED;
     2791 +        } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
     2792 +                cpuid_enable_enhanced_ibrs();
     2793 +                v2mit = X86_SPECTREV2_ENHANCED_IBRS;
     2794 +#ifndef __xpv
     2795 +        } else if (cpuid_use_amd_retpoline(cpi)) {
     2796 +                v2mit = X86_SPECTREV2_RETPOLINE_AMD;
     2797 +#endif  /* !__xpv */
     2798 +        } else {
     2799 +                v2mit = X86_SPECTREV2_RETPOLINE;
     2800 +        }
     2801 +
     2802 +        cpuid_patch_retpolines(v2mit);
     2803 +        cpuid_patch_rsb(v2mit);
     2804 +        x86_spectrev2_mitigation = v2mit;
     2805 +        membar_producer();
     2806 +
     2807 +        /*
2356 2808           * We need to determine what changes are required for mitigating L1TF
2357 2809           * and MDS. If the CPU suffers from either of them, then SMT exclusion
2358 2810           * is required.
2359 2811           *
2360 2812           * If any of these are present, then we need to flush u-arch state at
2361 2813           * various points. For MDS, we need to do so whenever we change to a
2362 2814           * lesser privilege level or we are halting the CPU. For L1TF we need to
2363 2815           * flush the L1D cache at VM entry. When we have microcode that handles
2364 2816           * MDS, the L1D flush also clears the other u-arch state that the
2365 2817           * md_clear does.
↓ open down ↓ 4401 lines elided ↑ open up ↑
6767 7219                  return;
6768 7220          }
6769 7221          cpuid_scan_security(cpu, fset);
6770 7222  }
6771 7223  
6772 7224  /* ARGSUSED */
6773 7225  static int
6774 7226  cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
6775 7227  {
6776 7228          uchar_t *fset;
     7229 +        boolean_t first_pass = (boolean_t)arg1;
6777 7230  
6778 7231          fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
     7232 +        if (first_pass && CPU->cpu_id != 0)
     7233 +                return (0);
     7234 +        if (!first_pass && CPU->cpu_id == 0)
     7235 +                return (0);
6779 7236          cpuid_pass_ucode(CPU, fset);
6780 7237  
6781 7238          return (0);
6782 7239  }
6783 7240  
6784 7241  /*
6785 7242   * After a microcode update where the version has changed, then we need to
6786 7243   * rescan CPUID. To do this we check every CPU to make sure that they have the
6787 7244   * same microcode. Then we perform a cross call to all such CPUs. It's the
6788 7245   * caller's job to make sure that no one else can end up doing an update while
↓ open down ↓ 22 lines elided ↑ open up ↑
6811 7268                          continue;
6812 7269  
6813 7270                  if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
6814 7271                          panic("post microcode update CPU %d has differing "
6815 7272                              "microcode revision (%u) from CPU 0 (%u)",
6816 7273                              i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
6817 7274                  }
6818 7275                  CPUSET_ADD(cpuset, i);
6819 7276          }
6820 7277  
     7278 +        /*
     7279 +         * We do the cross calls in two passes. The first pass is only for the
     7280 +         * boot CPU. The second pass is for all of the other CPUs. This allows
     7281 +         * the boot CPU to go through and change behavior related to patching or
     7282 + * whether or not Enhanced IBRS needs to be enabled, and then lets all
     7283 + * other CPUs follow suit.
     7284 +         */
6821 7285          kpreempt_disable();
6822      -        xc_sync((xc_arg_t)argdata, 0, 0, CPUSET2BV(cpuset),
     7286 +        xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
6823 7287              cpuid_post_ucodeadm_xc);
     7288 +        xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
     7289 +            cpuid_post_ucodeadm_xc);
6824 7290          kpreempt_enable();
6825 7291  
6826 7292          /*
6827 7293           * OK, now look at each CPU and see if their feature sets are equal.
6828 7294           */
6829 7295          f0 = argdata;
6830 7296          for (i = 1; i < max_ncpus; i++) {
6831 7297                  uchar_t *fset;
6832 7298                  if (!CPU_IN_SET(cpuset, i))
6833 7299                          continue;
↓ open down ↓ 23 lines elided ↑ open up ↑