11859 need swapgs mitigation
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/i86pc/os/cpuid.c
+++ new/usr/src/uts/i86pc/os/cpuid.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 26 */
27 27 /*
28 28 * Copyright (c) 2010, Intel Corporation.
29 29 * All rights reserved.
30 30 */
31 31 /*
32 32 * Portions Copyright 2009 Advanced Micro Devices, Inc.
33 33 */
34 34 /*
35 35 * Copyright 2019 Joyent, Inc.
36 36 */
37 37
38 38 /*
39 39 * CPU Identification logic
40 40 *
41 41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 42 * with the identification of CPUs, their features, and their topologies. More
43 43 * specifically, this file helps drive the following:
44 44 *
45 45 * 1. Enumeration of features of the processor which are used by the kernel to
46 46 * determine what features to enable or disable. These may be instruction set
47 47 * enhancements or features that we use.
48 48 *
49 49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 50 * will be told about through the auxiliary vector.
51 51 *
52 52 * 3. Understanding the physical topology of the CPU such as the number of
53 53 * caches, how many cores it has, whether or not it supports symmetric
54 54 * multi-processing (SMT), etc.
55 55 *
56 56 * ------------------------
57 57 * CPUID History and Basics
58 58 * ------------------------
59 59 *
60 60 * The cpuid instruction was added by Intel roughly around the time that the
61 61 * original Pentium was introduced. The purpose of cpuid was to tell in a
62 62 * programmatic fashion information about the CPU that previously was guessed
63 63 * at. For example, an important part of cpuid is that we can know what
64 64 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 65 * #UD, so this method allows a program (whether a user program or the kernel)
66 66 * to determine what exists without crashing or getting a SIGILL. Of course,
67 67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 68 * name shows up first in cpuid for a reason.
69 69 *
70 70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71 71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 72 * its own meaning. The different leaves are broken down into different regions:
73 73 *
74 74 * [ 0, 7fffffff ] This region is called the 'basic'
75 75 * region. This region is generally defined
76 76 * by Intel, though some of the original
77 77 * portions have different meanings based
78 78 * on the manufacturer. These days, Intel
79 79 * adds most new features to this region.
80 80 * AMD adds non-Intel compatible
81 81 * information in the third, extended
82 82 * region. Intel uses this for everything
83 83 * including ISA extensions, CPU
84 84 * features, cache information, topology,
85 85 * and more.
86 86 *
87 87 * There is a hole carved out of this
88 88 * region which is reserved for
89 89 * hypervisors.
90 90 *
91 91 * [ 40000000, 4fffffff ] This region, which is found in the
92 92 * middle of the previous region, is
93 93 * explicitly promised to never be used by
94 94 * CPUs. Instead, it is used by hypervisors
95 95 * to communicate information about
96 96 * themselves to the operating system. The
97 97 * values and details are unique for each
98 98 * hypervisor.
99 99 *
100 100 * [ 80000000, ffffffff ] This region is called the 'extended'
101 101 * region. Some of the low leaves mirror
102 102 * parts of the basic leaves. This region
103 103 * has generally been used by AMD for
104 104 * various extensions. For example, AMD-
105 105 * specific information about caches,
106 106 * features, and topology are found in this
107 107 * region.
108 108 *
109 109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110 110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111 111 * the ranges, one of the primary things returned is the maximum valid leaf in
112 112 * that range. This allows for discovery of what range of CPUID is valid.
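As a concrete illustration of the mechanics described above, the following hedged sketch issues cpuid from C and reports the maximum valid basic and extended leaves. It is written as a standalone userland-style program rather than using this file's own internal wrappers; the helper name cpuid_query is made up for this sketch.

    #include <stdint.h>
    #include <stdio.h>

    /* Load the leaf into %eax and the sub-leaf into %ecx, then issue cpuid. */
    static void
    cpuid_query(uint32_t leaf, uint32_t subleaf,
        uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
    {
            __asm__ __volatile__("cpuid"
                : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                : "0" (leaf), "2" (subleaf));
    }

    int
    main(void)
    {
            uint32_t eax, ebx, ecx, edx;

            /* Leaf 0 returns the maximum valid basic leaf in %eax. */
            cpuid_query(0, 0, &eax, &ebx, &ecx, &edx);
            (void) printf("max basic leaf:    0x%x\n", eax);

            /* Leaf 0x80000000 returns the maximum valid extended leaf. */
            cpuid_query(0x80000000, 0, &eax, &ebx, &ecx, &edx);
            (void) printf("max extended leaf: 0x%x\n", eax);
            return (0);
    }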
113 113 *
114 114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 120 * an invalid extended leaf will return the information for leaf 3.
121 121 *
122 122 * Some leaves are broken down into sub-leaves. This means that the value
123 123 * depends on both the leaf asked for in %eax and a secondary register. For
124 124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 125 * additional information. Or when getting topology information in leaf 0xb, the
126 126 * initial value in %ecx changes which level of the topology that you are
127 127 * getting information about.
128 128 *
129 129 * cpuid values are always kept to 32 bits regardless of whether or not the
130 130 * program is in 64-bit mode. When executing in 64-bit mode, the upper
131 131 * 32 bits of the register are always set to zero so that way the values are the
132 132 * same regardless of execution mode.
133 133 *
134 134 * ----------------------
135 135 * Identifying Processors
136 136 * ----------------------
137 137 *
138 138 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 139 * 0. Leaf 0 contains the processor's vendor information. This is done by
140 140 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141 141 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
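To make the vendor-string step concrete, here is a hedged continuation of the hypothetical cpuid_query() sketch from above; the 12 bytes are conventionally assembled in the order %ebx, %edx, %ecx.

    #include <string.h>

    /* Build the NUL-terminated vendor string from leaf 0. */
    static void
    cpuid_vendor(char *buf)     /* buf must hold at least 13 bytes */
    {
            uint32_t eax, ebx, ecx, edx;

            cpuid_query(0, 0, &eax, &ebx, &ecx, &edx);
            (void) memcpy(buf, &ebx, 4);
            (void) memcpy(buf + 4, &edx, 4);
            (void) memcpy(buf + 8, &ecx, 4);
            buf[12] = '\0';     /* e.g. "GenuineIntel" or "AuthenticAMD" */
    }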
142 142 *
143 143 * From there, a processor is identified by a combination of three different
144 144 * values:
145 145 *
146 146 * 1. Family
147 147 * 2. Model
148 148 * 3. Stepping
149 149 *
150 150 * Each vendor uses the family and model to uniquely identify a processor. The
151 151 * way that family and model are changed depends on the vendor. For example,
152 152 * Intel has been using family 0x6 for almost all of their processors since the
153 153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 154 * identify the exact processor. Different models are often used for the client
155 155 * (consumer) and server parts. Even though each processor often has major
156 156 * architectural differences, they still are considered the same family by
157 157 * Intel.
158 158 *
159 159 * On the other hand, each major AMD architecture generally has its own family.
160 160 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
161 161 * the model number is used to help identify specific processors.
162 162 *
163 163 * The stepping is used to refer to a revision of a specific microprocessor. The
164 164 * term comes from equipment used to produce masks that are used to create
165 165 * integrated circuits.
166 166 *
167 167 * The information is present in leaf 1, %eax. In technical documentation you
168 168 * will see the terms extended model and extended family. The original family,
169 169 * model, and stepping fields were each 4 bits wide. If the value in the family
170 170 * field is 0xf, then one must consult the extended family and extended model
171 171 * fields, which take previously reserved bits and allow for a larger number of
172 172 * families and models; the extended family is added to the base value of 0xf.
173 173 *
174 174 * When we process this information, we store the full family, model, and
175 175 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176 176 * cpi_step, respectively. Whenever you are performing comparisons with the
177 177 * family, model, and stepping, you should use these members and not the raw
178 178 * values from cpuid. If you must use the raw values from cpuid directly, you
179 179 * must make sure that you add the extended model and family to the base model
180 180 * and family.
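The following hedged sketch shows the conventional decode of leaf 1 %eax into the full family, model, and stepping. The bit positions are the commonly documented ones and the function name is invented for illustration; the real logic in this file also copes with additional vendor-specific details.

    /*
     * Leaf 1 %eax layout (conventional): stepping in bits 3:0, model in 7:4,
     * family in 11:8, extended model in 19:16, extended family in 27:20.
     */
    static void
    decode_fms(uint32_t eax, uint32_t *family, uint32_t *model, uint32_t *stepping)
    {
            uint32_t fam = (eax >> 8) & 0xf;
            uint32_t mod = (eax >> 4) & 0xf;

            *stepping = eax & 0xf;

            /* Intel also extends the model for family 0x6; AMD only for 0xf. */
            if (fam == 0x6 || fam == 0xf)
                    mod |= ((eax >> 16) & 0xf) << 4;

            /* The extended family is added to the base family of 0xf. */
            if (fam == 0xf)
                    fam += (eax >> 20) & 0xff;

            *family = fam;
            *model = mod;
    }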
181 181 *
182 182 * In general, we do not use information about the family, model, and stepping
183 183 * to determine whether or not a feature is present; that is generally driven by
184 184 * specific leaves. However, when something we care about on the processor is
185 185 * not considered 'architectural' meaning that it is specific to a set of
186 186 * processors and not promised in the architecture model to be consistent from
187 187 * generation to generation, then we will fall back on this information. The
188 188 * most common cases where this comes up is when we have to workaround errata in
189 189 * the processor, are dealing with processor-specific features such as CPU
190 190 * performance counters, or we want to provide additional information for things
191 191 * such as fault management.
192 192 *
193 193 * While processors also do have a brand string, which is the name that people
194 194 * are familiar with when buying the processor, they are not meant for
195 195 * programmatic consumption. That is what the family, model, and stepping are
196 196 * for.
197 197 *
198 198 * ------------
199 199 * CPUID Passes
200 200 * ------------
201 201 *
202 202 * As part of performing feature detection, we break this into several different
203 203 * passes. The passes are as follows:
204 204 *
205 205 * Pass 0 This is a primordial pass done in locore.s to deal with
206 206 * Cyrix CPUs that don't support cpuid. The reality is that
207 207 * we likely don't run on them any more, but there is still
208 208 * logic for handling them.
209 209 *
210 210 * Pass 1 This is the primary pass and is responsible for doing a
211 211 * large number of different things:
212 212 *
213 213 * 1. Determining which vendor manufactured the CPU and
214 214 * the family, model, and stepping information.
215 215 *
216 216 * 2. Gathering a large number of feature flags to
217 217 * determine which features the CPU supports and which
218 218 * indicate things that we need to do other work in the OS
219 219 * to enable. Features detected this way are added to the
220 220 * x86_featureset which can be queried to
221 221 * determine what we should do. This includes processing
222 222 * all of the basic and extended CPU features that we care
223 223 * about.
224 224 *
225 225 * 3. Determining the CPU's topology. This includes
226 226 * information about how many cores and threads are present
227 227 * in the package. It also is responsible for figuring out
228 228 * which logical CPUs are potentially part of the same core
229 229 * and what other resources they might share. For more
230 230 * information see the 'Topology' section.
231 231 *
232 232 * 4. Determining the set of CPU security-specific features
233 233 * that we need to worry about and determining the
234 234 * appropriate set of workarounds.
235 235 *
236 236 * Pass 1 on the boot CPU occurs before KMDB is started.
237 237 *
238 238 * Pass 2 The second pass is done after startup(). Here, we check
239 239 * other miscellaneous features. Most of this is gathering
240 240 * additional basic and extended features that we'll use in
241 241 * later passes or for debugging support.
242 242 *
243 243 * Pass 3 The third pass occurs after the kernel memory allocator
244 244 * has been fully initialized. This gathers information
245 245 * where we might need dynamic memory available for our
246 246 * uses. This includes several varying width leaves that
247 247 * have cache information and the processor's brand string.
248 248 *
249 249 * Pass 4 The fourth and final normal pass is performed after the
250 250 * kernel has brought most everything online. This is
251 251 * invoked from post_startup(). In this pass, we go through
252 252 * the set of features that we have enabled and turn that
253 253 * into the hardware auxiliary vector features that
254 254 * userland receives. This is used by userland, primarily
255 255 * by the run-time link-editor (RTLD), though userland
256 256 * software could also refer to it directly.
257 257 *
258 258 * Microcode After a microcode update, we do a selective rescan of
259 259 * the cpuid leaves to determine what features have
260 260 * changed. Microcode updates can provide more details
261 261 * about security related features to deal with issues like
262 262 * Spectre and L1TF. On occasion, vendors have violated
263 263 * their contract and removed bits. However, we don't try
264 264 * to detect that because that puts us in a situation that
265 265 * we really can't deal with. As such, the only things we
266 266 * rescan are security related features today. See
267 267 * cpuid_pass_ucode().
268 268 *
269 269 * All of the passes (except pass 0) are run on all CPUs. However, for the most
270 270 * part we only care about what the boot CPU says about this information and use
271 271 * the other CPUs as a rough guide to sanity check that we have the same feature
272 272 * set.
273 273 *
274 274 * We do not support running multiple logical CPUs with different, let alone
275 275 * disjoint, feature sets.
276 276 *
277 277 * ------------------
278 278 * Processor Topology
279 279 * ------------------
280 280 *
281 281 * One of the important things that we need to do is to understand the topology
282 282 * of the underlying processor. When we say topology in this case, we're trying
283 283 * to understand the relationship between the logical CPUs that the operating
284 284 * system sees and the underlying physical layout. Different logical CPUs may
285 285 * share different resources which can have important consequences for the
286 286 * performance of the system. For example, they may share caches, execution
287 287 * units, and more.
288 288 *
289 289 * The topology of the processor changes from generation to generation and
290 290 * vendor to vendor. Along with that, different vendors use different
291 291 * terminology, and the operating system itself uses occasionally overlapping
292 292 * terminology. It's important to understand what this topology looks like so
293 293 * one can understand the different things that we try to calculate and
294 294 * determine.
295 295 *
296 296 * To get started, let's talk about a little bit of terminology that we've used
297 297 * so far, is used throughout this file, and is fairly generic across multiple
298 298 * vendors:
299 299 *
300 300 * CPU
301 301 * A central processing unit (CPU) refers to a logical and/or virtual
302 302 * entity that the operating system can execute instructions on. The
303 303 * underlying resources for this CPU may be shared between multiple
304 304 * entities; however, to the operating system it is a discrete unit.
305 305 *
306 306 * PROCESSOR and PACKAGE
307 307 *
308 308 * Generally, when we use the term 'processor' on its own, we are referring
309 309 * to the physical entity that one buys and plugs into a board. However,
310 310 * because processor has been overloaded and one might see it used to mean
311 311 * multiple different levels, we will instead use the term 'package' for
312 312 * the rest of this file. The term package comes from the electrical
313 313 * engineering side and refers to the physical entity that encloses the
314 314 * electronics inside. Strictly speaking the package can contain more than
315 315 * just the CPU, for example, on many processors it may also have what's
316 316 * called an 'integrated graphical processing unit (GPU)'. Because the
317 317 * package can encapsulate multiple units, it is the largest physical unit
318 318 * that we refer to.
319 319 *
320 320 * SOCKET
321 321 *
322 322 * A socket refers to a unit on a system board (generally the motherboard)
323 323 * that can receive a package. A single package, or processor, is plugged
324 324 * into a single socket. A system may have multiple sockets. Often times,
325 325 * the term socket is used interchangeably with package and refers to the
326 326 * electrical component that has been plugged in, and not the receptacle itself.
327 327 *
328 328 * CORE
329 329 *
330 330 * A core refers to the physical instantiation of a CPU, generally, with a
331 331 * full set of hardware resources available to it. A package may contain
332 332 * multiple cores inside of it or it may just have a single one. A
333 333 * processor with more than one core is often referred to as 'multi-core'.
334 334 * In illumos, we will use the feature X86FSET_CMP to refer to a system
335 335 * that has 'multi-core' processors.
336 336 *
337 337 * A core may expose a single logical CPU to the operating system, or it
338 338 * may expose multiple CPUs, which we call threads, defined below.
339 339 *
340 340 * Some resources may still be shared by cores in the same package. For
341 341 * example, many processors will share the level 3 cache between cores.
342 342 * Some AMD generations share hardware resources between cores. For more
343 343 * information on that see the section 'AMD Topology'.
344 344 *
345 345 * THREAD and STRAND
346 346 *
347 347 * In this file, generally a thread refers to a hardware resource and not
348 348 * the operating system's logical abstraction. A thread is always exposed
349 349 * as an independent logical CPU to the operating system. A thread belongs
350 350 * to a specific core. A core may have more than one thread. When that is
351 351 * the case, the threads that are part of the same core are often referred
352 352 * to as 'siblings'.
353 353 *
354 354 * When multiple threads exist, this is generally referred to as
355 355 * simultaneous multi-threading (SMT). When Intel introduced this in their
356 356 * processors they called it hyper-threading (HT). When multiple threads
357 357 * are active in a core, they split the resources of the core. For example,
358 358 * two threads may share the same set of hardware execution units.
359 359 *
360 360 * The operating system often uses the term 'strand' to refer to a thread.
361 361 * This helps disambiguate it from the software concept.
362 362 *
363 363 * CHIP
364 364 *
365 365 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
366 366 * base meaning, it is used to refer to a single integrated circuit, which
367 367 * may or may not be the only thing in the package. In illumos, when you
368 368 * see the term 'chip' it is almost always referring to the same thing as
369 369 * the 'package'. However, many vendors may use chip to refer to one of
370 370 * many integrated circuits that have been placed in the package. As an
371 371 * example, see the subsequent definition.
372 372 *
373 373 * To try and keep things consistent, we will only use chip when referring
374 374 * to the entire integrated circuit package, with the exception of the
375 375 * definition of multi-chip module (because it is in the name) and use the
376 376 * term 'die' when we want the more general, potential sub-component
377 377 * definition.
378 378 *
379 379 * DIE
380 380 *
381 381 * A die refers to an integrated circuit. Inside of the package there may
382 382 * be a single die or multiple dies. This is sometimes called a 'chip' in
383 383 * vendor's parlance, but in this file, we use the term die to refer to a
384 384 * subcomponent.
385 385 *
386 386 * MULTI-CHIP MODULE
387 387 *
388 388 * A multi-chip module (MCM) refers to putting multiple distinct chips that
389 389 * are connected together in the same package. When a multi-chip design is
390 390 * used, generally each chip is manufactured independently and then joined
391 391 * together in the package. For example, on AMD's Zen microarchitecture
392 392 * (family 0x17), the package contains several dies (the second meaning of
393 393 * chip from above) that are connected together.
394 394 *
395 395 * CACHE
396 396 *
397 397 * A cache is a part of the processor that maintains copies of recently
398 398 * accessed memory. Caches are split into levels and then into types.
399 399 * Commonly there are one to three levels, called level one, two, and
400 400 * three. The lower the level, the smaller it is, the closer it is to the
401 401 * execution units of the CPU, and the faster it is to access. The layout
402 402 * and design of the cache come in many different flavors, consult other
403 403 * resources for a discussion of those.
404 404 *
405 405 * Caches are generally split into two types, the instruction and data
406 406 * cache. The caches contain what their names suggest, the instruction
407 407 * cache has executable program text, while the data cache has all other
408 408 * memory that the processor accesses. As of this writing, data is kept
409 409 * coherent between all of the caches on x86, so if one modifies program
410 410 * text before it is executed, that will be in the data cache, and the
411 411 * instruction cache will be synchronized with that change when the
412 412 * processor actually executes those instructions. This coherency also
413 413 * covers the fact that data could show up in multiple caches.
414 414 *
415 415 * Generally, the lowest level caches are specific to a core. However, the
416 416 * last level cache is shared between some number of cores. The number of
417 417 * CPUs sharing this last level cache is important. This has implications
418 418 * for the choices that the scheduler makes, as accessing memory that might
419 419 * be in a remote cache after thread migration can be quite expensive.
420 420 *
421 421 * Sometimes, the word cache is abbreviated with a '$', because in US
422 422 * English the word cache is pronounced the same as cash. So L1D$ refers to
423 423 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
424 424 * in the rest of this theory statement for clarity.
425 425 *
426 426 * MEMORY CONTROLLER
427 427 *
428 428 * The memory controller is a component that provides access to DRAM. Each
429 429 * memory controller can access a set number of DRAM channels. Each channel
430 430 * can have a number of DIMMs (sticks of memory) associated with it. A
431 431 * given package may have more than one memory controller. The association
432 432 * of the memory controller to a group of cores is important as it is
433 433 * cheaper to access memory on the controller that you are associated with.
434 434 *
435 435 * NUMA
436 436 *
437 437 * NUMA, or non-uniform memory access, describes a way that systems are
438 438 * built. On x86, any processor core can address all of the memory in the
439 439 * system. However, when using multiple sockets or possibly within a
440 440 * multi-chip module, some of that memory is physically closer and some of
441 441 * it is further. Memory that is further away is more expensive to access.
442 442 * Consider the following image of multiple sockets with memory:
443 443 *
444 444 * +--------+ +--------+
445 445 * | DIMM A | +----------+ +----------+ | DIMM D |
446 446 * +--------+-+ | | | | +-+------+-+
447 447 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
448 448 * +--------+-+ | | | | +-+------+-+
449 449 * | DIMM C | +----------+ +----------+ | DIMM F |
450 450 * +--------+ +--------+
451 451 *
452 452 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
453 453 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
454 454 * access DIMMs A-C and more expensive to access D-F as it has to go
455 455 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
456 456 * D-F are cheaper than A-C. While the socket form is the most common, when
457 457 * using multi-chip modules, this can also sometimes occur. For another
458 458 * example of this that's more involved, see the AMD topology section.
459 459 *
460 460 *
461 461 * Intel Topology
462 462 * --------------
463 463 *
464 464 * Most Intel processors since Nehalem (as of this writing the current gen
465 465 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
466 466 * the package is a single monolithic die. MCMs currently aren't used. Most
467 467 * parts have three levels of caches, with the L3 cache being shared between
468 468 * all of the cores on the package. The L1/L2 cache is generally specific to
469 469 * an individual core. The following image shows at a simplified level what
470 470 * this looks like. The memory controller is commonly part of something called
471 471 * the 'Uncore', which used to be separate physical chips that were not a part of
472 472 * the package, but are now part of the same chip.
473 473 *
474 474 * +-----------------------------------------------------------------------+
475 475 * | Package |
476 476 * | +-------------------+ +-------------------+ +-------------------+ |
477 477 * | | Core | | Core | | Core | |
478 478 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
479 479 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
480 480 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
481 481 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
482 482 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
483 483 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
484 484 * | | +--------------+ | | +--------------+ | | +--------------+ | |
485 485 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
486 486 * | | +--------------+ | | +--------------+ | | +--------------+ | |
487 487 * | +-------------------+ +-------------------+ +-------------------+ |
488 488 * | +-------------------------------------------------------------------+ |
489 489 * | | Shared L3 Cache | |
490 490 * | +-------------------------------------------------------------------+ |
491 491 * | +-------------------------------------------------------------------+ |
492 492 * | | Memory Controller | |
493 493 * | +-------------------------------------------------------------------+ |
494 494 * +-----------------------------------------------------------------------+
495 495 *
496 496 * A side effect of this current architecture is that what we care about from a
497 497 * scheduling and topology perspective is simplified. In general we care about
498 498 * understanding which logical CPUs are part of the same core and socket.
499 499 *
500 500 * To determine the relationship between threads and cores, Intel initially used
501 501 * the identifier in the advanced programmable interrupt controller (APIC). They
502 502 * also added cpuid leaf 4 to give additional information about the number of
503 503 * threads and CPUs in the processor. With the addition of x2apic (which
504 504 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
505 505 * additional cpuid topology leaf 0xB was added.
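As a hedged sketch of how leaf 0xB is typically walked (reusing the hypothetical cpuid_query() helper from earlier): each sub-leaf passed in %ecx describes one topology level, with the shift count in %eax[4:0], the level type in %ecx[15:8] (1 for SMT, 2 for core), and the x2APIC ID in %edx. The actual code in this file is more involved than this.

    static void
    topo_shifts(uint32_t *smt_shift, uint32_t *core_shift)
    {
            uint32_t eax, ebx, ecx, edx, level, type;

            *smt_shift = *core_shift = 0;
            for (level = 0; ; level++) {
                    cpuid_query(0xb, level, &eax, &ebx, &ecx, &edx);
                    type = (ecx >> 8) & 0xff;
                    if (type == 0)          /* no further levels */
                            break;
                    if (type == 1)          /* SMT (thread) level */
                            *smt_shift = eax & 0x1f;
                    else if (type == 2)     /* core level */
                            *core_shift = eax & 0x1f;
            }
    }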
506 506 *
507 507 * AMD Topology
508 508 * ------------
509 509 *
510 510 * When discussing AMD topology, we want to break this into three distinct
511 511 * generations of topology. There's the basic topology that has been used in
512 512 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
513 513 * with family 0x15 (Bulldozer), and there's the topology that was introduced
514 514 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
515 515 * talking about.
516 516 *
517 517 * Until the introduction of family 0x17 (Zen), AMD did not implement something
518 518 * that they considered SMT. Whether or not the AMD processors have SMT
519 519 * influences many things including scheduling and reliability, availability,
520 520 * and serviceability (RAS) features.
521 521 *
522 522 * NODE
523 523 *
524 524 * AMD uses the term node to refer to a die that contains a number of cores
525 525 * and I/O resources. Depending on the processor family and model, more
526 526 * than one node can be present in the package. When there is more than one
527 527 * node this indicates a multi-chip module. Usually each node has its own
528 528 * access to memory and I/O devices. This is important and generally
529 529 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
530 530 * result, we track this relationship in the operating system.
531 531 *
532 532 * In processors with an L3 cache, the L3 cache is generally shared across
533 533 * the entire node, though the way this is carved up varies from generation
534 534 * to generation.
535 535 *
536 536 * BULLDOZER
537 537 *
538 538 * Starting with the Bulldozer family (0x15) and continuing until the
539 539 * introduction of the Zen microarchitecture, AMD introduced the idea of a
540 540 * compute unit. In a compute unit, two traditional cores share a number of
541 541 * hardware resources. Critically, they share the FPU, L1 instruction
542 542 * cache, and the L2 cache. Several compute units were then combined inside
543 543 * of a single node. Because the integer execution units, L1 data cache,
544 544 * and some other resources were not shared between the cores, AMD never
545 545 * considered this to be SMT.
546 546 *
547 547 * ZEN
548 548 *
549 549 * The Zen family (0x17) uses a multi-chip module (MCM) design; the module
550 550 * is called Zeppelin. These modules are similar to the idea of nodes used
551 551 * previously. Each of these nodes has two DRAM channels which all of the
552 552 * cores in the node can access uniformly. These nodes are linked together
553 553 * in the package, creating a NUMA environment.
554 554 *
555 555 * The Zeppelin die itself contains two different 'core complexes'. Each
556 556 * core complex consists of four cores which each have two threads, for a
557 557 * total of 8 logical CPUs per complex. Unlike other generations,
558 558 * where all the logical CPUs in a given node share the L3 cache, here each
559 559 * core complex has its own shared L3 cache.
560 560 *
561 561 * A further thing that we need to consider is that in some configurations,
562 562 * particularly with the Threadripper line of processors, not every die
563 563 * actually has its memory controllers wired up to actual memory channels.
564 564 * This means that some cores have memory attached to them and others
565 565 * don't.
566 566 *
567 567 * To put Zen in perspective, consider the following images:
568 568 *
569 569 * +--------------------------------------------------------+
570 570 * | Core Complex |
571 571 * | +-------------------+ +-------------------+ +---+ |
572 572 * | | Core +----+ | | Core +----+ | | | |
573 573 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
574 574 * | | | Thread | +----+ | | | Thread | +----+ | | | |
575 575 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
576 576 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
577 577 * | | +--------+ +--+ | | +--------+ +--+ | | | |
578 578 * | +-------------------+ +-------------------+ | C | |
579 579 * | +-------------------+ +-------------------+ | a | |
580 580 * | | Core +----+ | | Core +----+ | | c | |
581 581 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
582 582 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
583 583 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
584 584 * | | | Thread | |L1| | | | Thread | |L1| | | | |
585 585 * | | +--------+ +--+ | | +--------+ +--+ | | | |
586 586 * | +-------------------+ +-------------------+ +---+ |
587 587 * | |
588 588 * +--------------------------------------------------------+
589 589 *
590 590 * This first image represents a single Zen core complex that consists of four
591 591 * cores.
592 592 *
593 593 *
594 594 * +--------------------------------------------------------+
595 595 * | Zeppelin Die |
596 596 * | +--------------------------------------------------+ |
597 597 * | | I/O Units (PCIe, SATA, USB, etc.) | |
598 598 * | +--------------------------------------------------+ |
599 599 * | HH |
600 600 * | +-----------+ HH +-----------+ |
601 601 * | | | HH | | |
602 602 * | | Core |==========| Core | |
603 603 * | | Complex |==========| Complex | |
604 604 * | | | HH | | |
605 605 * | +-----------+ HH +-----------+ |
606 606 * | HH |
607 607 * | +--------------------------------------------------+ |
608 608 * | | Memory Controller | |
609 609 * | +--------------------------------------------------+ |
610 610 * | |
611 611 * +--------------------------------------------------------+
612 612 *
613 613 * This image represents a single Zeppelin Die. Note how both core complexes are
614 614 * connected to the same memory controller and I/O units. While each core
615 615 * complex has its own L3 cache as seen in the first image, they both have
616 616 * uniform access to memory.
617 617 *
618 618 *
619 619 * PP PP
620 620 * PP PP
621 621 * +----------PP---------------------PP---------+
622 622 * | PP PP |
623 623 * | +-----------+ +-----------+ |
624 624 * | | | | | |
625 625 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
626 626 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
627 627 * | | | | | |
628 628 * | +-----------+ooo ...+-----------+ |
629 629 * | HH ooo ... HH |
630 630 * | HH oo.. HH |
631 631 * | HH ..oo HH |
632 632 * | HH ... ooo HH |
633 633 * | +-----------+... ooo+-----------+ |
634 634 * | | | | | |
635 635 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
636 636 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
637 637 * | | | | | |
638 638 * | +-----------+ +-----------+ |
639 639 * | PP PP |
640 640 * +----------PP---------------------PP---------+
641 641 * PP PP
642 642 * PP PP
643 643 *
644 644 * This image represents a single Zen package. In this example, it has four
645 645 * Zeppelin dies, though some configurations only have a single one. In this
646 646 * example, each die is directly connected to the next. Also, each die is
647 647 * represented as being connected to memory by the 'M' character and connected
648 648 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
649 649 * die is made up of two core complexes, we have multiple different NUMA
650 650 * domains that we care about for these systems.
651 651 *
652 652 * CPUID LEAVES
653 653 *
654 654 * There are a few different CPUID leaves that we can use to try and understand
655 655 * the actual state of the world. As part of the introduction of family 0xf, AMD
656 656 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
657 657 * processors that are in the system. Because families before Zen didn't have
658 658 * SMT, this was always the number of cores that were in the system. However, it
659 659 * should always be thought of as the number of logical threads to be consistent
660 660 * between generations. In addition we also get the size of the APIC ID that is
661 661 * used to represent the number of logical processors. This is important for
662 662 * deriving topology information.
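A hedged sketch of the leaf 0x80000008 %ecx fields described above, based on my reading of the AMD documentation: bits 7:0 hold the number of logical processors minus one, and bits 15:12 hold the size (in bits) of the APIC ID space used for topology. The function name is illustrative only.

    static void
    decode_amd_leaf_80000008(uint32_t ecx, uint32_t *nlogical, uint32_t *apic_bits)
    {
            *nlogical = (ecx & 0xff) + 1;       /* NC field, stored minus one */
            *apic_bits = (ecx >> 12) & 0xf;     /* ApicIdCoreIdSize field */
    }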
663 663 *
664 664 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
665 665 * bit between Bulldozer and later families, but it is quite useful in
666 666 * determining the topology information. Because this information has changed
667 667 * across family generations, it's worth calling out what these mean
668 668 * explicitly. The registers have the following meanings:
669 669 *
670 670 * %eax The APIC ID. The entire register is defined to have a 32-bit
671 671 * APIC ID, even though on systems without x2apic support, it will
672 672 * be limited to 8 bits.
673 673 *
674 674 * %ebx On Bulldozer-era systems this contains information about the
675 675 * number of cores that are in a compute unit (cores that share
676 676 * resources). It also contains a per-package compute unit ID that
677 677 * identifies which compute unit the logical CPU is a part of.
678 678 *
679 679 * On Zen-era systems this instead contains the number of threads
680 680 * per core and the ID of the core that the logical CPU is a part
681 681 * of. Note, this ID is unique only to the package, it is not
682 682 * globally unique across the entire system.
683 683 *
684 684 * %ecx This contains the number of nodes that exist in the package. It
685 685 * also contains an ID that identifies which node the logical CPU
686 686 * is a part of.
687 687 *
688 688 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
689 689 * cache layout to determine which logical CPUs are sharing which caches.
690 690 *
691 691 * illumos Topology
692 692 * ----------------
693 693 *
694 694 * Based on the above we synthesize the information into several different
695 695 * variables that we store in the 'struct cpuid_info'. We'll go into the details
696 696 * of what each member is supposed to represent and their uniqueness. In
697 697 * general, there are two levels of uniqueness that we care about. We care about
698 698 * an ID that is globally unique. That means that it will be unique across all
699 699 * entities in the system. For example, the default logical CPU ID is globally
700 700 * unique. On the other hand, there is some information that we only care about
701 701 * being unique within the context of a single package / socket. Here are the
702 702 * variables that we keep track of and their meaning.
703 703 *
704 704 * Several of the values that are asking for an identifier, with the exception
705 705 * of cpi_apicid, are allowed to be synthetic.
706 706 *
707 707 *
708 708 * cpi_apicid
709 709 *
710 710 * This is the value of the CPU's APIC id. This should be the full 32-bit
711 711 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
712 712 * APIC ID. This value is globally unique between all logical CPUs across
713 713 * all packages. This is usually required by the APIC.
714 714 *
715 715 * cpi_chipid
716 716 *
717 717 * This value indicates the ID of the package that the logical CPU is a
718 718 * part of. This value is allowed to be synthetic. It is usually derived by
719 719 * taking the CPU's APIC ID and determining how many bits are used to
720 720 * represent CPU cores in the package. All logical CPUs that are part of
721 721 * the same package must have the same value.
722 722 *
723 723 * cpi_coreid
724 724 *
725 725 * This represents the ID of a CPU core. Two logical CPUs should only have
726 726 * the same cpi_coreid value if they are part of the same core. These
727 727 * values may be synthetic. On systems that support SMT, this value is
728 728 * usually derived from the APIC ID, otherwise it is often synthetic and
729 729 * just set to the value of the cpu_id in the cpu_t.
730 730 *
731 731 * cpi_pkgcoreid
732 732 *
733 733 * This is similar to the cpi_coreid in that logical CPUs that are part of
734 734 * the same core should have the same ID. The main difference is that these
735 735 * values are only required to be unique to a given socket.
736 736 *
737 737 * cpi_clogid
738 738 *
739 739 * This represents the logical ID of a logical CPU. This value should be
740 740 * unique within a given socket for each logical CPU. This is allowed to be
741 741 * synthetic, though it is usually based off of the CPU's apic ID. The
742 742 * broader system expects that logical CPUs that are part of the same
743 743 * core have contiguous numbers. For example, if there were two threads per
744 744 * core, then the IDs divided by two should be the same, the first modulo two
745 745 * should be zero, and the second should be one. For example, IDs 4 and 5
746 746 * indicate two logical CPUs that are part of the same core. But IDs 5 and
747 747 * 6 represent two logical CPUs that are part of different cores.
748 748 *
749 749 * While it is common for the cpi_coreid and the cpi_clogid to be derived
750 750 * from the same source, strictly speaking, they don't have to be and the
751 751 * two values should be considered logically independent. One should not
752 752 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
753 753 * some kind of relationship. While this is tempting, we've seen cases on
754 754 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
755 755 *
756 756 * cpi_ncpu_per_chip
757 757 *
758 758 * This value indicates the total number of logical CPUs that exist in the
759 759 * physical package. Critically, this is not the number of logical CPUs
760 760 * that exist for just the single core.
761 761 *
762 762 * This value should be the same for all logical CPUs in the same package.
763 763 *
764 764 * cpi_ncore_per_chip
765 765 *
766 766 * This value indicates the total number of physical CPU cores that exist
767 767 * in the package. The system compares this value with cpi_ncpu_per_chip to
768 768 * determine if simultaneous multi-threading (SMT) is enabled. When
769 769 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
770 770 * the X86FSET_HTT feature is not set. If this value is greater than one,
771 771 * then we consider the processor to have the feature X86FSET_CMP, to
772 772 * indicate that there is support for more than one core.
773 773 *
774 774 * This value should be the same for all logical CPUs in the same package.
775 775 *
776 776 * cpi_procnodes_per_pkg
777 777 *
778 778 * This value indicates the number of 'nodes' that exist in the package.
779 779 * When processors are actually a multi-chip module, this represents the
780 780 * number of such modules that exist in the package. Currently, on Intel
781 781 * based systems this member is always set to 1.
782 782 *
783 783 * This value should be the same for all logical CPUs in the same package.
784 784 *
785 785 * cpi_procnodeid
786 786 *
787 787 * This value indicates the ID of the node that the logical CPU is a part
788 788 * of. All logical CPUs that are in the same node must have the same value
789 789 * here. This value must be unique across all of the packages in the
790 790 * system. On Intel based systems, this is currently set to the value in
791 791 * cpi_chipid because there is only one node.
792 792 *
793 793 * cpi_cores_per_compunit
794 794 *
795 795 * This value indicates the number of cores that are part of a compute
796 796 * unit. See the AMD topology section for this. This member only has real
797 797 * meaning currently for AMD Bulldozer family processors. For all other
798 798 * processors, this should currently be set to 1.
799 799 *
800 800 * cpi_compunitid
801 801 *
802 802 * This indicates the compute unit that the logical CPU belongs to. For
803 803 * processors without AMD Bulldozer-style compute units this should be set
804 804 * to the value of cpi_coreid.
805 805 *
806 806 * cpi_ncpu_shr_last_cache
807 807 *
808 808 * This indicates the number of logical CPUs that are sharing the same last
809 809 * level cache. This value should be the same for all CPUs that are sharing
810 810 * that cache. The last cache refers to the cache that is closest to memory
811 811 * and furthest away from the CPU.
812 812 *
813 813 * cpi_last_lvl_cacheid
814 814 *
815 815 * This indicates the ID of the last cache that the logical CPU uses. This
816 816 * cache is often shared between multiple logical CPUs and is the cache
817 817 * that is closest to memory and furthest away from the CPU. This value
818 818 * should be the same for a group of logical CPUs only if they actually
819 819 * share the same last level cache. IDs should not overlap between
820 820 * packages.
821 821 *
822 822 * cpi_ncore_bits
823 823 *
824 824 * This indicates the number of bits that are required to represent all of
825 825 * the cores in the system. As cores are derived based on their APIC IDs,
826 826 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
827 827 * this value to be larger than the actual number of IDs that are present
828 828 * in the system. This is used to size tables by the CMI framework. It is
829 829 * only filled in for Intel and AMD CPUs.
830 830 *
831 831 * cpi_nthread_bits
832 832 *
833 833 * This indicates the number of bits required to represent all of the IDs
834 834 * that cover the logical CPUs that exist on a given core. It's OK for this
835 835 * value to be larger than the actual number of IDs that are present in the
836 836 * system. This is used to size tables by the CMI framework. It is
837 837 * only filled in for Intel and AMD CPUs.
838 838 *
839 839 * -----------
840 840 * Hypervisors
841 841 * -----------
842 842 *
843 843 * If trying to manage the differences between vendors wasn't bad enough, it can
844 844 * get worse thanks to our friend hardware virtualization. Hypervisors are given
845 845 * the ability to interpose on all cpuid instructions and change them to suit
846 846 * their purposes. In general, this is necessary as the hypervisor wants to be
847 847 * able to present a more uniform set of features or not necessarily give the
848 848 * guest operating system kernel knowledge of all features so it can be
849 849 * more easily migrated between systems.
850 850 *
851 851 * When it comes to trying to determine topology information, this can be a
852 852 * double edged sword. When a hypervisor doesn't actually implement a cpuid
853 853 * leaf, it'll often return all zeros. Because of that, you'll often see various
854 854 * checks scattered about that verify fields are non-zero before we assume we
855 855 * can use them.
856 856 *
857 857 * When it comes to topology information, the hypervisor is often incentivized
858 858 * to lie to you about topology. This is because it doesn't always actually
859 859 * guarantee that topology at all. The topology path we take in the system
860 860 * depends on how the CPU advertises itself. If it advertises itself as an Intel
861 861 * or AMD CPU, then we basically do our normal path. However, when they don't
862 862 * use an actual vendor, then that usually turns into multiple one-core CPUs
863 863 * that we enumerate that are often on different sockets. The actual behavior
864 864 * depends greatly on what the hypervisor actually exposes to us.
865 865 *
866 866 * --------------------
867 867 * Exposing Information
868 868 * --------------------
869 869 *
870 870 * We expose CPUID information in three different forms in the system.
871 871 *
872 872 * The first is through the x86_featureset variable. This is used in conjunction
873 873 * with the is_x86_feature() function. This is queried by x86-specific functions
874 874 * to determine which features are or aren't present in the system and to make
875 875 * decisions based upon them. For example, users of this include everything from
876 876 * parts of the system dedicated to reliability, availability, and
877 877 * serviceability (RAS), to making decisions about how to handle security
878 878 * mitigations, to various x86-specific drivers. General purpose or
879 879 * architecture independent drivers should never be calling this function.
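As a hedged illustration of this first form, x86-specific kernel code typically tests a feature like this. is_x86_feature(), x86_featureset, and the X86FSET_* constants are the names used in this codebase, but the surrounding function and the header choice are illustrative only.

    #include <sys/x86_archext.h>

    static void
    example_feature_check(void)
    {
            /* Only take the AVX-aware path if pass 1 detected the feature. */
            if (is_x86_feature(x86_featureset, X86FSET_AVX)) {
                    /* ... AVX-specific handling ... */
            }
    }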
880 880 *
881 881 * The second means is through the auxiliary vector. The auxiliary vector is a
882 882 * series of tagged data that the kernel passes down to a user program when it
883 883 * begins executing. This information is used to indicate to programs what
884 884 * instruction set extensions are present. For example, information about the
885 885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 886 * since user programs cannot make use of it. However, things like the AVX
887 887 * instruction sets are. Programs use this information to make run-time
888 888 * decisions about what features they should use. As an example, the run-time
889 889 * link-editor (rtld) can relocate different functions depending on the hardware
890 890 * support available.
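A hedged userland sketch of the second form: a program can read the hardware capability words that the aux vector advertises via getisax(3C). The AV_386_AVX bit used here is one of the x86 capability flags; consult the header for the authoritative set.

    #include <sys/auxv.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t hw[2] = { 0, 0 };

            (void) getisax(hw, 2);
            if (hw[0] & AV_386_AVX)
                    (void) printf("AVX is advertised in the aux vector\n");
            return (0);
    }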
891 891 *
892 892 * The final form is through a series of accessor functions that all have the
893 893 * form cpuid_get*. This is used by a number of different subsystems in the
894 894 * kernel to determine more detailed information about what we're running on,
895 895 * topology information, etc. Some of these subsystems include processor groups
896 896 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 897 * microcode, and performance monitoring. These functions all ASSERT that the
898 898 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 899 * are rearranged, then this needs to be adjusted.
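A hedged sketch of the accessor pattern and its pass assertion, written as a kernel-context fragment; cpuid_checkpass() and the per-CPU cpuid_info pointer reflect my understanding of how this file's accessors are structured, but this particular function is invented for illustration.

    uint_t
    cpuid_get_example(cpu_t *cpu)
    {
            /* The caller must not use this before pass 1 has completed. */
            ASSERT(cpuid_checkpass(cpu, 1));
            return (cpu->cpu_m.mcpu_cpi->cpi_family);
    }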
900 900 *
901 901 * -----------------------------------------------
902 902 * Speculative Execution CPU Side Channel Security
903 903 * -----------------------------------------------
904 904 *
905 905 * With the advent of the Spectre and Meltdown attacks which exploit speculative
906 906 * execution in the CPU to create side channels, there have been a number of
907 907 * different attacks and corresponding issues that the operating system needs to
908 908 * mitigate against. The following is a common, but not exhaustive, list of
909 909 * issues that we know about and for which we have done some work, or still need
910 910 * to do more work, in the system to mitigate against:
911 911 *
912 912 * - Spectre v1
913 + * - swapgs (Spectre v1 variant)
913 914 * - Spectre v2
914 915 * - Meltdown (Spectre v3)
915 916 * - Rogue Register Read (Spectre v3a)
916 917 * - Speculative Store Bypass (Spectre v4)
917 918 * - ret2spec, SpectreRSB
918 919 * - L1 Terminal Fault (L1TF)
919 920 * - Microarchitectural Data Sampling (MDS)
920 921 *
921 922 * Each of these requires different sets of mitigations and has different attack
922 923 * surfaces. For the most part, this discussion is about protecting the kernel
923 924 * from non-kernel executing environments such as user processes and hardware
924 925 * virtual machines. Unfortunately, there are a number of user vs. user
925 926 * scenarios that exist with these. The rest of this section will describe the
926 927 * overall approach that the system has taken to address these as well as their
927 928 * shortcomings. Unfortunately, not all of the above have been handled today.
928 929 *
929 - * SPECTRE FAMILY (Spectre v2, ret2spec, SpectreRSB)
930 + * SPECTRE v2, ret2spec, SpectreRSB
930 931 *
931 932 * The second variant of the spectre attack focuses on performing branch target
932 933 * injection. This generally impacts indirect call instructions in the system.
933 934 * There are three different ways to mitigate this issue that are commonly
934 935 * described today:
935 936 *
936 937 * 1. Using Indirect Branch Restricted Speculation (IBRS).
937 938 * 2. Using Retpolines and RSB Stuffing
938 939 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
939 940 *
940 941 * IBRS uses a feature added to microcode to restrict speculation, among other
941 942 * things. This form of mitigation has not been used as it has been generally
942 943 * seen as too expensive and requires reactivation upon various transitions in
943 944 * the system.
944 945 *
945 946 * As a less impactful alternative to IBRS, retpolines were developed by
946 947 * Google. These basically require one to replace indirect calls with a specific
947 948 * trampoline that will cause speculation to fail and break the attack.
948 949 * Retpolines require compiler support. We always build with retpolines in the
949 950 * external thunk mode. This means that a traditional indirect call is replaced
950 951 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
951 952 * of this is that all indirect function calls are performed through a register.
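For reference, a conceptual sketch of what a full retpoline thunk looks like (not the literal source in the tree): an indirect call through %r11 becomes a direct call to the thunk, and the thunk bounces the real target through a ret whose speculation is trapped by the pause/lfence loop.

    __x86_indirect_thunk_r11:
            call    2f              /* push the address of the capture loop */
    1:      pause
            lfence
            jmp     1b              /* speculation is trapped here */
    2:      movq    %r11, (%rsp)    /* overwrite the return address ... */
            ret                     /* ... so the ret goes to the real target */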
952 953 *
953 954 * We have to use a common external location of the thunk and not inline it into
954 955 * the callsite so that way we can have a single place to patch these functions.
955 956 * As it turns out, we actually have three different forms of retpolines that
956 957 * exist in the system:
957 958 *
958 959 * 1. A full retpoline
959 960 * 2. An AMD-specific optimized retpoline
960 961 * 3. A no-op version
961 962 *
962 963 * The first one is used in the general case. The second one is used if we can
963 964 * determine that we're on an AMD system and we can successfully toggle the
964 965 * lfence serializing MSR that exists on the platform. Basically with this
965 966 * present, an lfence is sufficient and we don't need to do anywhere near as
966 967 * complicated a dance to successfully use retpolines.
967 968 *
968 969 * The third form described above is the most curious. It turns out that the way
969 970 * that retpolines are implemented is that they rely on how speculation is
970 971 * performed on a 'ret' instruction. Intel has continued to optimize this
971 972 * process (which is partly why we need to have return stack buffer stuffing,
972 973 * but more on that in a bit) and in processors starting with Cascade Lake
973 974 * on the server side, it's dangerous to rely on retpolines. Instead, a new
974 975 * mechanism has been introduced called Enhanced IBRS (EIBRS).
975 976 *
976 977 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
977 978 * physical core. However, if this is the case, we don't want to use retpolines
978 979 * any more. Therefore if EIBRS is present, we end up turning each retpoline
979 980 * function (called a thunk) into a jmp instruction. This means that we're still
980 981 * paying the cost of an extra jump to the external thunk, but it gives us
981 982 * flexibility and the ability to have a single kernel image that works across a
982 983 * wide variety of systems and hardware features.
983 984 *
984 985 * Unfortunately, this alone is insufficient. First, Skylake systems have
985 986 * additional speculation for the Return Stack Buffer (RSB) which is used to
986 987 * return from call instructions which retpolines take advantage of. However,
987 988 * this problem is not just limited to Skylake and is actually more pernicious.
988 989 * The SpectreRSB paper introduces several more problems that can arise with
989 990 * dealing with this. The RSB can be poisoned just like the indirect branch
990 991 * predictor. This means that one needs to clear the RSB when transitioning
991 992 * between two different privilege domains. Some examples include:
992 993 *
993 994 * - Switching between two different user processes
994 995 * - Going between user land and the kernel
995 996 * - Returning to the kernel from a hardware virtual machine
996 997 *
997 998 * Mitigating this involves combining a couple of different things. The first is
998 999 * SMEP (supervisor mode execution protection) which was introduced in Ivy
999 1000 * Bridge. When an RSB entry refers to a user address and we're executing in the
1000 1001 * kernel, speculation through it will be stopped when SMEP is enabled. This
1001 1002 * protects against a number of the different cases that we would normally be
1002 1003 * worried about such as when we enter the kernel from user land.
1003 1004 *
1004 1005 * To prevent against additional manipulation of the RSB from other contexts
1005 1006 * such as a non-root VMX context attacking the kernel we first look to enhanced
1006 1007 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1007 1008 * need to do to protect the kernel at this time.
1008 1009 *
1009 1010 * On CPUs without EIBRS we need to manually overwrite the contents of the
1010 1011 * return stack buffer. We do this through the x86_rsb_stuff() function.
1011 1012 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1012 1013 * disabled when enhanced IBRS is present because Intel claims on such systems
1013 1014 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1014 1015 * to user attacks via the RSB.
1015 1016 *
1016 1017 * If SMEP is not present, then we would have to stuff the RSB every time we
1017 1018 * transitioned from user mode to the kernel, which isn't very practical right
1018 1019 * now.
1019 1020 *
1020 1021 * To fully protect against user to user and vmx to vmx attacks from these classes of
1021 1022 * issues, we would also need to allow them to opt into performing an Indirect
1022 1023 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1023 1024 *
1024 1025 * By default, the system will enable RSB stuffing and the required variant of
1025 1026 * retpolines and store that information in the x86_spectrev2_mitigation value.
1026 1027 * This will be evaluated after a microcode update as well, though it is
1027 1028 * expected that microcode updates will not take away features. This may mean
1028 1029 * that a late loaded microcode may not end up in the optimal configuration
1029 1030 * (though this should be rare).
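 *
 * As a rough sketch of how that selection works (illustrative pseudocode
 * only; "the lfence MSR could be toggled" stands in for the AMD probe
 * described earlier and "mitigation"/"featureset" are placeholders, not
 * symbols in this file):
 *
 *	if (x86_disable_spectrev2 != 0)
 *		mitigation = X86_SPECTREV2_DISABLED;
 *	else if (is_x86_feature(featureset, X86FSET_IBRS_ALL))
 *		mitigation = X86_SPECTREV2_ENHANCED_IBRS;
 *	else if (on AMD && the lfence MSR could be toggled)
 *		mitigation = X86_SPECTREV2_RETPOLINE_AMD;
 *	else
 *		mitigation = X86_SPECTREV2_RETPOLINE;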
1030 1031 *
1031 1032 * Currently we do not build kmdb with retpolines or perform any additional side
1032 1033 * channel security mitigations for it. One complication with kmdb is that it
1033 1034 * requires its own retpoline thunks and it would need to adjust itself based on
1034 1035 * what the kernel does. The threat model of kmdb is more limited and therefore
1035 1036 * it may make more sense to investigate using prediction barriers as the whole
1036 1037 * system is only executing a single instruction at a time while in kmdb.
1037 1038 *
1038 - * SPECTRE FAMILY (v1, v4)
1039 + * SPECTRE v1, v4
1039 1040 *
1040 1041 * The v1 and v4 variants of spectre are not currently mitigated in the
1041 1042 * system and require other classes of changes to occur in the code.
1042 1043 *
1044 + * SPECTRE v1 (SWAPGS VARIANT)
1045 + *
1046 + * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1047 + * can generally affect any branch-dependent code. The swapgs issue is one
1048 + * variant of this. If we are coming in from userspace, we can have code like
1049 + * this:
1050 + *
1051 + * cmpw $KCS_SEL, REGOFF_CS(%rsp)
1052 + * je 1f
1053 + * movq $0, REGOFF_SAVFP(%rsp)
1054 + * swapgs
1055 + * 1:
1056 + * movq %gs:CPU_THREAD, %rax
1057 + *
1058 + * If an attacker can cause a mis-speculation of the branch here, we could skip
1059 + * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1060 + * load. If subsequent code can act as the usual Spectre cache gadget, this
1061 + * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1062 + * any use of the %gs override.
1063 + *
1064 + * The other case is also an issue: if we're coming into a trap from kernel
1065 + * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1066 + * using it. AMD systems are not vulnerable to this version, as a swapgs is
1067 + * serializing with respect to subsequent uses. But as AMD /does/ need the other
1068 + * case, and the fix is the same in both cases (an lfence at the branch target
1069 + * 1: in this example), we'll just do it unconditionally.
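 *
 * For illustration only (the actual fix lives in the assembly entry points,
 * not in this file), the mitigated sequence simply gains an lfence at the
 * branch target:
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax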
1070 + *
1071 + * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1072 + * harder for user-space to actually set a useful %gsbase value: it's not
1073 + * entirely clear whether that is feasible, but it might still be possible via
1074 + * lwp_setprivate(), so we mitigate anyway.
1075 + *
1043 1076 * MELTDOWN
1044 1077 *
1045 1078 * Meltdown, or spectre v3, allowed a user process to read any data in their
1046 1079 * address space regardless of whether or not the page tables in question
1047 1080 * allowed the user to have the ability to read them. The solution to meltdown
1048 1081 * is kernel page table isolation. In this world, there are two page tables that
1049 1082 * are used for a process, one in user land and one in the kernel. To implement
1050 1083 * this we use per-CPU page tables and switch between the user and kernel
1051 1084 * variants when entering and exiting the kernel. For more information about
1052 1085 * this process and how the trampolines work, please see the big theory
1053 1086 * statements and additional comments in:
1054 1087 *
1055 1088 * - uts/i86pc/ml/kpti_trampolines.s
1056 1089 * - uts/i86pc/vm/hat_i86.c
1057 1090 *
1058 1091 * While Meltdown only impacted Intel systems and there are also Intel systems
1059 1092 * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1060 1093 * kernel page table isolation enabled. While this may at first seem weird, an
1061 1094 * important thing to remember is that you can't speculatively read an address
1062 1095 * if it's never in your page table at all. Having user processes without kernel
1063 1096 * pages present provides us with an important layer of defense in the kernel
1064 1097 * against any other side channel attacks that exist and have yet to be
1065 1098 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1066 1099 * default, no matter the x86 system.
1067 1100 *
1068 1101 * L1 TERMINAL FAULT
1069 1102 *
1070 1103 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1071 1104 * execution uses page table entries. Effectively, it is two different problems.
1072 1105 * The first is that it ignores the not present bit in the page table entries
1073 1106 * when performing speculative execution. This means that something can
1074 1107 * speculatively read the listed physical address if it's present in the L1
1075 1108 * cache under certain conditions (see Intel's documentation for the full set of
1076 1109 * conditions). Secondly, this can be used to bypass hardware virtualization
1077 1110 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1078 1111 * instructions.
1079 1112 *
1080 1113 * For the non-hardware virtualized case, this is relatively easy to deal with.
1081 1114 * We must make sure that all unmapped pages have an address of zero. This means
1082 1115 * that they could read the first 4k of physical memory; however, we never use
1083 1116 * that first page in the operating system and always skip putting it in our
1084 1117 * memory map, even if firmware tells us we can use it in our memory map. While
1085 1118 * other systems try to put extra metadata in the address and reserved bits,
1086 1119 * which led to this being problematic in those cases, we do not.
1087 1120 *
1088 1121 * For hardware virtual machines things are more complicated. Because they can
1089 1122 * construct their own page tables, it isn't hard for them to perform this
1090 1123 * attack against any physical address. The one wrinkle is that this physical
1091 1124 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1092 1125 * to flush the L1 data cache. We wrap this up in the function
1093 1126 * spec_uarch_flush(). This function is also used in the mitigation of
1094 1127 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1095 1128 * hypervisors such as KVM or bhyve are responsible for performing this before
1096 1129 * entering the guest.
1097 1130 *
1098 1131 * Because this attack takes place in the L1 cache, there's another wrinkle
1099 1132 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1100 1133 * designs. This means that when a thread enters a hardware virtualized context
1101 1134 * and flushes the L1 data cache, the other thread on the processor may then go
1102 1135 * ahead and put new data in it that can be potentially attacked. While one
1103 1136 * solution is to disable SMT on the system, another option that is available is
1104 1137 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1105 1138 * goes through and makes sure that if a HVM is being scheduled on one thread,
1106 1139 * then the thing on the other thread is from the same hardware virtual machine.
1107 1140 * If an interrupt comes in or the guest exits to the broader system, then the
1108 1141 * other SMT thread will be kicked out.
1109 1142 *
1110 1143 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1111 1144 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1112 1145 * perform L1TF related mitigations.
1113 1146 *
1114 1147 * MICROARCHITECTURAL DATA SAMPLING
1115 1148 *
1116 1149 * Microarchitectural data sampling (MDS) is a combination of four discrete
1117 1150 * vulnerabilities that are similar issues affecting various parts of the CPU's
1118 1151 * microarchitectural implementation around load, store, and fill buffers.
1119 1152 * Specifically it is made up of the following subcomponents:
1120 1153 *
1121 1154 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1122 1155 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1123 1156 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1124 1157 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1125 1158 *
1126 1159 * To begin addressing these, Intel has introduced another feature in microcode
1127 1160 * called MD_CLEAR. This changes the verw instruction to operate in a different
1128 1161 * way. This allows us to execute the verw instruction in a particular way to
1129 1162 * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1130 1163 * updated when this microcode is present to flush this state.
1131 1164 *
1132 1165 * Primarily we need to flush this state whenever we transition from the kernel
1133 1166 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1134 1167 * little bit different. Here the structures are statically sized when a logical
1135 1168 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1136 1169 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1137 1170 * mwait, or another ACPI method. To perform these flushes, we call
1138 1171 * x86_md_clear() at all of these transition points.
1139 1172 *
1140 1173 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1141 1174 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1142 1175 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1143 1176 * a no-op.
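 *
 * Expressed as a sketch (the real logic in cpuid_update_md_clear() and the
 * related L1D flush setup below also checks that the required microcode
 * support is actually enumerated):
 *
 *	if (is_x86_feature(featureset, X86FSET_RDCL_NO))
 *		spec_uarch_flush = x86_md_clear;
 *	else
 *		spec_uarch_flush = spec_uarch_flush_msr;
 *
 * with x86_md_clear() itself remaining a no-op when X86FSET_MDS_NO is
 * enumerated.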
1144 1177 *
1145 1178 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1146 1179 * particular, everything we've discussed above is only valid for a single
1147 1180 * thread executing on a core. In the case where you have hyper-threading
1148 1181 * present, this attack can be performed between threads. The theoretical fix
1149 1182 * for this is to ensure that both threads are always in the same security
1150 1183 * domain. This means that they are executing in the same ring and mutually
1151 1184 * trust each other. Practically speaking, this would mean that a system call
1152 1185 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1153 1186 * Rather than implement this, we recommend that one disables hyper-threading
1154 1187 * through the use of psradm -aS.
1155 1188 *
1156 1189 * SUMMARY
1157 1190 *
1158 1191 * The following table attempts to summarize the mitigations for various issues
1159 1192 * and what's done in various places:
1160 1193 *
1161 1194 * - Spectre v1: Not currently mitigated
1195 + * - swapgs: lfences after swapgs paths
1162 1196 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1163 1197 * - Meltdown: Kernel Page Table Isolation
1164 1198 * - Spectre v3a: Updated CPU microcode
1165 1199 * - Spectre v4: Not currently mitigated
1166 1200 * - SpectreRSB: SMEP and RSB Stuffing
1167 - * - L1TF: spec_uarch_flush, smt exclusion, requires microcode
1201 + * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1168 1202 * - MDS: x86_md_clear, requires microcode, disabling hyper threading
1169 1203 *
1170 1204 * The following table indicates the x86 feature set bits that indicate that a
1171 1205 * given problem has been solved or a notable feature is present:
1172 1206 *
1173 1207 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1174 1208 * - MDS_NO: All forms of MDS
1175 1209 */
1176 1210
1177 1211 #include <sys/types.h>
1178 1212 #include <sys/archsystm.h>
1179 1213 #include <sys/x86_archext.h>
1180 1214 #include <sys/kmem.h>
1181 1215 #include <sys/systm.h>
1182 1216 #include <sys/cmn_err.h>
1183 1217 #include <sys/sunddi.h>
1184 1218 #include <sys/sunndi.h>
1185 1219 #include <sys/cpuvar.h>
1186 1220 #include <sys/processor.h>
1187 1221 #include <sys/sysmacros.h>
1188 1222 #include <sys/pg.h>
1189 1223 #include <sys/fp.h>
1190 1224 #include <sys/controlregs.h>
1191 1225 #include <sys/bitmap.h>
1192 1226 #include <sys/auxv_386.h>
1193 1227 #include <sys/memnode.h>
1194 1228 #include <sys/pci_cfgspace.h>
1195 1229 #include <sys/comm_page.h>
1196 1230 #include <sys/mach_mmu.h>
1197 1231 #include <sys/ucode.h>
1198 1232 #include <sys/tsc.h>
1199 1233 #include <sys/kobj.h>
1200 1234 #include <sys/asm_misc.h>
1201 1235
1202 1236 #ifdef __xpv
1203 1237 #include <sys/hypervisor.h>
1204 1238 #else
1205 1239 #include <sys/ontrap.h>
1206 1240 #endif
1207 1241
1208 1242 uint_t x86_vendor = X86_VENDOR_IntelClone;
1209 1243 uint_t x86_type = X86_TYPE_OTHER;
1210 1244 uint_t x86_clflush_size = 0;
1211 1245
1212 1246 #if defined(__xpv)
1213 1247 int x86_use_pcid = 0;
1214 1248 int x86_use_invpcid = 0;
1215 1249 #else
1216 1250 int x86_use_pcid = -1;
1217 1251 int x86_use_invpcid = -1;
1218 1252 #endif
1219 1253
1220 1254 typedef enum {
1221 1255 X86_SPECTREV2_RETPOLINE,
1222 1256 X86_SPECTREV2_RETPOLINE_AMD,
1223 1257 X86_SPECTREV2_ENHANCED_IBRS,
1224 1258 X86_SPECTREV2_DISABLED
1225 1259 } x86_spectrev2_mitigation_t;
1226 1260
1227 1261 uint_t x86_disable_spectrev2 = 0;
1228 1262 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1229 1263 X86_SPECTREV2_RETPOLINE;
1230 1264
1231 1265 uint_t pentiumpro_bug4046376;
1232 1266
1233 1267 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1234 1268
1235 1269 static char *x86_feature_names[NUM_X86_FEATURES] = {
1236 1270 "lgpg",
1237 1271 "tsc",
1238 1272 "msr",
1239 1273 "mtrr",
1240 1274 "pge",
1241 1275 "de",
1242 1276 "cmov",
1243 1277 "mmx",
1244 1278 "mca",
1245 1279 "pae",
1246 1280 "cv8",
1247 1281 "pat",
1248 1282 "sep",
1249 1283 "sse",
1250 1284 "sse2",
1251 1285 "htt",
1252 1286 "asysc",
1253 1287 "nx",
1254 1288 "sse3",
1255 1289 "cx16",
1256 1290 "cmp",
1257 1291 "tscp",
1258 1292 "mwait",
1259 1293 "sse4a",
1260 1294 "cpuid",
1261 1295 "ssse3",
1262 1296 "sse4_1",
1263 1297 "sse4_2",
1264 1298 "1gpg",
1265 1299 "clfsh",
1266 1300 "64",
1267 1301 "aes",
1268 1302 "pclmulqdq",
1269 1303 "xsave",
1270 1304 "avx",
1271 1305 "vmx",
1272 1306 "svm",
1273 1307 "topoext",
1274 1308 "f16c",
1275 1309 "rdrand",
1276 1310 "x2apic",
1277 1311 "avx2",
1278 1312 "bmi1",
1279 1313 "bmi2",
1280 1314 "fma",
1281 1315 "smep",
1282 1316 "smap",
1283 1317 "adx",
1284 1318 "rdseed",
1285 1319 "mpx",
1286 1320 "avx512f",
1287 1321 "avx512dq",
1288 1322 "avx512pf",
1289 1323 "avx512er",
1290 1324 "avx512cd",
1291 1325 "avx512bw",
1292 1326 "avx512vl",
1293 1327 "avx512fma",
1294 1328 "avx512vbmi",
1295 1329 "avx512_vpopcntdq",
1296 1330 "avx512_4vnniw",
1297 1331 "avx512_4fmaps",
1298 1332 "xsaveopt",
1299 1333 "xsavec",
1300 1334 "xsaves",
1301 1335 "sha",
1302 1336 "umip",
1303 1337 "pku",
1304 1338 "ospke",
1305 1339 "pcid",
1306 1340 "invpcid",
1307 1341 "ibrs",
1308 1342 "ibpb",
1309 1343 "stibp",
1310 1344 "ssbd",
1311 1345 "ssbd_virt",
1312 1346 "rdcl_no",
1313 1347 "ibrs_all",
1314 1348 "rsba",
1315 1349 "ssb_no",
1316 1350 "stibp_all",
1317 1351 "flush_cmd",
1318 1352 "l1d_vmentry_no",
1319 1353 "fsgsbase",
1320 1354 "clflushopt",
1321 1355 "clwb",
1322 1356 "monitorx",
1323 1357 "clzero",
1324 1358 "xop",
1325 1359 "fma4",
1326 1360 "tbm",
1327 1361 "avx512_vnni",
1328 1362 "amd_pcec",
1329 1363 "mb_clear",
1330 1364 "mds_no",
1331 1365 "core_thermal",
1332 1366 "pkg_thermal"
1333 1367 };
1334 1368
1335 1369 boolean_t
1336 1370 is_x86_feature(void *featureset, uint_t feature)
1337 1371 {
1338 1372 ASSERT(feature < NUM_X86_FEATURES);
1339 1373 return (BT_TEST((ulong_t *)featureset, feature));
1340 1374 }
1341 1375
1342 1376 void
1343 1377 add_x86_feature(void *featureset, uint_t feature)
1344 1378 {
1345 1379 ASSERT(feature < NUM_X86_FEATURES);
1346 1380 BT_SET((ulong_t *)featureset, feature);
1347 1381 }
1348 1382
1349 1383 void
1350 1384 remove_x86_feature(void *featureset, uint_t feature)
1351 1385 {
1352 1386 ASSERT(feature < NUM_X86_FEATURES);
1353 1387 BT_CLEAR((ulong_t *)featureset, feature);
1354 1388 }
1355 1389
1356 1390 boolean_t
1357 1391 compare_x86_featureset(void *setA, void *setB)
1358 1392 {
1359 1393 /*
1360 1394 * We assume that the unused bits of the bitmap are always zero.
1361 1395 */
1362 1396 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1363 1397 return (B_TRUE);
1364 1398 } else {
1365 1399 return (B_FALSE);
1366 1400 }
1367 1401 }
1368 1402
1369 1403 void
1370 1404 print_x86_featureset(void *featureset)
1371 1405 {
1372 1406 uint_t i;
1373 1407
1374 1408 for (i = 0; i < NUM_X86_FEATURES; i++) {
1375 1409 if (is_x86_feature(featureset, i)) {
1376 1410 cmn_err(CE_CONT, "?x86_feature: %s\n",
1377 1411 x86_feature_names[i]);
1378 1412 }
1379 1413 }
1380 1414 }
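
/*
 * For example, a purely illustrative consumer (this is not a quote of actual
 * kernel code) that wants to know whether it may rely on the PCID and INVPCID
 * instructions could do:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_PCID) &&
 *	    is_x86_feature(x86_featureset, X86FSET_INVPCID)) {
 *		... both instructions were enumerated by cpuid ...
 *	}
 */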
1381 1415
1382 1416 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1383 1417 static size_t xsave_state_size = 0;
1384 1418 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1385 1419 boolean_t xsave_force_disable = B_FALSE;
1386 1420 extern int disable_smap;
1387 1421
1388 1422 /*
1389 1423 * This is set to platform type we are running on.
1390 1424 */
1391 1425 static int platform_type = -1;
1392 1426
1393 1427 #if !defined(__xpv)
1394 1428 /*
1395 1429 * Variable to patch if hypervisor platform detection needs to be
1396 1430 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1397 1431 */
1398 1432 int enable_platform_detection = 1;
1399 1433 #endif
1400 1434
1401 1435 /*
1402 1436 * monitor/mwait info.
1403 1437 *
1404 1438 * size_actual and buf_actual are the real address and size allocated to get
1405 1439 * proper mwait_buf alignment. buf_actual and size_actual should be passed
1406 1440 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use
1407 1441 * processor cache-line alignment, but this is not guaranteed in the future.
1408 1442 */
1409 1443 struct mwait_info {
1410 1444 size_t mon_min; /* min size to avoid missed wakeups */
1411 1445 size_t mon_max; /* size to avoid false wakeups */
1412 1446 size_t size_actual; /* size actually allocated */
1413 1447 void *buf_actual; /* memory actually allocated */
1414 1448 uint32_t support; /* processor support of monitor/mwait */
1415 1449 };
1416 1450
1417 1451 /*
1418 1452 * xsave/xrestor info.
1419 1453 *
1420 1454 * This structure contains HW feature bits and the size of the xsave save area.
1421 1455 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1422 1456 * (xsave_state) to describe the xsave layout. However, at runtime the
1423 1457 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1424 1458 * xsave_state structure simply represents the legacy layout of the beginning
1425 1459 * of the xsave area.
1426 1460 */
1427 1461 struct xsave_info {
1428 1462 uint32_t xsav_hw_features_low; /* Supported HW features */
1429 1463 uint32_t xsav_hw_features_high; /* Supported HW features */
1430 1464 size_t xsav_max_size; /* max size save area for HW features */
1431 1465 size_t ymm_size; /* AVX: size of ymm save area */
1432 1466 size_t ymm_offset; /* AVX: offset for ymm save area */
1433 1467 size_t bndregs_size; /* MPX: size of bndregs save area */
1434 1468 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1435 1469 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1436 1470 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1437 1471 size_t opmask_size; /* AVX512: size of opmask save */
1438 1472 size_t opmask_offset; /* AVX512: offset for opmask save */
1439 1473 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1440 1474 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1441 1475 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1442 1476 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1443 1477 };
1444 1478
1445 1479
1446 1480 /*
1447 1481 * These constants determine how many of the elements of the
1448 1482 * cpuid we cache in the cpuid_info data structure; the
1449 1483 * remaining elements are accessible via the cpuid instruction.
1450 1484 */
1451 1485
1452 1486 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1453 1487 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */
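
/*
 * As an illustration of how the cached leaves are indexed (see struct
 * cpuid_info below), the standard leaf 7 feature words and the AMD topology
 * leaf are reached as, e.g.:
 *
 *	cpi->cpi_std[7].cp_ebx		CPUID.(EAX=7,ECX=0):%ebx
 *	cpi->cpi_extd[0x1e].cp_eax	CPUID.(EAX=0x8000001e):%eax
 *
 * Leaves at or beyond these limits are not cached and must be re-read with
 * the cpuid instruction directly.
 */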
1454 1488
1455 1489 /*
1456 1490 * See the big theory statement for a more detailed explanation of what some of
1457 1491 * these members mean.
1458 1492 */
1459 1493 struct cpuid_info {
1460 1494 uint_t cpi_pass; /* last pass completed */
1461 1495 /*
1462 1496 * standard function information
1463 1497 */
1464 1498 uint_t cpi_maxeax; /* fn 0: %eax */
1465 1499 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1466 1500 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1467 1501
1468 1502 uint_t cpi_family; /* fn 1: extended family */
1469 1503 uint_t cpi_model; /* fn 1: extended model */
1470 1504 uint_t cpi_step; /* fn 1: stepping */
1471 1505 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1472 1506 /* AMD: package/socket # */
1473 1507 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1474 1508 int cpi_clogid; /* fn 1: %ebx: thread # */
1475 1509 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1476 1510 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1477 1511 uint_t cpi_ncache; /* fn 2: number of elements */
1478 1512 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1479 1513 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1480 1514 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1481 1515 /* Intel fn: 4, AMD fn: 8000001d */
1482 1516 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
1483 1517 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1484 1518 /*
1485 1519 * extended function information
1486 1520 */
1487 1521 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1488 1522 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1489 1523 uint8_t cpi_pabits; /* fn 0x80000008: %eax */
1490 1524 uint8_t cpi_vabits; /* fn 0x80000008: %eax */
1491 1525 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1492 1526 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1493 1527
1494 1528 id_t cpi_coreid; /* same coreid => strands share core */
1495 1529 int cpi_pkgcoreid; /* core number within single package */
1496 1530 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1497 1531 /* Intel: fn 4: %eax[31-26] */
1498 1532
1499 1533 /*
1500 1534 * These values represent the number of bits that are required to store
1501 1535 * information about the number of cores and threads.
1502 1536 */
1503 1537 uint_t cpi_ncore_bits;
1504 1538 uint_t cpi_nthread_bits;
1505 1539 /*
1506 1540 * supported feature information
1507 1541 */
1508 1542 uint32_t cpi_support[6];
1509 1543 #define STD_EDX_FEATURES 0
1510 1544 #define AMD_EDX_FEATURES 1
1511 1545 #define TM_EDX_FEATURES 2
1512 1546 #define STD_ECX_FEATURES 3
1513 1547 #define AMD_ECX_FEATURES 4
1514 1548 #define STD_EBX_FEATURES 5
1515 1549 /*
1516 1550 * Synthesized information, where known.
1517 1551 */
1518 1552 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1519 1553 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1520 1554 uint32_t cpi_socket; /* Chip package/socket type */
1521 1555
1522 1556 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1523 1557 uint32_t cpi_apicid;
1524 1558 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1525 1559 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1526 1560 /* Intel: 1 */
1527 1561 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1528 1562 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1529 1563
1530 1564 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */
1531 1565 };
1532 1566
1533 1567
1534 1568 static struct cpuid_info cpuid_info0;
1535 1569
1536 1570 /*
1537 1571 * These bit fields are defined by the Intel Application Note AP-485
1538 1572 * "Intel Processor Identification and the CPUID Instruction"
1539 1573 */
1540 1574 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1541 1575 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1542 1576 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1543 1577 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1544 1578 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1545 1579 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1546 1580
1547 1581 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1548 1582 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1549 1583 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1550 1584 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1551 1585 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1552 1586 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1553 1587 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1554 1588
1555 1589 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1556 1590 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1557 1591 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1558 1592 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1559 1593
1560 1594 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1561 1595 #define CPI_XMAXEAX_MAX 0x80000100
1562 1596 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1563 1597 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1564 1598
1565 1599 /*
1566 1600 * Function 4 (Deterministic Cache Parameters) macros
1567 1601 * Defined by Intel Application Note AP-485
1568 1602 */
1569 1603 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1570 1604 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1571 1605 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1572 1606 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1573 1607 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1574 1608 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1575 1609 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1576 1610
1577 1611 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1578 1612 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1579 1613 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1580 1614
1581 1615 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1582 1616
1583 1617 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
1584 1618
1585 1619
1586 1620 /*
1587 1621 * A couple of shorthand macros to identify "later" P6-family chips
1588 1622 * like the Pentium M and Core. First, the "older" P6-based stuff
1589 1623 * (loosely defined as "pre-Pentium-4"):
1590 1624 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1591 1625 */
1592 1626 #define IS_LEGACY_P6(cpi) ( \
1593 1627 cpi->cpi_family == 6 && \
1594 1628 (cpi->cpi_model == 1 || \
1595 1629 cpi->cpi_model == 3 || \
1596 1630 cpi->cpi_model == 5 || \
1597 1631 cpi->cpi_model == 6 || \
1598 1632 cpi->cpi_model == 7 || \
1599 1633 cpi->cpi_model == 8 || \
1600 1634 cpi->cpi_model == 0xA || \
1601 1635 cpi->cpi_model == 0xB) \
1602 1636 )
1603 1637
1604 1638 /* A "new F6" is everything with family 6 that's not the above */
1605 1639 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1606 1640
1607 1641 /* Extended family/model support */
1608 1642 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1609 1643 cpi->cpi_family >= 0xf)
1610 1644
1611 1645 /*
1612 1646 * Info for monitor/mwait idle loop.
1613 1647 *
1614 1648 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1615 1649 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1616 1650 * 2006.
1617 1651 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1618 1652 * Documentation Updates" #33633, Rev 2.05, December 2006.
1619 1653 */
1620 1654 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
1621 1655 #define MWAIT_EXTENSIONS (0x00000002) /* extension supported */
1622 1656 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
1623 1657 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1624 1658 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
1625 1659 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
1626 1660 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1627 1661 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1628 1662 /*
1629 1663 * Number of sub-cstates for a given c-state.
1630 1664 */
1631 1665 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
1632 1666 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1633 1667
1634 1668 /*
1635 1669 * XSAVE leaf 0xD enumeration
1636 1670 */
1637 1671 #define CPUID_LEAFD_2_YMM_OFFSET 576
1638 1672 #define CPUID_LEAFD_2_YMM_SIZE 256
1639 1673
1640 1674 /*
1641 1675 * Common extended leaf names to cut down on typos.
1642 1676 */
1643 1677 #define CPUID_LEAF_EXT_0 0x80000000
1644 1678 #define CPUID_LEAF_EXT_8 0x80000008
1645 1679 #define CPUID_LEAF_EXT_1d 0x8000001d
1646 1680 #define CPUID_LEAF_EXT_1e 0x8000001e
1647 1681
1648 1682 /*
1649 1683 * Functions we consume from cpuid_subr.c; don't publish these in a header
1650 1684 * file to try and keep people using the expected cpuid_* interfaces.
1651 1685 */
1652 1686 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1653 1687 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1654 1688 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1655 1689 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1656 1690 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1657 1691
1658 1692 /*
1659 1693 * Apply various platform-dependent restrictions where the
1660 1694 * underlying platform restrictions mean the CPU can be marked
1661 1695 * as less capable than its cpuid instruction would imply.
1662 1696 */
1663 1697 #if defined(__xpv)
1664 1698 static void
1665 1699 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1666 1700 {
1667 1701 switch (eax) {
1668 1702 case 1: {
1669 1703 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1670 1704 0 : CPUID_INTC_EDX_MCA;
1671 1705 cp->cp_edx &=
1672 1706 ~(mcamask |
1673 1707 CPUID_INTC_EDX_PSE |
1674 1708 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1675 1709 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1676 1710 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1677 1711 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1678 1712 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1679 1713 break;
1680 1714 }
1681 1715
1682 1716 case 0x80000001:
1683 1717 cp->cp_edx &=
1684 1718 ~(CPUID_AMD_EDX_PSE |
1685 1719 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1686 1720 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1687 1721 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1688 1722 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1689 1723 CPUID_AMD_EDX_TSCP);
1690 1724 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1691 1725 break;
1692 1726 default:
1693 1727 break;
1694 1728 }
1695 1729
1696 1730 switch (vendor) {
1697 1731 case X86_VENDOR_Intel:
1698 1732 switch (eax) {
1699 1733 case 4:
1700 1734 /*
1701 1735 * Zero out the (ncores-per-chip - 1) field
1702 1736 */
1703 1737 cp->cp_eax &= 0x03fffffff;
1704 1738 break;
1705 1739 default:
1706 1740 break;
1707 1741 }
1708 1742 break;
1709 1743 case X86_VENDOR_AMD:
1710 1744 switch (eax) {
1711 1745
1712 1746 case 0x80000001:
1713 1747 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1714 1748 break;
1715 1749
1716 1750 case CPUID_LEAF_EXT_8:
1717 1751 /*
1718 1752 * Zero out the (ncores-per-chip - 1) field
1719 1753 */
1720 1754 cp->cp_ecx &= 0xffffff00;
1721 1755 break;
1722 1756 default:
1723 1757 break;
1724 1758 }
1725 1759 break;
1726 1760 default:
1727 1761 break;
1728 1762 }
1729 1763 }
1730 1764 #else
1731 1765 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
1732 1766 #endif
1733 1767
1734 1768 /*
1735 1769 * Some undocumented ways of patching the results of the cpuid
1736 1770 * instruction to permit running Solaris 10 on future cpus that
1737 1771 * we don't currently support. Could be set to non-zero values
1738 1772 * via settings in eeprom.
1739 1773 */
1740 1774
1741 1775 uint32_t cpuid_feature_ecx_include;
1742 1776 uint32_t cpuid_feature_ecx_exclude;
1743 1777 uint32_t cpuid_feature_edx_include;
1744 1778 uint32_t cpuid_feature_edx_exclude;
1745 1779
1746 1780 /*
1747 1781 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1748 1782 */
1749 1783 void
1750 1784 cpuid_alloc_space(cpu_t *cpu)
1751 1785 {
1752 1786 /*
1753 1787 * By convention, cpu0 is the boot cpu, which is set up
1754 1788 * before memory allocation is available. All other cpus get
1755 1789 * their cpuid_info struct allocated here.
1756 1790 */
1757 1791 ASSERT(cpu->cpu_id != 0);
1758 1792 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1759 1793 cpu->cpu_m.mcpu_cpi =
1760 1794 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1761 1795 }
1762 1796
1763 1797 void
1764 1798 cpuid_free_space(cpu_t *cpu)
1765 1799 {
1766 1800 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1767 1801 int i;
1768 1802
1769 1803 ASSERT(cpi != NULL);
1770 1804 ASSERT(cpi != &cpuid_info0);
1771 1805
1772 1806 /*
1773 1807 * Free up any cache leaf related dynamic storage. The first entry was
1774 1808 * cached from the standard cpuid storage, so we should not free it.
1775 1809 */
1776 1810 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1777 1811 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1778 1812 if (cpi->cpi_cache_leaf_size > 0)
1779 1813 kmem_free(cpi->cpi_cache_leaves,
1780 1814 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1781 1815
1782 1816 kmem_free(cpi, sizeof (*cpi));
1783 1817 cpu->cpu_m.mcpu_cpi = NULL;
1784 1818 }
1785 1819
1786 1820 #if !defined(__xpv)
1787 1821 /*
1788 1822 * Determine the type of the underlying platform. This is used to customize
1789 1823 * initialization of various subsystems (e.g. TSC). determine_platform() must
1790 1824 * only ever be called once to prevent two processors from seeing different
1791 1825 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1792 1826 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1793 1827 */
1794 1828 void
1795 1829 determine_platform(void)
1796 1830 {
1797 1831 struct cpuid_regs cp;
1798 1832 uint32_t base;
1799 1833 uint32_t regs[4];
1800 1834 char *hvstr = (char *)regs;
1801 1835
1802 1836 ASSERT(platform_type == -1);
1803 1837
1804 1838 platform_type = HW_NATIVE;
1805 1839
1806 1840 if (!enable_platform_detection)
1807 1841 return;
1808 1842
1809 1843 /*
1810 1844 * If Hypervisor CPUID bit is set, try to determine hypervisor
1811 1845 * vendor signature, and set platform type accordingly.
1812 1846 *
1813 1847 * References:
1814 1848 * http://lkml.org/lkml/2008/10/1/246
1815 1849 * http://kb.vmware.com/kb/1009458
1816 1850 */
1817 1851 cp.cp_eax = 0x1;
1818 1852 (void) __cpuid_insn(&cp);
1819 1853 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1820 1854 cp.cp_eax = 0x40000000;
1821 1855 (void) __cpuid_insn(&cp);
1822 1856 regs[0] = cp.cp_ebx;
1823 1857 regs[1] = cp.cp_ecx;
1824 1858 regs[2] = cp.cp_edx;
1825 1859 regs[3] = 0;
1826 1860 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1827 1861 platform_type = HW_XEN_HVM;
1828 1862 return;
1829 1863 }
1830 1864 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1831 1865 platform_type = HW_VMWARE;
1832 1866 return;
1833 1867 }
1834 1868 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1835 1869 platform_type = HW_KVM;
1836 1870 return;
1837 1871 }
1838 1872 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1839 1873 platform_type = HW_BHYVE;
1840 1874 return;
1841 1875 }
1842 1876 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1843 1877 platform_type = HW_MICROSOFT;
1844 1878 } else {
1845 1879 /*
1846 1880 * Check older VMware hardware versions. VMware hypervisor is
1847 1881 * detected by performing an IN operation to VMware hypervisor
1848 1882 * port and checking that value returned in %ebx is VMware
1849 1883 * hypervisor magic value.
1850 1884 *
1851 1885 * References: http://kb.vmware.com/kb/1009458
1852 1886 */
1853 1887 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1854 1888 if (regs[1] == VMWARE_HVMAGIC) {
1855 1889 platform_type = HW_VMWARE;
1856 1890 return;
1857 1891 }
1858 1892 }
1859 1893
1860 1894 /*
1861 1895 * Check Xen hypervisor. In a fully virtualized domain,
1862 1896 * Xen's pseudo-cpuid function returns a string representing the
1863 1897 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1864 1898 * supported cpuid function. We need at least a (base + 2) leaf value
1865 1899 * to do what we want to do. Try different base values, since the
1866 1900 * hypervisor might use a different one depending on whether Hyper-V
1867 1901 * emulation is switched on by default or not.
1868 1902 */
1869 1903 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1870 1904 cp.cp_eax = base;
1871 1905 (void) __cpuid_insn(&cp);
1872 1906 regs[0] = cp.cp_ebx;
1873 1907 regs[1] = cp.cp_ecx;
1874 1908 regs[2] = cp.cp_edx;
1875 1909 regs[3] = 0;
1876 1910 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1877 1911 cp.cp_eax >= (base + 2)) {
1878 1912 platform_type &= ~HW_NATIVE;
1879 1913 platform_type |= HW_XEN_HVM;
1880 1914 return;
1881 1915 }
1882 1916 }
1883 1917 }
1884 1918
1885 1919 int
1886 1920 get_hwenv(void)
1887 1921 {
1888 1922 ASSERT(platform_type != -1);
1889 1923 return (platform_type);
1890 1924 }
1891 1925
1892 1926 int
1893 1927 is_controldom(void)
1894 1928 {
1895 1929 return (0);
1896 1930 }
1897 1931
1898 1932 #else
1899 1933
1900 1934 int
1901 1935 get_hwenv(void)
1902 1936 {
1903 1937 return (HW_XEN_PV);
1904 1938 }
1905 1939
1906 1940 int
1907 1941 is_controldom(void)
1908 1942 {
1909 1943 return (DOMAIN_IS_INITDOMAIN(xen_info));
1910 1944 }
1911 1945
1912 1946 #endif /* __xpv */
1913 1947
1914 1948 /*
1915 1949 * Make sure that we have gathered all of the CPUID leaves that we might need to
1916 1950 * determine topology. We assume that the standard leaf 1 has already been done
1917 1951 * and that xmaxeax has already been calculated.
1918 1952 */
1919 1953 static void
1920 1954 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1921 1955 {
1922 1956 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1923 1957
1924 1958 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1925 1959 struct cpuid_regs *cp;
1926 1960
1927 1961 cp = &cpi->cpi_extd[8];
1928 1962 cp->cp_eax = CPUID_LEAF_EXT_8;
1929 1963 (void) __cpuid_insn(cp);
1930 1964 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1931 1965 }
1932 1966
1933 1967 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1934 1968 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1935 1969 struct cpuid_regs *cp;
1936 1970
1937 1971 cp = &cpi->cpi_extd[0x1e];
1938 1972 cp->cp_eax = CPUID_LEAF_EXT_1e;
1939 1973 (void) __cpuid_insn(cp);
1940 1974 }
1941 1975 }
1942 1976
1943 1977 /*
1944 1978 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1945 1979 * it to everything else. If not, and we're on an AMD system where 8000001e is
1946 1980 * valid, then we use that. Otherwise, we fall back to the default value for the
1947 1981 * APIC ID in leaf 1.
1948 1982 */
1949 1983 static uint32_t
1950 1984 cpuid_gather_apicid(struct cpuid_info *cpi)
1951 1985 {
1952 1986 /*
1953 1987 * Leaf B changes based on the arguments to it. Because we don't cache
1954 1988 * it, we need to gather it again.
1955 1989 */
1956 1990 if (cpi->cpi_maxeax >= 0xB) {
1957 1991 struct cpuid_regs regs;
1958 1992 struct cpuid_regs *cp;
1959 1993
1960 1994 cp = &regs;
1961 1995 cp->cp_eax = 0xB;
1962 1996 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1963 1997 (void) __cpuid_insn(cp);
1964 1998
1965 1999 if (cp->cp_ebx != 0) {
1966 2000 return (cp->cp_edx);
1967 2001 }
1968 2002 }
1969 2003
1970 2004 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
1971 2005 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1972 2006 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1973 2007 return (cpi->cpi_extd[0x1e].cp_eax);
1974 2008 }
1975 2009
1976 2010 return (CPI_APIC_ID(cpi));
1977 2011 }
1978 2012
1979 2013 /*
1980 2014 * For AMD processors, attempt to calculate the number of chips and cores that
1981 2015 * exist. The way that we do this varies based on the generation, because the
1982 2016 * generations themselves have changed dramatically.
1983 2017 *
1984 2018 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
1985 2019 * However, with the advent of family 17h (Zen) it actually tells us the number
1986 2020 * of threads, so we need to look at leaf 0x8000001e if available to determine
1987 2021 * its value. Otherwise, for all prior families, the number of enabled cores is
1988 2022 * the same as threads.
1989 2023 *
1990 2024 * If we do not have leaf 0x80000008, then we assume that this processor does
1991 2025 * not have anything. AMD's older CPUID specification says there's no reason to
1992 2026 * fall back to leaf 1.
1993 2027 *
1994 2028 * In some virtualization cases we will not have leaf 8000001e or it will be
1995 2029 * zero. When that happens we assume the number of threads is one.
1996 2030 */
1997 2031 static void
1998 2032 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
1999 2033 {
2000 2034 uint_t nthreads, nthread_per_core;
2001 2035
2002 2036 nthreads = nthread_per_core = 1;
2003 2037
2004 2038 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2005 2039 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2006 2040 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2007 2041 nthreads = CPI_CPU_COUNT(cpi);
2008 2042 }
2009 2043
2010 2044 /*
2011 2045 * For us to have threads, and know about it, we have to be at least at
2012 2046 * family 17h and have the cpuid bit that says we have extended
2013 2047 * topology.
2014 2048 */
2015 2049 if (cpi->cpi_family >= 0x17 &&
2016 2050 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2017 2051 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2018 2052 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2019 2053 }
2020 2054
2021 2055 *ncpus = nthreads;
2022 2056 *ncores = nthreads / nthread_per_core;
2023 2057 }
2024 2058
2025 2059 /*
2026 2060 * Seed the initial values for the cores and threads for an Intel based
2027 2061 * processor. These values will be overwritten if we detect that the processor
2028 2062 * supports CPUID leaf 0xb.
2029 2063 */
2030 2064 static void
2031 2065 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2032 2066 {
2033 2067 /*
2034 2068 * Only seed the number of physical cores from the first level leaf 4
2035 2069 * information. The number of threads there indicates how many share the
2036 2070 * L1 cache, which may or may not have anything to do with the number of
2037 2071 * logical CPUs per core.
2038 2072 */
2039 2073 if (cpi->cpi_maxeax >= 4) {
2040 2074 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2041 2075 } else {
2042 2076 *ncores = 1;
2043 2077 }
2044 2078
2045 2079 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2046 2080 *ncpus = CPI_CPU_COUNT(cpi);
2047 2081 } else {
2048 2082 *ncpus = *ncores;
2049 2083 }
2050 2084 }
2051 2085
2052 2086 static boolean_t
2053 2087 cpuid_leafB_getids(cpu_t *cpu)
2054 2088 {
2055 2089 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2056 2090 struct cpuid_regs regs;
2057 2091 struct cpuid_regs *cp;
2058 2092
2059 2093 if (cpi->cpi_maxeax < 0xB)
2060 2094 return (B_FALSE);
2061 2095
2062 2096 cp = &regs;
2063 2097 cp->cp_eax = 0xB;
2064 2098 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2065 2099
2066 2100 (void) __cpuid_insn(cp);
2067 2101
2068 2102 /*
2069 2103 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2070 2104 * indicates that the extended topology enumeration leaf is
2071 2105 * available.
2072 2106 */
2073 2107 if (cp->cp_ebx != 0) {
2074 2108 uint32_t x2apic_id = 0;
2075 2109 uint_t coreid_shift = 0;
2076 2110 uint_t ncpu_per_core = 1;
2077 2111 uint_t chipid_shift = 0;
2078 2112 uint_t ncpu_per_chip = 1;
2079 2113 uint_t i;
2080 2114 uint_t level;
2081 2115
2082 2116 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2083 2117 cp->cp_eax = 0xB;
2084 2118 cp->cp_ecx = i;
2085 2119
2086 2120 (void) __cpuid_insn(cp);
2087 2121 level = CPI_CPU_LEVEL_TYPE(cp);
2088 2122
2089 2123 if (level == 1) {
2090 2124 x2apic_id = cp->cp_edx;
2091 2125 coreid_shift = BITX(cp->cp_eax, 4, 0);
2092 2126 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2093 2127 } else if (level == 2) {
2094 2128 x2apic_id = cp->cp_edx;
2095 2129 chipid_shift = BITX(cp->cp_eax, 4, 0);
2096 2130 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2097 2131 }
2098 2132 }
2099 2133
2100 2134 /*
2101 2135 * cpi_apicid is taken care of in cpuid_gather_apicid.
2102 2136 */
2103 2137 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2104 2138 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2105 2139 ncpu_per_core;
2106 2140 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2107 2141 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2108 2142 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2109 2143 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2110 2144 cpi->cpi_procnodeid = cpi->cpi_chipid;
2111 2145 cpi->cpi_compunitid = cpi->cpi_coreid;
2112 2146
2113 2147 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2114 2148 cpi->cpi_nthread_bits = coreid_shift;
2115 2149 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2116 2150 }
2117 2151
2118 2152 return (B_TRUE);
2119 2153 } else {
2120 2154 return (B_FALSE);
2121 2155 }
2122 2156 }
2123 2157
2124 2158 static void
2125 2159 cpuid_intel_getids(cpu_t *cpu, void *feature)
2126 2160 {
2127 2161 uint_t i;
2128 2162 uint_t chipid_shift = 0;
2129 2163 uint_t coreid_shift = 0;
2130 2164 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2131 2165
2132 2166 /*
2133 2167 * There are no compute units or processor nodes currently on Intel.
2134 2168 * Always set these to one.
2135 2169 */
2136 2170 cpi->cpi_procnodes_per_pkg = 1;
2137 2171 cpi->cpi_cores_per_compunit = 1;
2138 2172
2139 2173 /*
2140 2174 * If cpuid Leaf B is present, use that to try and get this information.
2141 2175 * It will be the most accurate for Intel CPUs.
2142 2176 */
2143 2177 if (cpuid_leafB_getids(cpu))
2144 2178 return;
2145 2179
2146 2180 /*
2147 2181 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2148 2182 * and ncore_per_chip. These represent the largest power of two values
2149 2183 * that we need to cover all of the IDs in the system. Therefore, we use
2150 2184 * those values to seed the number of bits needed to cover information
2151 2185 * in the case when leaf B is not available. These values will probably
2152 2186 * be larger than required, but that's OK.
2153 2187 */
2154 2188 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2155 2189 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2156 2190
2157 2191 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2158 2192 chipid_shift++;
2159 2193
2160 2194 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2161 2195 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2162 2196
2163 2197 if (is_x86_feature(feature, X86FSET_CMP)) {
2164 2198 /*
2165 2199 * Multi-core (and possibly multi-threaded)
2166 2200 * processors.
2167 2201 */
2168 2202 uint_t ncpu_per_core;
2169 2203 if (cpi->cpi_ncore_per_chip == 1)
2170 2204 ncpu_per_core = cpi->cpi_ncpu_per_chip;
2171 2205 else if (cpi->cpi_ncore_per_chip > 1)
2172 2206 ncpu_per_core = cpi->cpi_ncpu_per_chip /
2173 2207 cpi->cpi_ncore_per_chip;
2174 2208 /*
2175 2209 * 8bit APIC IDs on dual core Pentiums
2176 2210 * look like this:
2177 2211 *
2178 2212 * +-----------------------+------+------+
2179 2213 * | Physical Package ID | MC | HT |
2180 2214 * +-----------------------+------+------+
2181 2215 * <------- chipid -------->
2182 2216 * <------- coreid --------------->
2183 2217 * <--- clogid -->
2184 2218 * <------>
2185 2219 * pkgcoreid
2186 2220 *
2187 2221 * Where the number of bits necessary to
2188 2222 * represent MC and HT fields together equals
2189 2223 * to the minimum number of bits necessary to
2190 2224 * store the value of cpi->cpi_ncpu_per_chip.
2191 2225 * Of those bits, the MC part uses the number
2192 2226 * of bits necessary to store the value of
2193 2227 * cpi->cpi_ncore_per_chip.
2194 2228 */
2195 2229 for (i = 1; i < ncpu_per_core; i <<= 1)
2196 2230 coreid_shift++;
2197 2231 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2198 2232 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2199 2233 } else if (is_x86_feature(feature, X86FSET_HTT)) {
2200 2234 /*
2201 2235 * Single-core multi-threaded processors.
2202 2236 */
2203 2237 cpi->cpi_coreid = cpi->cpi_chipid;
2204 2238 cpi->cpi_pkgcoreid = 0;
2205 2239 } else {
2206 2240 /*
2207 2241 * Single-core single-thread processors.
2208 2242 */
2209 2243 cpi->cpi_coreid = cpu->cpu_id;
2210 2244 cpi->cpi_pkgcoreid = 0;
2211 2245 }
2212 2246 cpi->cpi_procnodeid = cpi->cpi_chipid;
2213 2247 cpi->cpi_compunitid = cpi->cpi_coreid;
2214 2248 }
2215 2249
2216 2250 /*
2217 2251 * Historically, AMD has had CMP chips with only a single thread per core.
2218 2252 * However, starting in family 17h (Zen), this has changed and they now have
2219 2253 * multiple threads. Our internal core id needs to be a unique value.
2220 2254 *
2221 2255 * To determine the core id of an AMD system, if we're from a family before 17h,
2222 2256 * then we just use the cpu id, as that gives us a good value that will be
2223 2257 * unique for each core. If instead, we're on family 17h or later, then we need
2224 2258 * to do something more complicated. CPUID leaf 0x8000001e can tell us
2225 2259 * how many threads are in the system. Based on that, we'll shift the APIC ID.
2226 2260 * We can't use the normal core id in that leaf as it's only unique within the
2227 2261 * socket, which is perfect for cpi_pkgcoreid, but not for us.
2228 2262 */
2229 2263 static id_t
2230 2264 cpuid_amd_get_coreid(cpu_t *cpu)
2231 2265 {
2232 2266 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2233 2267
2234 2268 if (cpi->cpi_family >= 0x17 &&
2235 2269 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2236 2270 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2237 2271 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2238 2272 if (nthreads > 1) {
2239 2273 VERIFY3U(nthreads, ==, 2);
2240 2274 return (cpi->cpi_apicid >> 1);
2241 2275 }
2242 2276 }
2243 2277
2244 2278 return (cpu->cpu_id);
2245 2279 }
2246 2280
2247 2281 /*
2248 2282 * Determining IDs on AMD is a more challenging task. This is notable because of the
2249 2283 * following two facts:
2250 2284 *
2251 2285 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
2252 2286 * also no way to get an actual unique core id from the system. As such, we
2253 2287 * synthesize this case by using cpu->cpu_id. This scheme does not,
2254 2288 * however, guarantee that sibling cores of a chip will have sequential
2255 2289 * coreids starting at a multiple of the number of cores per chip - that is
2256 2290 * usually the case, but if the ACPI MADT table is presented in a different
2257 2291 * order then we need to perform a few more gymnastics for the pkgcoreid.
2258 2292 *
2259 2293 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2260 2294 * called compute units. These compute units share the L1I cache, L2 cache,
2261 2295 * and the FPU. To deal with this, a new topology leaf was added in
2262 2296 * 0x8000001e. However, parts of this leaf have different meanings
2263 2297 * once we get to family 0x17.
2264 2298 */
2265 2299
2266 2300 static void
2267 2301 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2268 2302 {
2269 2303 int i, first_half, coreidsz;
2270 2304 uint32_t nb_caps_reg;
2271 2305 uint_t node2_1;
2272 2306 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2273 2307 struct cpuid_regs *cp;
2274 2308
2275 2309 /*
2276 2310 * Calculate the core id (this comes from hardware in family 0x17 if it
2277 2311 * hasn't been stripped by virtualization). We always set the compute
2278 2312 * unit id to the same value. Also, initialize the default number of
2279 2313 * cores per compute unit and nodes per package. This will be
2280 2314 * overwritten when we know information about a particular family.
2281 2315 */
2282 2316 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2283 2317 cpi->cpi_compunitid = cpi->cpi_coreid;
2284 2318 cpi->cpi_cores_per_compunit = 1;
2285 2319 cpi->cpi_procnodes_per_pkg = 1;
2286 2320
2287 2321 /*
2288 2322 * To construct the logical ID, we need to determine how many APIC IDs
2289 2323 * are dedicated to the cores and threads. This is provided for us in
2290 2324 * 0x80000008. However, if it's not present (say due to virtualization),
2291 2325 * then we assume it's one. This should be present on all 64-bit AMD
2292 2326 * processors. It was added in family 0xf (Hammer).
2293 2327 */
2294 2328 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2295 2329 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2296 2330
2297 2331 /*
2298 2332 * In AMD parlance chip is really a node while illumos
2299 2333 * uses chip as equivalent to socket/package.
2300 2334 */
2301 2335 if (coreidsz == 0) {
2302 2336 /* Use legacy method */
2303 2337 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2304 2338 coreidsz++;
2305 2339 if (coreidsz == 0)
2306 2340 coreidsz = 1;
2307 2341 }
2308 2342 } else {
2309 2343 /* Assume single-core part */
2310 2344 coreidsz = 1;
2311 2345 }
2312 2346 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2313 2347
2314 2348 /*
2315 2349 * The package core ID varies depending on the family. While it may be
2316 2350 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2317 2351 * this value is the core id in the given node. For non-virtualized
2318 2352 * family 17h, we need to take the logical core id and shift off the
2319 2353 * threads like we do when getting the core id. Otherwise, we can use
2320 2354 * the clogid as is. When family 17h is virtualized, the clogid should
2321 2355 * be sufficient as if we don't have valid data in the leaf, then we
2322 2356 * won't think we have SMT, in which case the cpi_clogid should be
2323 2357 * sufficient.
2324 2358 */
2325 2359 if (cpi->cpi_family >= 0x17 &&
2326 2360 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2327 2361 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2328 2362 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2329 2363 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2330 2364 if (nthreads > 1) {
2331 2365 VERIFY3U(nthreads, ==, 2);
2332 2366 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2333 2367 } else {
2334 2368 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2335 2369 }
2336 2370 } else {
2337 2371 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2338 2372 }
2339 2373
2340 2374 /*
2341 2375 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2342 2376 * (bulldozer) or newer, then we can derive all of this from leaf
2343 2377 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2344 2378 */
2345 2379 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2346 2380 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2347 2381 cp = &cpi->cpi_extd[0x1e];
2348 2382
2349 2383 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2350 2384 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2351 2385
2352 2386 /*
2353 2387 * For Bulldozer-era CPUs, recalculate the compute unit
2354 2388 * information.
2355 2389 */
2356 2390 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2357 2391 cpi->cpi_cores_per_compunit =
2358 2392 BITX(cp->cp_ebx, 15, 8) + 1;
2359 2393 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2360 2394 (cpi->cpi_ncore_per_chip /
2361 2395 cpi->cpi_cores_per_compunit) *
2362 2396 (cpi->cpi_procnodeid /
2363 2397 cpi->cpi_procnodes_per_pkg);
2364 2398 }
2365 2399 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2366 2400 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2367 2401 } else if (cpi->cpi_family == 0x10) {
2368 2402 /*
2369 2403 * See if we are a multi-node processor.
2370 2404 * All processors in the system have the same number of nodes
2371 2405 */
2372 2406 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2373 2407 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2374 2408 /* Single-node */
2375 2409 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2376 2410 coreidsz);
2377 2411 } else {
2378 2412
2379 2413 /*
2380 2414 * Multi-node revision D (2 nodes per package
2381 2415 * are supported)
2382 2416 */
2383 2417 cpi->cpi_procnodes_per_pkg = 2;
2384 2418
2385 2419 first_half = (cpi->cpi_pkgcoreid <=
2386 2420 (cpi->cpi_ncore_per_chip/2 - 1));
2387 2421
2388 2422 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2389 2423 /* We are BSP */
2390 2424 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2391 2425 } else {
2392 2426
2393 2427 /* We are AP */
2394 2428 /* NodeId[2:1] bits to use for reading F3xe8 */
2395 2429 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2396 2430
2397 2431 nb_caps_reg =
2398 2432 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2399 2433
2400 2434 /*
2401 2435 * Check IntNodeNum bit (31:30, but bit 31 is
2402 2436 * always 0 on dual-node processors)
2403 2437 */
2404 2438 if (BITX(nb_caps_reg, 30, 30) == 0)
2405 2439 cpi->cpi_procnodeid = node2_1 +
2406 2440 !first_half;
2407 2441 else
2408 2442 cpi->cpi_procnodeid = node2_1 +
2409 2443 first_half;
2410 2444 }
2411 2445 }
2412 2446 } else {
2413 2447 cpi->cpi_procnodeid = 0;
2414 2448 }
2415 2449
2416 2450 cpi->cpi_chipid =
2417 2451 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2418 2452
2419 2453 cpi->cpi_ncore_bits = coreidsz;
2420 2454 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2421 2455 cpi->cpi_ncore_per_chip);
2422 2456 }
2423 2457
2424 2458 static void
2425 2459 spec_uarch_flush_noop(void)
2426 2460 {
2427 2461 }
2428 2462
2429 2463 /*
2430 2464 * When microcode is present that mitigates MDS, this wrmsr will also flush the
2431 2465 * MDS-related micro-architectural state that would normally happen by calling
2432 2466 * x86_md_clear().
2433 2467 */
2434 2468 static void
2435 2469 spec_uarch_flush_msr(void)
2436 2470 {
2437 2471 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2438 2472 }
2439 2473
2440 2474 /*
2441 2475 * This function points to a function that will flush certain
2442 2476 * micro-architectural state on the processor. This flush is used to mitigate
2443 2477 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2444 2478 * function can point to one of three functions:
2445 2479 *
2446 2480 * - A noop, used either because we are vulnerable but do not have the
2447 2481 * microcode available to help deal with the issue, or because we are
2448 2482 * not vulnerable at all.
2449 2483 *
2450 2484 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2451 2485 * mitigate MDS is present, also perform the equivalent of the MDS flush;
2452 2486 * however, it only flushes the MDS related micro-architectural state on the
2453 2487 * current hyperthread; it does not do anything for the twin.
2454 2488 *
2455 2489 * - x86_md_clear which will flush the MDS related state. This is done when we
2456 2490 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2457 2491 * (RDCL_NO is set).
2458 2492 */
2459 2493 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
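/*
 * Illustrative usage (not an actual call site): code on privilege
 * transition or idle paths simply invokes the pointer,
 *
 *	spec_uarch_flush();
 *
 * and thereby picks up whichever of the three implementations
 * cpuid_update_l1d_flush() selected for this machine.
 */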
2460 2494
2461 2495 static void
2462 2496 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2463 2497 {
2464 2498 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2465 2499
2466 2500 /*
2467 2501 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2468 2502 * has been fixed in hardware, it doesn't cover everything related to
2469 2503 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2470 2504 * need to mitigate this.
2471 2505 */
2472 2506 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2473 2507 is_x86_feature(featureset, X86FSET_MDS_NO)) {
2474 2508 return;
2475 2509 }
2476 2510
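/*
 * x86_md_clear is expected to begin with a ret so that it is a no-op by
 * default; overwriting that first byte with a nop lets execution fall
 * through into the verw-based flush that follows.
 */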
2477 2511 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2478 2512 const uint8_t nop = NOP_INSTR;
2479 2513 uint8_t *md = (uint8_t *)x86_md_clear;
2480 2514
2481 2515 *md = nop;
2482 2516 }
2483 2517
2484 2518 membar_producer();
2485 2519 }
2486 2520
2487 2521 static void
2488 2522 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2489 2523 {
2490 2524 boolean_t need_l1d, need_mds;
2491 2525 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2492 2526
2493 2527 /*
2494 2528 * If we're not on Intel or we've mitigated both RDCL and MDS in
2495 2529 * hardware, then there's nothing left for us to do for enabling the
2496 2530 * flush. We can also go ahead and say that SMT exclusion is
2497 2531 * unnecessary.
2498 2532 */
2499 2533 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2500 2534 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2501 2535 is_x86_feature(featureset, X86FSET_MDS_NO))) {
2502 2536 extern int smt_exclusion;
2503 2537 smt_exclusion = 0;
2504 2538 spec_uarch_flush = spec_uarch_flush_noop;
2505 2539 membar_producer();
2506 2540 return;
2507 2541 }
2508 2542
2509 2543 /*
2510 2544 * The locations where we need to perform an L1D flush are required for
2511 2545 * mitigating both L1TF and MDS. When verw support is present in
2512 2546 * microcode, then the L1D flush will take care of doing that as well.
2513 2547 * However, if we have a system where RDCL_NO is present, but we don't
2514 2548 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2515 2549 * L1D flush.
2516 2550 */
2517 2551 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2518 2552 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2519 2553 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2520 2554 need_l1d = B_TRUE;
2521 2555 } else {
2522 2556 need_l1d = B_FALSE;
2523 2557 }
2524 2558
2525 2559 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2526 2560 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2527 2561 need_mds = B_TRUE;
2528 2562 } else {
2529 2563 need_mds = B_FALSE;
2530 2564 }
2531 2565
2532 2566 if (need_l1d) {
2533 2567 spec_uarch_flush = spec_uarch_flush_msr;
2534 2568 } else if (need_mds) {
2535 2569 spec_uarch_flush = x86_md_clear;
2536 2570 } else {
2537 2571 /*
2538 2572 * We have no hardware mitigations available to us.
2539 2573 */
2540 2574 spec_uarch_flush = spec_uarch_flush_noop;
2541 2575 }
2542 2576 membar_producer();
2543 2577 }
2544 2578
2545 2579 /*
2546 2580 * We default to enabling RSB mitigations.
2547 2581 */
2548 2582 static void
2549 2583 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2550 2584 {
2551 2585 const uint8_t ret = RET_INSTR;
2552 2586 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2553 2587
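/*
 * Conversely to x86_md_clear above, x86_rsb_stuff is expected to begin
 * with its stuffing sequence; writing a ret over its first byte turns
 * it into an immediate return, disabling RSB stuffing when enhanced
 * IBRS is in use or mitigations are disabled entirely.
 */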
2554 2588 switch (mit) {
2555 2589 case X86_SPECTREV2_ENHANCED_IBRS:
2556 2590 case X86_SPECTREV2_DISABLED:
2557 2591 *stuff = ret;
2558 2592 break;
2559 2593 default:
2560 2594 break;
2561 2595 }
2562 2596 }
2563 2597
2564 2598 static void
2565 2599 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2566 2600 {
2567 2601 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2568 2602 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2569 2603 "_r14", "_r15" };
2570 2604 const uint_t nthunks = ARRAY_SIZE(thunks);
2571 2605 const char *type;
2572 2606 uint_t i;
2573 2607
2574 2608 if (mit == x86_spectrev2_mitigation)
2575 2609 return;
2576 2610
2577 2611 switch (mit) {
2578 2612 case X86_SPECTREV2_RETPOLINE:
2579 2613 type = "gen";
2580 2614 break;
2581 2615 case X86_SPECTREV2_RETPOLINE_AMD:
2582 2616 type = "amd";
2583 2617 break;
2584 2618 case X86_SPECTREV2_ENHANCED_IBRS:
2585 2619 case X86_SPECTREV2_DISABLED:
2586 2620 type = "jmp";
2587 2621 break;
2588 2622 default:
2589 2623 panic("asked to updated retpoline state with unknown state!");
2590 2624 }
2591 2625
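/*
 * For example, with type "amd" and the "_rax" thunk, the bytes of
 * __x86_indirect_thunk_amd_rax are copied over __x86_indirect_thunk_rax,
 * so indirect branches through the generic thunk name pick up the
 * lfence-based AMD variant.
 */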
2592 2626 for (i = 0; i < nthunks; i++) {
2593 2627 uintptr_t source, dest;
2594 2628 int ssize, dsize;
2595 2629 char sourcebuf[64], destbuf[64];
2596 2630 size_t len;
2597 2631
2598 2632 (void) snprintf(destbuf, sizeof (destbuf),
2599 2633 "__x86_indirect_thunk%s", thunks[i]);
2600 2634 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2601 2635 "__x86_indirect_thunk_%s%s", type, thunks[i]);
2602 2636
2603 2637 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2604 2638 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2605 2639 VERIFY3U(source, !=, 0);
2606 2640 VERIFY3U(dest, !=, 0);
2607 2641 VERIFY3S(dsize, >=, ssize);
2608 2642 bcopy((void *)source, (void *)dest, ssize);
2609 2643 }
2610 2644 }
2611 2645
2612 2646 static void
2613 2647 cpuid_enable_enhanced_ibrs(void)
2614 2648 {
2615 2649 uint64_t val;
2616 2650
2617 2651 val = rdmsr(MSR_IA32_SPEC_CTRL);
2618 2652 val |= IA32_SPEC_CTRL_IBRS;
2619 2653 wrmsr(MSR_IA32_SPEC_CTRL, val);
2620 2654 }
2621 2655
2622 2656 #ifndef __xpv
2623 2657 /*
2624 2658 * Determine whether or not we can use the AMD optimized retpoline
2625 2659 * functionality. We use this when we know we're on an AMD system and we can
2626 2660 * successfully verify that lfence is dispatch serializing.
2627 2661 */
2628 2662 static boolean_t
2629 2663 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2630 2664 {
2631 2665 uint64_t val;
2632 2666 on_trap_data_t otd;
2633 2667
2634 2668 if (cpi->cpi_vendor != X86_VENDOR_AMD)
2635 2669 return (B_FALSE);
2636 2670
2637 2671 /*
2638 2672 * We need to determine whether or not lfence is serializing. It always
2639 2673 * is on families 0xf and 0x11. On others, it's controlled by
2640 2674 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2641 2675 * crazy old family, don't try and do anything.
2642 2676 */
2643 2677 if (cpi->cpi_family < 0xf)
2644 2678 return (B_FALSE);
2645 2679 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2646 2680 return (B_TRUE);
2647 2681
2648 2682 /*
2649 2683 * While it may be tempting to use get_hwenv(), there are no promises
2650 2684 * that a hypervisor will actually declare itself as such in a
2651 2685 * friendly way. As such, try to read and set the MSR. If we can then
2652 2686 * read back the value we set (it wasn't just set to zero), then we go
2653 2687 * for it.
2654 2688 */
2655 2689 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2656 2690 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2657 2691 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2658 2692 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2659 2693 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2660 2694 } else {
2661 2695 val = 0;
2662 2696 }
2663 2697 no_trap();
2664 2698
2665 2699 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2666 2700 return (B_TRUE);
2667 2701 return (B_FALSE);
2668 2702 }
2669 2703 #endif /* !__xpv */
2670 2704
2671 2705 static void
2672 2706 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2673 2707 {
2674 2708 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2675 2709 x86_spectrev2_mitigation_t v2mit;
2676 2710
2677 2711 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2678 2712 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2679 2713 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2680 2714 add_x86_feature(featureset, X86FSET_IBPB);
2681 2715 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2682 2716 add_x86_feature(featureset, X86FSET_IBRS);
2683 2717 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2684 2718 add_x86_feature(featureset, X86FSET_STIBP);
2685 2719 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2686 2720 add_x86_feature(featureset, X86FSET_STIBP_ALL);
2687 2721 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2688 2722 add_x86_feature(featureset, X86FSET_SSBD);
2689 2723 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2690 2724 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2691 2725 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2692 2726 add_x86_feature(featureset, X86FSET_SSB_NO);
2693 2727 /*
2694 2728 * Don't enable enhanced IBRS unless we're told that we should
2695 2729 * prefer it and it has the same semantics as Intel. This is
2696 2730 * split into two bits rather than a single one.
2697 2731 */
2698 2732 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2699 2733 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2700 2734 add_x86_feature(featureset, X86FSET_IBRS_ALL);
2701 2735 }
2702 2736
2703 2737 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2704 2738 cpi->cpi_maxeax >= 7) {
2705 2739 struct cpuid_regs *ecp;
2706 2740 ecp = &cpi->cpi_std[7];
2707 2741
2708 2742 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2709 2743 add_x86_feature(featureset, X86FSET_MD_CLEAR);
2710 2744 }
2711 2745
2712 2746 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2713 2747 add_x86_feature(featureset, X86FSET_IBRS);
2714 2748 add_x86_feature(featureset, X86FSET_IBPB);
2715 2749 }
2716 2750
2717 2751 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2718 2752 add_x86_feature(featureset, X86FSET_STIBP);
2719 2753 }
2720 2754
2721 2755 /*
2722 2756 * Don't read the arch caps MSR on xpv where we lack the
2723 2757 * on_trap().
2724 2758 */
2725 2759 #ifndef __xpv
2726 2760 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2727 2761 on_trap_data_t otd;
2728 2762
2729 2763 /*
2730 2764 * Be paranoid and assume we'll get a #GP.
2731 2765 */
2732 2766 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2733 2767 uint64_t reg;
2734 2768
2735 2769 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2736 2770 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2737 2771 add_x86_feature(featureset,
2738 2772 X86FSET_RDCL_NO);
2739 2773 }
2740 2774 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2741 2775 add_x86_feature(featureset,
2742 2776 X86FSET_IBRS_ALL);
2743 2777 }
2744 2778 if (reg & IA32_ARCH_CAP_RSBA) {
2745 2779 add_x86_feature(featureset,
2746 2780 X86FSET_RSBA);
2747 2781 }
2748 2782 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2749 2783 add_x86_feature(featureset,
2750 2784 X86FSET_L1D_VM_NO);
2751 2785 }
2752 2786 if (reg & IA32_ARCH_CAP_SSB_NO) {
2753 2787 add_x86_feature(featureset,
2754 2788 X86FSET_SSB_NO);
2755 2789 }
2756 2790 if (reg & IA32_ARCH_CAP_MDS_NO) {
2757 2791 add_x86_feature(featureset,
2758 2792 X86FSET_MDS_NO);
2759 2793 }
2760 2794 }
2761 2795 no_trap();
2762 2796 }
2763 2797 #endif /* !__xpv */
2764 2798
2765 2799 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2766 2800 add_x86_feature(featureset, X86FSET_SSBD);
2767 2801
2768 2802 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2769 2803 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2770 2804 }
2771 2805
2772 2806 if (cpu->cpu_id != 0) {
2773 2807 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2774 2808 cpuid_enable_enhanced_ibrs();
2775 2809 }
2776 2810 return;
2777 2811 }
2778 2812
2779 2813 /*
2780 2814 * Go through and initialize various security mechanisms that we should
2781 2815 * only set up on a single CPU. This includes Spectre V2, L1TF, and MDS.
2782 2816 */
2783 2817
2784 2818 /*
2785 2819 * By default we've come in with retpolines enabled. Check whether we
2786 2820 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2787 2821 * by default, but disabled if we are using enhanced IBRS.
2788 2822 */
2789 2823 if (x86_disable_spectrev2 != 0) {
2790 2824 v2mit = X86_SPECTREV2_DISABLED;
2791 2825 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2792 2826 cpuid_enable_enhanced_ibrs();
2793 2827 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2794 2828 #ifndef __xpv
2795 2829 } else if (cpuid_use_amd_retpoline(cpi)) {
2796 2830 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2797 2831 #endif /* !__xpv */
2798 2832 } else {
2799 2833 v2mit = X86_SPECTREV2_RETPOLINE;
2800 2834 }
2801 2835
2802 2836 cpuid_patch_retpolines(v2mit);
2803 2837 cpuid_patch_rsb(v2mit);
2804 2838 x86_spectrev2_mitigation = v2mit;
2805 2839 membar_producer();
2806 2840
2807 2841 /*
2808 2842 * We need to determine what changes are required for mitigating L1TF
2809 2843 * and MDS. If the CPU suffers from either of them, then SMT exclusion
2810 2844 * is required.
2811 2845 *
2812 2846 * If any of these are present, then we need to flush u-arch state at
2813 2847 * various points. For MDS, we need to do so whenever we change to a
2814 2848 * lesser privilege level or we are halting the CPU. For L1TF we need to
2815 2849 * flush the L1D cache at VM entry. When we have microcode that handles
2816 2850 * MDS, the L1D flush also clears the other u-arch state that the
2817 2851 * md_clear does.
2818 2852 */
2819 2853
2820 2854 /*
2821 2855 * Update whether or not we need to be taking explicit action against
2822 2856 * MDS.
2823 2857 */
2824 2858 cpuid_update_md_clear(cpu, featureset);
2825 2859
2826 2860 /*
2827 2861 * Determine whether SMT exclusion is required and whether or not we
2828 2862 * need to perform an l1d flush.
2829 2863 */
2830 2864 cpuid_update_l1d_flush(cpu, featureset);
2831 2865 }
2832 2866
2833 2867 /*
2834 2868 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
2835 2869 */
2836 2870 void
2837 2871 setup_xfem(void)
2838 2872 {
2839 2873 uint64_t flags = XFEATURE_LEGACY_FP;
2840 2874
2841 2875 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2842 2876
2843 2877 if (is_x86_feature(x86_featureset, X86FSET_SSE))
2844 2878 flags |= XFEATURE_SSE;
2845 2879
2846 2880 if (is_x86_feature(x86_featureset, X86FSET_AVX))
2847 2881 flags |= XFEATURE_AVX;
2848 2882
2849 2883 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2850 2884 flags |= XFEATURE_AVX512;
2851 2885
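/*
 * For example, a CPU with SSE, AVX, and AVX512F ends up with
 * flags == 0xe7: x87 (bit 0), SSE (bit 1), AVX (bit 2), and the three
 * AVX-512 state components (bits 5-7), per the architectural XCR0
 * layout.
 */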
2852 2886 set_xcr(XFEATURE_ENABLED_MASK, flags);
2853 2887
2854 2888 xsave_bv_all = flags;
2855 2889 }
2856 2890
2857 2891 static void
2858 2892 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2859 2893 {
2860 2894 struct cpuid_info *cpi;
2861 2895
2862 2896 cpi = cpu->cpu_m.mcpu_cpi;
2863 2897
2864 2898 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2865 2899 cpuid_gather_amd_topology_leaves(cpu);
2866 2900 }
2867 2901
2868 2902 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2869 2903
2870 2904 /*
2871 2905 * Before we can calculate the IDs that we should assign to this
2872 2906 * processor, we need to understand how many cores and threads it has.
2873 2907 */
2874 2908 switch (cpi->cpi_vendor) {
2875 2909 case X86_VENDOR_Intel:
2876 2910 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2877 2911 &cpi->cpi_ncore_per_chip);
2878 2912 break;
2879 2913 case X86_VENDOR_AMD:
2880 2914 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2881 2915 &cpi->cpi_ncore_per_chip);
2882 2916 break;
2883 2917 default:
2884 2918 /*
2885 2919 * If we have some other x86 compatible chip, it's not clear how
2886 2920 * it would behave. The most common case is virtualization
2887 2921 * today, though there are also 64-bit VIA chips. Assume that
2888 2922 * all we can get is the basic Leaf 1 HTT information.
2889 2923 */
2890 2924 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2891 2925 cpi->cpi_ncore_per_chip = 1;
2892 2926 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2893 2927 }
2894 2928 break;
2895 2929 }
2896 2930
2897 2931 /*
2898 2932 * Based on the calculated number of threads and cores, potentially
2899 2933 * assign the HTT and CMT features.
2900 2934 */
2901 2935 if (cpi->cpi_ncore_per_chip > 1) {
2902 2936 add_x86_feature(featureset, X86FSET_CMP);
2903 2937 }
2904 2938
2905 2939 if (cpi->cpi_ncpu_per_chip > 1 &&
2906 2940 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2907 2941 add_x86_feature(featureset, X86FSET_HTT);
2908 2942 }
2909 2943
2910 2944 /*
2911 2945 * Now that this has been set up, we need to go through and calculate all of
2912 2946 * the rest of the parameters that exist. If we think the CPU doesn't
2913 2947 * have either SMT (HTT) or CMP, then we basically go through and fake
2914 2948 * up information in some way. The most likely case for this is
2915 2949 * virtualization where we have a lot of partial topology information.
2916 2950 */
2917 2951 if (!is_x86_feature(featureset, X86FSET_HTT) &&
2918 2952 !is_x86_feature(featureset, X86FSET_CMP)) {
2919 2953 /*
2920 2954 * This is a single core, single-threaded processor.
2921 2955 */
2922 2956 cpi->cpi_procnodes_per_pkg = 1;
2923 2957 cpi->cpi_cores_per_compunit = 1;
2924 2958 cpi->cpi_compunitid = 0;
2925 2959 cpi->cpi_chipid = -1;
2926 2960 cpi->cpi_clogid = 0;
2927 2961 cpi->cpi_coreid = cpu->cpu_id;
2928 2962 cpi->cpi_pkgcoreid = 0;
2929 2963 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2930 2964 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2931 2965 } else {
2932 2966 cpi->cpi_procnodeid = cpi->cpi_chipid;
2933 2967 }
2934 2968 } else {
2935 2969 switch (cpi->cpi_vendor) {
2936 2970 case X86_VENDOR_Intel:
2937 2971 cpuid_intel_getids(cpu, featureset);
2938 2972 break;
2939 2973 case X86_VENDOR_AMD:
2940 2974 cpuid_amd_getids(cpu, featureset);
2941 2975 break;
2942 2976 default:
2943 2977 /*
2944 2978 * In this case, it's hard to say what we should do.
2945 2979 * We're going to model them to the OS as single core
2946 2980 * threads. We don't have a good identifier for them, so
2947 2981 * we're just going to use the cpu id all on a single
2948 2982 * chip.
2949 2983 *
2950 2984 * This case has historically been different from the
2951 2985 * case above where we don't have HTT or CMP. While they
2952 2986 * could be combined, we've opted to keep it separate to
2953 2987 * minimize the risk of topology changes in weird cases.
2954 2988 */
2955 2989 cpi->cpi_procnodes_per_pkg = 1;
2956 2990 cpi->cpi_cores_per_compunit = 1;
2957 2991 cpi->cpi_chipid = 0;
2958 2992 cpi->cpi_coreid = cpu->cpu_id;
2959 2993 cpi->cpi_clogid = cpu->cpu_id;
2960 2994 cpi->cpi_pkgcoreid = cpu->cpu_id;
2961 2995 cpi->cpi_procnodeid = cpi->cpi_chipid;
2962 2996 cpi->cpi_compunitid = cpi->cpi_coreid;
2963 2997 break;
2964 2998 }
2965 2999 }
2966 3000 }
2967 3001
2968 3002 /*
2969 3003 * Gather relevant CPU features from leaf 6 which covers thermal information. We
2970 3004 * always gather leaf 6 if it's supported; however, we only look for features on
2971 3005 * Intel systems as AMD does not currently define any of the features we look
2972 3006 * for below.
2973 3007 */
2974 3008 static void
2975 3009 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
2976 3010 {
2977 3011 struct cpuid_regs *cp;
2978 3012 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2979 3013
2980 3014 if (cpi->cpi_maxeax < 6) {
2981 3015 return;
2982 3016 }
2983 3017
2984 3018 cp = &cpi->cpi_std[6];
2985 3019 cp->cp_eax = 6;
2986 3020 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
2987 3021 (void) __cpuid_insn(cp);
2988 3022 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
2989 3023
2990 3024 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2991 3025 return;
2992 3026 }
2993 3027
2994 3028 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
2995 3029 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
2996 3030 }
2997 3031
2998 3032 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
2999 3033 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3000 3034 }
3001 3035 }
3002 3036
3003 3037 void
3004 3038 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3005 3039 {
3006 3040 uint32_t mask_ecx, mask_edx;
3007 3041 struct cpuid_info *cpi;
3008 3042 struct cpuid_regs *cp;
3009 3043 int xcpuid;
3010 3044 #if !defined(__xpv)
3011 3045 extern int idle_cpu_prefer_mwait;
3012 3046 #endif
3013 3047
3014 3048 /*
3015 3049 * Space statically allocated for BSP, ensure pointer is set
3016 3050 */
3017 3051 if (cpu->cpu_id == 0) {
3018 3052 if (cpu->cpu_m.mcpu_cpi == NULL)
3019 3053 cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3020 3054 }
3021 3055
3022 3056 add_x86_feature(featureset, X86FSET_CPUID);
3023 3057
3024 3058 cpi = cpu->cpu_m.mcpu_cpi;
3025 3059 ASSERT(cpi != NULL);
3026 3060 cp = &cpi->cpi_std[0];
3027 3061 cp->cp_eax = 0;
3028 3062 cpi->cpi_maxeax = __cpuid_insn(cp);
3029 3063 {
3030 3064 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3031 3065 *iptr++ = cp->cp_ebx;
3032 3066 *iptr++ = cp->cp_edx;
3033 3067 *iptr++ = cp->cp_ecx;
3034 3068 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3035 3069 }
3036 3070
3037 3071 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3038 3072 x86_vendor = cpi->cpi_vendor; /* for compatibility */
3039 3073
3040 3074 /*
3041 3075 * Limit the range in case of weird hardware
3042 3076 */
3043 3077 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3044 3078 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3045 3079 if (cpi->cpi_maxeax < 1)
3046 3080 goto pass1_done;
3047 3081
3048 3082 cp = &cpi->cpi_std[1];
3049 3083 cp->cp_eax = 1;
3050 3084 (void) __cpuid_insn(cp);
3051 3085
3052 3086 /*
3053 3087 * Extract identifying constants for easy access.
3054 3088 */
3055 3089 cpi->cpi_model = CPI_MODEL(cpi);
3056 3090 cpi->cpi_family = CPI_FAMILY(cpi);
3057 3091
3058 3092 if (cpi->cpi_family == 0xf)
3059 3093 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3060 3094
3061 3095 /*
3062 3096 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3063 3097 * Intel, and presumably everyone else, uses model == 0xf, as
3064 3098 * one would expect (max value means possible overflow). Sigh.
3065 3099 */
3066 3100
3067 3101 switch (cpi->cpi_vendor) {
3068 3102 case X86_VENDOR_Intel:
3069 3103 if (IS_EXTENDED_MODEL_INTEL(cpi))
3070 3104 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3071 3105 break;
3072 3106 case X86_VENDOR_AMD:
3073 3107 if (CPI_FAMILY(cpi) == 0xf)
3074 3108 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3075 3109 break;
3076 3110 default:
3077 3111 if (cpi->cpi_model == 0xf)
3078 3112 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3079 3113 break;
3080 3114 }
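/*
 * For example, an Intel part with base family 0x6, base model 0xe, and
 * extended model 0x5 ends up with cpi_model = 0xe + (0x5 << 4) = 0x5e.
 */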
3081 3115
3082 3116 cpi->cpi_step = CPI_STEP(cpi);
3083 3117 cpi->cpi_brandid = CPI_BRANDID(cpi);
3084 3118
3085 3119 /*
3086 3120 * *default* assumptions:
3087 3121 * - believe %edx feature word
3088 3122 * - ignore %ecx feature word
3089 3123 * - 32-bit virtual and physical addressing
3090 3124 */
3091 3125 mask_edx = 0xffffffff;
3092 3126 mask_ecx = 0;
3093 3127
3094 3128 cpi->cpi_pabits = cpi->cpi_vabits = 32;
3095 3129
3096 3130 switch (cpi->cpi_vendor) {
3097 3131 case X86_VENDOR_Intel:
3098 3132 if (cpi->cpi_family == 5)
3099 3133 x86_type = X86_TYPE_P5;
3100 3134 else if (IS_LEGACY_P6(cpi)) {
3101 3135 x86_type = X86_TYPE_P6;
3102 3136 pentiumpro_bug4046376 = 1;
3103 3137 /*
3104 3138 * Clear the SEP bit when it was set erroneously
3105 3139 */
3106 3140 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3107 3141 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3108 3142 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3109 3143 x86_type = X86_TYPE_P4;
3110 3144 /*
3111 3145 * We don't currently depend on any of the %ecx
3112 3146 * features until Prescott, so we'll only check
3113 3147 * this from P4 onwards. We might want to revisit
3114 3148 * that idea later.
3115 3149 */
3116 3150 mask_ecx = 0xffffffff;
3117 3151 } else if (cpi->cpi_family > 0xf)
3118 3152 mask_ecx = 0xffffffff;
3119 3153 /*
3120 3154 * We don't support MONITOR/MWAIT if leaf 5 is not available
3121 3155 * to obtain the monitor linesize.
3122 3156 */
3123 3157 if (cpi->cpi_maxeax < 5)
3124 3158 mask_ecx &= ~CPUID_INTC_ECX_MON;
3125 3159 break;
3126 3160 case X86_VENDOR_IntelClone:
3127 3161 default:
3128 3162 break;
3129 3163 case X86_VENDOR_AMD:
3130 3164 #if defined(OPTERON_ERRATUM_108)
3131 3165 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3132 3166 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3133 3167 cpi->cpi_model = 0xc;
3134 3168 } else
3135 3169 #endif
3136 3170 if (cpi->cpi_family == 5) {
3137 3171 /*
3138 3172 * AMD K5 and K6
3139 3173 *
3140 3174 * These CPUs have an incomplete implementation
3141 3175 * of MCA/MCE which we mask away.
3142 3176 */
3143 3177 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3144 3178
3145 3179 /*
3146 3180 * Model 0 uses the wrong (APIC) bit
3147 3181 * to indicate PGE. Fix it here.
3148 3182 */
3149 3183 if (cpi->cpi_model == 0) {
3150 3184 if (cp->cp_edx & 0x200) {
3151 3185 cp->cp_edx &= ~0x200;
3152 3186 cp->cp_edx |= CPUID_INTC_EDX_PGE;
3153 3187 }
3154 3188 }
3155 3189
3156 3190 /*
3157 3191 * Early models had problems w/ MMX; disable.
3158 3192 */
3159 3193 if (cpi->cpi_model < 6)
3160 3194 mask_edx &= ~CPUID_INTC_EDX_MMX;
3161 3195 }
3162 3196
3163 3197 /*
3164 3198 * For newer families, SSE3 and CX16, at least, are valid;
3165 3199 * enable all
3166 3200 */
3167 3201 if (cpi->cpi_family >= 0xf)
3168 3202 mask_ecx = 0xffffffff;
3169 3203 /*
3170 3204 * We don't support MONITOR/MWAIT if leaf 5 is not available
3171 3205 * to obtain the monitor linesize.
3172 3206 */
3173 3207 if (cpi->cpi_maxeax < 5)
3174 3208 mask_ecx &= ~CPUID_INTC_ECX_MON;
3175 3209
3176 3210 #if !defined(__xpv)
3177 3211 /*
3178 3212 * AMD has not historically used MWAIT in the CPU's idle loop.
3179 3213 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3180 3214 * know for certain that in at least family 17h, per AMD, mwait
3181 3215 * is preferred. Families in-between are less certain.
3182 3216 */
3183 3217 if (cpi->cpi_family < 0x17) {
3184 3218 idle_cpu_prefer_mwait = 0;
3185 3219 }
3186 3220 #endif
3187 3221
3188 3222 break;
3189 3223 case X86_VENDOR_TM:
3190 3224 /*
3191 3225 * workaround the NT workaround in CMS 4.1
3192 3226 */
3193 3227 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3194 3228 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3195 3229 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3196 3230 break;
3197 3231 case X86_VENDOR_Centaur:
3198 3232 /*
3199 3233 * workaround the NT workarounds again
3200 3234 */
3201 3235 if (cpi->cpi_family == 6)
3202 3236 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3203 3237 break;
3204 3238 case X86_VENDOR_Cyrix:
3205 3239 /*
3206 3240 * We rely heavily on the probing in locore
3207 3241 * to actually figure out what parts, if any,
3208 3242 * of the Cyrix cpuid instruction to believe.
3209 3243 */
3210 3244 switch (x86_type) {
3211 3245 case X86_TYPE_CYRIX_486:
3212 3246 mask_edx = 0;
3213 3247 break;
3214 3248 case X86_TYPE_CYRIX_6x86:
3215 3249 mask_edx = 0;
3216 3250 break;
3217 3251 case X86_TYPE_CYRIX_6x86L:
3218 3252 mask_edx =
3219 3253 CPUID_INTC_EDX_DE |
3220 3254 CPUID_INTC_EDX_CX8;
3221 3255 break;
3222 3256 case X86_TYPE_CYRIX_6x86MX:
3223 3257 mask_edx =
3224 3258 CPUID_INTC_EDX_DE |
3225 3259 CPUID_INTC_EDX_MSR |
3226 3260 CPUID_INTC_EDX_CX8 |
3227 3261 CPUID_INTC_EDX_PGE |
3228 3262 CPUID_INTC_EDX_CMOV |
3229 3263 CPUID_INTC_EDX_MMX;
3230 3264 break;
3231 3265 case X86_TYPE_CYRIX_GXm:
3232 3266 mask_edx =
3233 3267 CPUID_INTC_EDX_MSR |
3234 3268 CPUID_INTC_EDX_CX8 |
3235 3269 CPUID_INTC_EDX_CMOV |
3236 3270 CPUID_INTC_EDX_MMX;
3237 3271 break;
3238 3272 case X86_TYPE_CYRIX_MediaGX:
3239 3273 break;
3240 3274 case X86_TYPE_CYRIX_MII:
3241 3275 case X86_TYPE_VIA_CYRIX_III:
3242 3276 mask_edx =
3243 3277 CPUID_INTC_EDX_DE |
3244 3278 CPUID_INTC_EDX_TSC |
3245 3279 CPUID_INTC_EDX_MSR |
3246 3280 CPUID_INTC_EDX_CX8 |
3247 3281 CPUID_INTC_EDX_PGE |
3248 3282 CPUID_INTC_EDX_CMOV |
3249 3283 CPUID_INTC_EDX_MMX;
3250 3284 break;
3251 3285 default:
3252 3286 break;
3253 3287 }
3254 3288 break;
3255 3289 }
3256 3290
3257 3291 #if defined(__xpv)
3258 3292 /*
3259 3293 * Do not support MONITOR/MWAIT under a hypervisor
3260 3294 */
3261 3295 mask_ecx &= ~CPUID_INTC_ECX_MON;
3262 3296 /*
3263 3297 * Do not support XSAVE under a hypervisor for now
3264 3298 */
3265 3299 xsave_force_disable = B_TRUE;
3266 3300
3267 3301 #endif /* __xpv */
3268 3302
3269 3303 if (xsave_force_disable) {
3270 3304 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3271 3305 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3272 3306 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3273 3307 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3274 3308 }
3275 3309
3276 3310 /*
3277 3311 * Now we've figured out the masks that determine
3278 3312 * which bits we choose to believe, apply the masks
3279 3313 * to the feature words, then map the kernel's view
3280 3314 * of these feature words into its feature word.
3281 3315 */
3282 3316 cp->cp_edx &= mask_edx;
3283 3317 cp->cp_ecx &= mask_ecx;
3284 3318
3285 3319 /*
3286 3320 * apply any platform restrictions (we don't call this
3287 3321 * immediately after __cpuid_insn here, because we need the
3288 3322 * workarounds applied above first)
3289 3323 */
3290 3324 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3291 3325
3292 3326 /*
3293 3327 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3294 3328 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3295 3329 */
3296 3330 if (cpi->cpi_maxeax >= 7) {
3297 3331 struct cpuid_regs *ecp;
3298 3332 ecp = &cpi->cpi_std[7];
3299 3333 ecp->cp_eax = 7;
3300 3334 ecp->cp_ecx = 0;
3301 3335 (void) __cpuid_insn(ecp);
3302 3336
3303 3337 /*
3304 3338 * If XSAVE has been disabled, just ignore all of the
3305 3339 * extended-save-area dependent flags here.
3306 3340 */
3307 3341 if (xsave_force_disable) {
3308 3342 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3309 3343 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3310 3344 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3311 3345 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3312 3346 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3313 3347 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3314 3348 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3315 3349 }
3316 3350
3317 3351 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3318 3352 add_x86_feature(featureset, X86FSET_SMEP);
3319 3353
3320 3354 /*
3321 3355 * We check disable_smap here in addition to in startup_smap()
3322 3356 * to ensure CPUs that aren't the boot CPU don't accidentally
3323 3357 * include it in the feature set and thus generate a mismatched
3324 3358 * x86 feature set across CPUs.
3325 3359 */
3326 3360 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3327 3361 disable_smap == 0)
3328 3362 add_x86_feature(featureset, X86FSET_SMAP);
3329 3363
3330 3364 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3331 3365 add_x86_feature(featureset, X86FSET_RDSEED);
3332 3366
3333 3367 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3334 3368 add_x86_feature(featureset, X86FSET_ADX);
3335 3369
3336 3370 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3337 3371 add_x86_feature(featureset, X86FSET_FSGSBASE);
3338 3372
3339 3373 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3340 3374 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3341 3375
3342 3376 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3343 3377 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3344 3378 add_x86_feature(featureset, X86FSET_INVPCID);
3345 3379
3346 3380 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3347 3381 add_x86_feature(featureset, X86FSET_MPX);
3348 3382
3349 3383 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3350 3384 add_x86_feature(featureset, X86FSET_CLWB);
3351 3385 }
3352 3386 }
3353 3387
3354 3388 /*
3355 3389 * fold in overrides from the "eeprom" mechanism
3356 3390 */
3357 3391 cp->cp_edx |= cpuid_feature_edx_include;
3358 3392 cp->cp_edx &= ~cpuid_feature_edx_exclude;
3359 3393
3360 3394 cp->cp_ecx |= cpuid_feature_ecx_include;
3361 3395 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3362 3396
3363 3397 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3364 3398 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3365 3399 }
3366 3400 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3367 3401 add_x86_feature(featureset, X86FSET_TSC);
3368 3402 }
3369 3403 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3370 3404 add_x86_feature(featureset, X86FSET_MSR);
3371 3405 }
3372 3406 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3373 3407 add_x86_feature(featureset, X86FSET_MTRR);
3374 3408 }
3375 3409 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3376 3410 add_x86_feature(featureset, X86FSET_PGE);
3377 3411 }
3378 3412 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3379 3413 add_x86_feature(featureset, X86FSET_CMOV);
3380 3414 }
3381 3415 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3382 3416 add_x86_feature(featureset, X86FSET_MMX);
3383 3417 }
3384 3418 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3385 3419 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3386 3420 add_x86_feature(featureset, X86FSET_MCA);
3387 3421 }
3388 3422 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3389 3423 add_x86_feature(featureset, X86FSET_PAE);
3390 3424 }
3391 3425 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3392 3426 add_x86_feature(featureset, X86FSET_CX8);
3393 3427 }
3394 3428 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3395 3429 add_x86_feature(featureset, X86FSET_CX16);
3396 3430 }
3397 3431 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3398 3432 add_x86_feature(featureset, X86FSET_PAT);
3399 3433 }
3400 3434 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3401 3435 add_x86_feature(featureset, X86FSET_SEP);
3402 3436 }
3403 3437 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3404 3438 /*
3405 3439 * In our implementation, fxsave/fxrstor
3406 3440 * are prerequisites before we'll even
3407 3441 * try and do SSE things.
3408 3442 */
3409 3443 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3410 3444 add_x86_feature(featureset, X86FSET_SSE);
3411 3445 }
3412 3446 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3413 3447 add_x86_feature(featureset, X86FSET_SSE2);
3414 3448 }
3415 3449 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3416 3450 add_x86_feature(featureset, X86FSET_SSE3);
3417 3451 }
3418 3452 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3419 3453 add_x86_feature(featureset, X86FSET_SSSE3);
3420 3454 }
3421 3455 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3422 3456 add_x86_feature(featureset, X86FSET_SSE4_1);
3423 3457 }
3424 3458 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3425 3459 add_x86_feature(featureset, X86FSET_SSE4_2);
3426 3460 }
3427 3461 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3428 3462 add_x86_feature(featureset, X86FSET_AES);
3429 3463 }
3430 3464 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3431 3465 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3432 3466 }
3433 3467
3434 3468 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3435 3469 add_x86_feature(featureset, X86FSET_SHA);
3436 3470
3437 3471 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3438 3472 add_x86_feature(featureset, X86FSET_UMIP);
3439 3473 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3440 3474 add_x86_feature(featureset, X86FSET_PKU);
3441 3475 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3442 3476 add_x86_feature(featureset, X86FSET_OSPKE);
3443 3477
3444 3478 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3445 3479 add_x86_feature(featureset, X86FSET_XSAVE);
3446 3480
3447 3481 /* We only test AVX & AVX512 when there is XSAVE */
3448 3482
3449 3483 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3450 3484 add_x86_feature(featureset,
3451 3485 X86FSET_AVX);
3452 3486
3453 3487 /*
3454 3488 * Intel says we can't check these without also
3455 3489 * checking AVX.
3456 3490 */
3457 3491 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3458 3492 add_x86_feature(featureset,
3459 3493 X86FSET_F16C);
3460 3494
3461 3495 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3462 3496 add_x86_feature(featureset,
3463 3497 X86FSET_FMA);
3464 3498
3465 3499 if (cpi->cpi_std[7].cp_ebx &
3466 3500 CPUID_INTC_EBX_7_0_BMI1)
3467 3501 add_x86_feature(featureset,
3468 3502 X86FSET_BMI1);
3469 3503
3470 3504 if (cpi->cpi_std[7].cp_ebx &
3471 3505 CPUID_INTC_EBX_7_0_BMI2)
3472 3506 add_x86_feature(featureset,
3473 3507 X86FSET_BMI2);
3474 3508
3475 3509 if (cpi->cpi_std[7].cp_ebx &
3476 3510 CPUID_INTC_EBX_7_0_AVX2)
3477 3511 add_x86_feature(featureset,
3478 3512 X86FSET_AVX2);
3479 3513 }
3480 3514
3481 3515 if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3482 3516 (cpi->cpi_std[7].cp_ebx &
3483 3517 CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3484 3518 add_x86_feature(featureset, X86FSET_AVX512F);
3485 3519
3486 3520 if (cpi->cpi_std[7].cp_ebx &
3487 3521 CPUID_INTC_EBX_7_0_AVX512DQ)
3488 3522 add_x86_feature(featureset,
3489 3523 X86FSET_AVX512DQ);
3490 3524 if (cpi->cpi_std[7].cp_ebx &
3491 3525 CPUID_INTC_EBX_7_0_AVX512IFMA)
3492 3526 add_x86_feature(featureset,
3493 3527 X86FSET_AVX512FMA);
3494 3528 if (cpi->cpi_std[7].cp_ebx &
3495 3529 CPUID_INTC_EBX_7_0_AVX512PF)
3496 3530 add_x86_feature(featureset,
3497 3531 X86FSET_AVX512PF);
3498 3532 if (cpi->cpi_std[7].cp_ebx &
3499 3533 CPUID_INTC_EBX_7_0_AVX512ER)
3500 3534 add_x86_feature(featureset,
3501 3535 X86FSET_AVX512ER);
3502 3536 if (cpi->cpi_std[7].cp_ebx &
3503 3537 CPUID_INTC_EBX_7_0_AVX512CD)
3504 3538 add_x86_feature(featureset,
3505 3539 X86FSET_AVX512CD);
3506 3540 if (cpi->cpi_std[7].cp_ebx &
3507 3541 CPUID_INTC_EBX_7_0_AVX512BW)
3508 3542 add_x86_feature(featureset,
3509 3543 X86FSET_AVX512BW);
3510 3544 if (cpi->cpi_std[7].cp_ebx &
3511 3545 CPUID_INTC_EBX_7_0_AVX512VL)
3512 3546 add_x86_feature(featureset,
3513 3547 X86FSET_AVX512VL);
3514 3548
3515 3549 if (cpi->cpi_std[7].cp_ecx &
3516 3550 CPUID_INTC_ECX_7_0_AVX512VBMI)
3517 3551 add_x86_feature(featureset,
3518 3552 X86FSET_AVX512VBMI);
3519 3553 if (cpi->cpi_std[7].cp_ecx &
3520 3554 CPUID_INTC_ECX_7_0_AVX512VNNI)
3521 3555 add_x86_feature(featureset,
3522 3556 X86FSET_AVX512VNNI);
3523 3557 if (cpi->cpi_std[7].cp_ecx &
3524 3558 CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3525 3559 add_x86_feature(featureset,
3526 3560 X86FSET_AVX512VPOPCDQ);
3527 3561
3528 3562 if (cpi->cpi_std[7].cp_edx &
3529 3563 CPUID_INTC_EDX_7_0_AVX5124NNIW)
3530 3564 add_x86_feature(featureset,
3531 3565 X86FSET_AVX512NNIW);
3532 3566 if (cpi->cpi_std[7].cp_edx &
3533 3567 CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3534 3568 add_x86_feature(featureset,
3535 3569 X86FSET_AVX512FMAPS);
3536 3570 }
3537 3571 }
3538 3572 }
3539 3573
3540 3574 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3541 3575 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3542 3576 add_x86_feature(featureset, X86FSET_PCID);
3543 3577 }
3544 3578 }
3545 3579
3546 3580 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3547 3581 add_x86_feature(featureset, X86FSET_X2APIC);
3548 3582 }
3549 3583 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3550 3584 add_x86_feature(featureset, X86FSET_DE);
3551 3585 }
3552 3586 #if !defined(__xpv)
3553 3587 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3554 3588
3555 3589 /*
3556 3590 * We require the CLFLUSH instruction for the erratum workaround
3557 3591 * needed to use MONITOR/MWAIT.
3558 3592 */
3559 3593 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3560 3594 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3561 3595 add_x86_feature(featureset, X86FSET_MWAIT);
3562 3596 } else {
3563 3597 extern int idle_cpu_assert_cflush_monitor;
3564 3598
3565 3599 /*
3566 3600 * All processors we are aware of which have
3567 3601 * MONITOR/MWAIT also have CLFLUSH.
3568 3602 */
3569 3603 if (idle_cpu_assert_cflush_monitor) {
3570 3604 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3571 3605 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3572 3606 }
3573 3607 }
3574 3608 }
3575 3609 #endif /* __xpv */
3576 3610
3577 3611 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3578 3612 add_x86_feature(featureset, X86FSET_VMX);
3579 3613 }
3580 3614
3581 3615 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3582 3616 add_x86_feature(featureset, X86FSET_RDRAND);
3583 3617
3584 3618 /*
3585 3619 * Only needed the first time; the rest of the cpus would follow suit.
3586 3620 * We only capture this for the boot cpu.
3587 3621 */
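/*
 * CPUID.1:%ebx[15:8] reports the CLFLUSH line size in 8-byte units, so
 * the common value of 8 yields x86_clflush_size = 64 bytes.
 */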
3588 3622 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3589 3623 add_x86_feature(featureset, X86FSET_CLFSH);
3590 3624 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3591 3625 }
3592 3626 if (is_x86_feature(featureset, X86FSET_PAE))
3593 3627 cpi->cpi_pabits = 36;
3594 3628
3595 3629 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3596 3630 struct cpuid_regs r, *ecp;
3597 3631
3598 3632 ecp = &r;
3599 3633 ecp->cp_eax = 0xD;
3600 3634 ecp->cp_ecx = 1;
3601 3635 ecp->cp_edx = ecp->cp_ebx = 0;
3602 3636 (void) __cpuid_insn(ecp);
3603 3637
3604 3638 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3605 3639 add_x86_feature(featureset, X86FSET_XSAVEOPT);
3606 3640 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3607 3641 add_x86_feature(featureset, X86FSET_XSAVEC);
3608 3642 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3609 3643 add_x86_feature(featureset, X86FSET_XSAVES);
3610 3644 }
3611 3645
3612 3646 /*
3613 3647 * Work on the "extended" feature information, doing
3614 3648 * some basic initialization for cpuid_pass2()
3615 3649 */
3616 3650 xcpuid = 0;
3617 3651 switch (cpi->cpi_vendor) {
3618 3652 case X86_VENDOR_Intel:
3619 3653 /*
3620 3654 * On KVM we know we will have proper support for extended
3621 3655 * cpuid.
3622 3656 */
3623 3657 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3624 3658 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3625 3659 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3626 3660 xcpuid++;
3627 3661 break;
3628 3662 case X86_VENDOR_AMD:
3629 3663 if (cpi->cpi_family > 5 ||
3630 3664 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3631 3665 xcpuid++;
3632 3666 break;
3633 3667 case X86_VENDOR_Cyrix:
3634 3668 /*
3635 3669 * Only these Cyrix CPUs are -known- to support
3636 3670 * extended cpuid operations.
3637 3671 */
3638 3672 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3639 3673 x86_type == X86_TYPE_CYRIX_GXm)
3640 3674 xcpuid++;
3641 3675 break;
3642 3676 case X86_VENDOR_Centaur:
3643 3677 case X86_VENDOR_TM:
3644 3678 default:
3645 3679 xcpuid++;
3646 3680 break;
3647 3681 }
3648 3682
3649 3683 if (xcpuid) {
3650 3684 cp = &cpi->cpi_extd[0];
3651 3685 cp->cp_eax = CPUID_LEAF_EXT_0;
3652 3686 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3653 3687 }
3654 3688
3655 3689 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3656 3690
3657 3691 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3658 3692 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3659 3693
3660 3694 switch (cpi->cpi_vendor) {
3661 3695 case X86_VENDOR_Intel:
3662 3696 case X86_VENDOR_AMD:
3663 3697 if (cpi->cpi_xmaxeax < 0x80000001)
3664 3698 break;
3665 3699 cp = &cpi->cpi_extd[1];
3666 3700 cp->cp_eax = 0x80000001;
3667 3701 (void) __cpuid_insn(cp);
3668 3702
3669 3703 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3670 3704 cpi->cpi_family == 5 &&
3671 3705 cpi->cpi_model == 6 &&
3672 3706 cpi->cpi_step == 6) {
3673 3707 /*
3674 3708 * K6 model 6 uses bit 10 to indicate SYSC.
3675 3709 * Later models use bit 11. Fix it here.
3676 3710 */
3677 3711 if (cp->cp_edx & 0x400) {
3678 3712 cp->cp_edx &= ~0x400;
3679 3713 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3680 3714 }
3681 3715 }
3682 3716
3683 3717 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3684 3718
3685 3719 /*
3686 3720 * Compute the additions to the kernel's feature word.
3687 3721 */
3688 3722 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3689 3723 add_x86_feature(featureset, X86FSET_NX);
3690 3724 }
3691 3725
3692 3726 /*
3693 3727 * Regardless of whether or not we boot 64-bit,
3694 3728 * we should have a way to identify whether
3695 3729 * the CPU is capable of running 64-bit.
3696 3730 */
3697 3731 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3698 3732 add_x86_feature(featureset, X86FSET_64);
3699 3733 }
3700 3734
3701 3735 /* 1 GB large page - enable only for 64 bit kernel */
3702 3736 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3703 3737 add_x86_feature(featureset, X86FSET_1GPG);
3704 3738 }
3705 3739
3706 3740 if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3707 3741 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3708 3742 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3709 3743 add_x86_feature(featureset, X86FSET_SSE4A);
3710 3744 }
3711 3745
3712 3746 /*
3713 3747 * It's really tricky to support syscall/sysret in
3714 3748 * the i386 kernel; we rely on sysenter/sysexit
3715 3749 * instead. In the amd64 kernel, things are -way-
3716 3750 * better.
3717 3751 */
3718 3752 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3719 3753 add_x86_feature(featureset, X86FSET_ASYSC);
3720 3754 }
3721 3755
3722 3756 /*
3723 3757 * While we're thinking about system calls, note
3724 3758 * that AMD processors don't support sysenter
3725 3759 * in long mode at all, so don't try to program them.
3726 3760 */
3727 3761 if (x86_vendor == X86_VENDOR_AMD) {
3728 3762 remove_x86_feature(featureset, X86FSET_SEP);
3729 3763 }
3730 3764
3731 3765 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3732 3766 add_x86_feature(featureset, X86FSET_TSCP);
3733 3767 }
3734 3768
3735 3769 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3736 3770 add_x86_feature(featureset, X86FSET_SVM);
3737 3771 }
3738 3772
3739 3773 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3740 3774 add_x86_feature(featureset, X86FSET_TOPOEXT);
3741 3775 }
3742 3776
3743 3777 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3744 3778 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3745 3779 }
3746 3780
3747 3781 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3748 3782 add_x86_feature(featureset, X86FSET_XOP);
3749 3783 }
3750 3784
3751 3785 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3752 3786 add_x86_feature(featureset, X86FSET_FMA4);
3753 3787 }
3754 3788
3755 3789 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3756 3790 add_x86_feature(featureset, X86FSET_TBM);
3757 3791 }
3758 3792
3759 3793 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3760 3794 add_x86_feature(featureset, X86FSET_MONITORX);
3761 3795 }
3762 3796 break;
3763 3797 default:
3764 3798 break;
3765 3799 }
3766 3800
3767 3801 /*
3768 3802 * Get CPUID data about processor cores and hyperthreads.
3769 3803 */
3770 3804 switch (cpi->cpi_vendor) {
3771 3805 case X86_VENDOR_Intel:
3772 3806 if (cpi->cpi_maxeax >= 4) {
3773 3807 cp = &cpi->cpi_std[4];
3774 3808 cp->cp_eax = 4;
3775 3809 cp->cp_ecx = 0;
3776 3810 (void) __cpuid_insn(cp);
3777 3811 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3778 3812 }
3779 3813 /*FALLTHROUGH*/
3780 3814 case X86_VENDOR_AMD:
3781 3815 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3782 3816 break;
3783 3817 cp = &cpi->cpi_extd[8];
3784 3818 cp->cp_eax = CPUID_LEAF_EXT_8;
3785 3819 (void) __cpuid_insn(cp);
3786 3820 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3787 3821 cp);
3788 3822
3789 3823 /*
3790 3824 * AMD uses ebx for some extended functions.
3791 3825 */
3792 3826 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3793 3827 /*
3794 3828 * While we're here, check for the AMD "Error
3795 3829 * Pointer Zero/Restore" feature. This can be
3796 3830 * used to set up the FP save handlers
3797 3831 * appropriately.
3798 3832 */
3799 3833 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3800 3834 cpi->cpi_fp_amd_save = 0;
3801 3835 } else {
3802 3836 cpi->cpi_fp_amd_save = 1;
3803 3837 }
3804 3838
3805 3839 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3806 3840 add_x86_feature(featureset,
3807 3841 X86FSET_CLZERO);
3808 3842 }
3809 3843 }
3810 3844
3811 3845 /*
3812 3846 * Virtual and physical address limits from
3813 3847 * cpuid override previously guessed values.
3814 3848 */
3815 3849 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3816 3850 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3817 3851 break;
3818 3852 default:
3819 3853 break;
3820 3854 }
3821 3855
3822 3856 /*
3823 3857 * Get CPUID data about TSC Invariance in Deep C-State.
3824 3858 */
3825 3859 switch (cpi->cpi_vendor) {
3826 3860 case X86_VENDOR_Intel:
3827 3861 case X86_VENDOR_AMD:
3828 3862 if (cpi->cpi_maxeax >= 7) {
3829 3863 cp = &cpi->cpi_extd[7];
3830 3864 cp->cp_eax = 0x80000007;
3831 3865 cp->cp_ecx = 0;
3832 3866 (void) __cpuid_insn(cp);
3833 3867 }
3834 3868 break;
3835 3869 default:
3836 3870 break;
3837 3871 }
3838 3872 }
3839 3873
3840 3874 cpuid_pass1_topology(cpu, featureset);
3841 3875 cpuid_pass1_thermal(cpu, featureset);
3842 3876
3843 3877 /*
3844 3878 * Synthesize chip "revision" and socket type
3845 3879 */
3846 3880 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3847 3881 cpi->cpi_model, cpi->cpi_step);
3848 3882 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3849 3883 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3850 3884 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3851 3885 cpi->cpi_model, cpi->cpi_step);
3852 3886
3853 3887 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3854 3888 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3855 3889 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3856 3890 /* Special handling for AMD FP not necessary. */
3857 3891 cpi->cpi_fp_amd_save = 0;
3858 3892 } else {
3859 3893 cpi->cpi_fp_amd_save = 1;
3860 3894 }
3861 3895 }
3862 3896
3863 3897 /*
3864 3898 * Check the processor leaves that are used for security features.
3865 3899 */
3866 3900 cpuid_scan_security(cpu, featureset);
3867 3901
3868 3902 pass1_done:
3869 3903 cpi->cpi_pass = 1;
3870 3904 }
3871 3905
3872 3906 /*
3873 3907 * Make copies of the cpuid table entries we depend on, in
3874 3908 * part for ease of parsing now, in part so that we have only
3875 3909 * one place to correct any of it, in part for ease of
3876 3910 * later export to userland, and in part so we can look at
3877 3911 * this stuff in a crash dump.
3878 3912 */
3879 3913
3880 3914 /*ARGSUSED*/
3881 3915 void
3882 3916 cpuid_pass2(cpu_t *cpu)
3883 3917 {
3884 3918 uint_t n, nmax;
3885 3919 int i;
3886 3920 struct cpuid_regs *cp;
3887 3921 uint8_t *dp;
3888 3922 uint32_t *iptr;
3889 3923 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3890 3924
3891 3925 ASSERT(cpi->cpi_pass == 1);
3892 3926
3893 3927 if (cpi->cpi_maxeax < 1)
3894 3928 goto pass2_done;
3895 3929
3896 3930 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3897 3931 nmax = NMAX_CPI_STD;
3898 3932 /*
3899 3933 * (We already handled n == 0 and n == 1 in pass 1)
3900 3934 */
3901 3935 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3902 3936 /*
3903 3937 * leaves 6 and 7 were handled in pass 1
3904 3938 */
3905 3939 if (n == 6 || n == 7)
3906 3940 continue;
3907 3941
3908 3942 cp->cp_eax = n;
3909 3943
3910 3944 /*
3911 3945 * CPUID function 4 expects %ecx to be initialized
3912 3946 * with an index which indicates which cache to return
3913 3947 * information about. The OS is expected to call function 4
3914 3948 * with %ecx set to 0, 1, 2, ... until it returns with
3915 3949 * EAX[4:0] set to 0, which indicates there are no more
3916 3950 * caches.
3917 3951 *
3918 3952 * Here, populate cpi_std[4] with the information returned by
3919 3953 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3920 3954 * when dynamic memory allocation becomes available.
3921 3955 *
3922 3956 * Note: we need to explicitly initialize %ecx here, since
3923 3957 * function 4 may have been previously invoked.
3924 3958 */
3925 3959 if (n == 4)
3926 3960 cp->cp_ecx = 0;
3927 3961
3928 3962 (void) __cpuid_insn(cp);
3929 3963 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3930 3964 switch (n) {
3931 3965 case 2:
3932 3966 /*
3933 3967 * "the lower 8 bits of the %eax register
3934 3968 * contain a value that identifies the number
3935 3969 * of times the cpuid [instruction] has to be
3936 3970 * executed to obtain a complete image of the
3937 3971 * processor's caching systems."
3938 3972 *
3939 3973 * How *do* they make this stuff up?
3940 3974 */
3941 3975 cpi->cpi_ncache = sizeof (*cp) *
3942 3976 BITX(cp->cp_eax, 7, 0);
3943 3977 if (cpi->cpi_ncache == 0)
3944 3978 break;
3945 3979 cpi->cpi_ncache--; /* skip count byte */
3946 3980
3947 3981 /*
3948 3982 * Well, for now, rather than attempt to implement
3949 3983 * this slightly dubious algorithm, we just look
3950 3984 * at the first 15 ..
3951 3985 */
3952 3986 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3953 3987 cpi->cpi_ncache = sizeof (*cp) - 1;
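/*
 * For example, %al == 1 (the usual case) gives sizeof (*cp) * 1 = 16
 * descriptor bytes; dropping the count byte and capping at 15 leaves
 * exactly the room available in the four 32-bit registers below.
 */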
3954 3988
3955 3989 dp = cpi->cpi_cacheinfo;
3956 3990 if (BITX(cp->cp_eax, 31, 31) == 0) {
3957 3991 uint8_t *p = (void *)&cp->cp_eax;
3958 3992 for (i = 1; i < 4; i++)
3959 3993 if (p[i] != 0)
3960 3994 *dp++ = p[i];
3961 3995 }
3962 3996 if (BITX(cp->cp_ebx, 31, 31) == 0) {
3963 3997 uint8_t *p = (void *)&cp->cp_ebx;
3964 3998 for (i = 0; i < 4; i++)
3965 3999 if (p[i] != 0)
3966 4000 *dp++ = p[i];
3967 4001 }
3968 4002 if (BITX(cp->cp_ecx, 31, 31) == 0) {
3969 4003 uint8_t *p = (void *)&cp->cp_ecx;
3970 4004 for (i = 0; i < 4; i++)
3971 4005 if (p[i] != 0)
3972 4006 *dp++ = p[i];
3973 4007 }
3974 4008 if (BITX(cp->cp_edx, 31, 31) == 0) {
3975 4009 uint8_t *p = (void *)&cp->cp_edx;
3976 4010 for (i = 0; i < 4; i++)
3977 4011 if (p[i] != 0)
3978 4012 *dp++ = p[i];
3979 4013 }
3980 4014 break;
3981 4015
3982 4016 case 3: /* Processor serial number, if PSN supported */
3983 4017 break;
3984 4018
3985 4019 case 4: /* Deterministic cache parameters */
3986 4020 break;
3987 4021
3988 4022 case 5: /* Monitor/Mwait parameters */
3989 4023 {
3990 4024 size_t mwait_size;
3991 4025
3992 4026 /*
3993 4027 * check cpi_mwait.support which was set in cpuid_pass1
3994 4028 */
3995 4029 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
3996 4030 break;
3997 4031
3998 4032 /*
3999 4033 * Protect ourselves from an insane mwait line size.
4000 4034 * Workaround for incomplete hardware emulator(s).
4001 4035 */
4002 4036 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4003 4037 if (mwait_size < sizeof (uint32_t) ||
4004 4038 !ISP2(mwait_size)) {
4005 4039 #if DEBUG
4006 4040 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4007 4041 "size %ld", cpu->cpu_id, (long)mwait_size);
4008 4042 #endif
4009 4043 break;
4010 4044 }
4011 4045
4012 4046 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4013 4047 cpi->cpi_mwait.mon_max = mwait_size;
4014 4048 if (MWAIT_EXTENSION(cpi)) {
4015 4049 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4016 4050 if (MWAIT_INT_ENABLE(cpi))
4017 4051 cpi->cpi_mwait.support |=
4018 4052 MWAIT_ECX_INT_ENABLE;
4019 4053 }
4020 4054 break;
4021 4055 }
4022 4056 default:
4023 4057 break;
4024 4058 }
4025 4059 }
4026 4060
4027 4061 /*
4028 4062 * XSAVE enumeration
4029 4063 */
4030 4064 if (cpi->cpi_maxeax >= 0xD) {
4031 4065 struct cpuid_regs regs;
4032 4066 boolean_t cpuid_d_valid = B_TRUE;
4033 4067
4034 4068 cp = &regs;
4035 4069 cp->cp_eax = 0xD;
4036 4070 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4037 4071
4038 4072 (void) __cpuid_insn(cp);
4039 4073
4040 4074 /*
4041 4075 * Sanity checks for debug
4042 4076 */
4043 4077 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4044 4078 (cp->cp_eax & XFEATURE_SSE) == 0) {
4045 4079 cpuid_d_valid = B_FALSE;
4046 4080 }
4047 4081
4048 4082 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4049 4083 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4050 4084 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4051 4085
4052 4086 /*
4053 4087 * If the hw supports AVX, get the size and offset in the save
4054 4088 * area for the ymm state.
4055 4089 */
4056 4090 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4057 4091 cp->cp_eax = 0xD;
4058 4092 cp->cp_ecx = 2;
4059 4093 cp->cp_edx = cp->cp_ebx = 0;
4060 4094
4061 4095 (void) __cpuid_insn(cp);
4062 4096
4063 4097 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4064 4098 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4065 4099 cpuid_d_valid = B_FALSE;
4066 4100 }
4067 4101
4068 4102 cpi->cpi_xsave.ymm_size = cp->cp_eax;
4069 4103 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4070 4104 }
4071 4105
4072 4106 /*
4073 4107 * If the hw supports MPX, get the size and offset in the
4074 4108 * save area for BNDREGS and BNDCSR.
4075 4109 */
4076 4110 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4077 4111 cp->cp_eax = 0xD;
4078 4112 cp->cp_ecx = 3;
4079 4113 cp->cp_edx = cp->cp_ebx = 0;
4080 4114
4081 4115 (void) __cpuid_insn(cp);
4082 4116
4083 4117 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4084 4118 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4085 4119
4086 4120 cp->cp_eax = 0xD;
4087 4121 cp->cp_ecx = 4;
4088 4122 cp->cp_edx = cp->cp_ebx = 0;
4089 4123
4090 4124 (void) __cpuid_insn(cp);
4091 4125
4092 4126 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4093 4127 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4094 4128 }
4095 4129
4096 4130 /*
4097 4131 * If the hw supports AVX512, get the size and offset in the
4098 4132 * save area for the opmask registers and zmm state.
4099 4133 */
4100 4134 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4101 4135 cp->cp_eax = 0xD;
4102 4136 cp->cp_ecx = 5;
4103 4137 cp->cp_edx = cp->cp_ebx = 0;
4104 4138
4105 4139 (void) __cpuid_insn(cp);
4106 4140
4107 4141 cpi->cpi_xsave.opmask_size = cp->cp_eax;
4108 4142 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4109 4143
4110 4144 cp->cp_eax = 0xD;
4111 4145 cp->cp_ecx = 6;
4112 4146 cp->cp_edx = cp->cp_ebx = 0;
4113 4147
4114 4148 (void) __cpuid_insn(cp);
4115 4149
4116 4150 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4117 4151 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4118 4152
4119 4153 cp->cp_eax = 0xD;
4120 4154 cp->cp_ecx = 7;
4121 4155 cp->cp_edx = cp->cp_ebx = 0;
4122 4156
4123 4157 (void) __cpuid_insn(cp);
4124 4158
4125 4159 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4126 4160 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4127 4161 }
4128 4162
4129 4163 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4130 4164 xsave_state_size = 0;
4131 4165 } else if (cpuid_d_valid) {
4132 4166 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4133 4167 } else {
4134 4168 /* Broken CPUID 0xD, probably in HVM */
4135 4169 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4136 4170 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4137 4171 ", ymm_size = %d, ymm_offset = %d\n",
4138 4172 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4139 4173 cpi->cpi_xsave.xsav_hw_features_high,
4140 4174 (int)cpi->cpi_xsave.xsav_max_size,
4141 4175 (int)cpi->cpi_xsave.ymm_size,
4142 4176 (int)cpi->cpi_xsave.ymm_offset);
4143 4177
4144 4178 if (xsave_state_size != 0) {
4145 4179 /*
4146 4180 * This must be a non-boot CPU. We cannot
4147 4181 * continue, because the boot cpu has already
4148 4182 * enabled XSAVE.
4149 4183 */
4150 4184 ASSERT(cpu->cpu_id != 0);
4151 4185 cmn_err(CE_PANIC, "cpu%d: we have already "
4152 4186 "enabled XSAVE on boot cpu, cannot "
4153 4187 "continue.", cpu->cpu_id);
4154 4188 } else {
4155 4189 /*
4156 4190 * If we reached here on the boot CPU, it's also
4157 4191 * almost certain that we'll reach here on the
4158 4192 * non-boot CPUs. When we're here on a boot CPU
4159 4193 * we should disable the feature, on a non-boot
4160 4194 * CPU we need to confirm that we have.
4161 4195 */
4162 4196 if (cpu->cpu_id == 0) {
4163 4197 remove_x86_feature(x86_featureset,
4164 4198 X86FSET_XSAVE);
4165 4199 remove_x86_feature(x86_featureset,
4166 4200 X86FSET_AVX);
4167 4201 remove_x86_feature(x86_featureset,
4168 4202 X86FSET_F16C);
4169 4203 remove_x86_feature(x86_featureset,
4170 4204 X86FSET_BMI1);
4171 4205 remove_x86_feature(x86_featureset,
4172 4206 X86FSET_BMI2);
4173 4207 remove_x86_feature(x86_featureset,
4174 4208 X86FSET_FMA);
4175 4209 remove_x86_feature(x86_featureset,
4176 4210 X86FSET_AVX2);
4177 4211 remove_x86_feature(x86_featureset,
4178 4212 X86FSET_MPX);
4179 4213 remove_x86_feature(x86_featureset,
4180 4214 X86FSET_AVX512F);
4181 4215 remove_x86_feature(x86_featureset,
4182 4216 X86FSET_AVX512DQ);
4183 4217 remove_x86_feature(x86_featureset,
4184 4218 X86FSET_AVX512PF);
4185 4219 remove_x86_feature(x86_featureset,
4186 4220 X86FSET_AVX512ER);
4187 4221 remove_x86_feature(x86_featureset,
4188 4222 X86FSET_AVX512CD);
4189 4223 remove_x86_feature(x86_featureset,
4190 4224 X86FSET_AVX512BW);
4191 4225 remove_x86_feature(x86_featureset,
4192 4226 X86FSET_AVX512VL);
4193 4227 remove_x86_feature(x86_featureset,
4194 4228 X86FSET_AVX512FMA);
4195 4229 remove_x86_feature(x86_featureset,
4196 4230 X86FSET_AVX512VBMI);
4197 4231 remove_x86_feature(x86_featureset,
4198 4232 X86FSET_AVX512VNNI);
4199 4233 remove_x86_feature(x86_featureset,
4200 4234 X86FSET_AVX512VPOPCDQ);
4201 4235 remove_x86_feature(x86_featureset,
4202 4236 X86FSET_AVX512NNIW);
4203 4237 remove_x86_feature(x86_featureset,
4204 4238 X86FSET_AVX512FMAPS);
4205 4239
4206 4240 CPI_FEATURES_ECX(cpi) &=
4207 4241 ~CPUID_INTC_ECX_XSAVE;
4208 4242 CPI_FEATURES_ECX(cpi) &=
4209 4243 ~CPUID_INTC_ECX_AVX;
4210 4244 CPI_FEATURES_ECX(cpi) &=
4211 4245 ~CPUID_INTC_ECX_F16C;
4212 4246 CPI_FEATURES_ECX(cpi) &=
4213 4247 ~CPUID_INTC_ECX_FMA;
4214 4248 CPI_FEATURES_7_0_EBX(cpi) &=
4215 4249 ~CPUID_INTC_EBX_7_0_BMI1;
4216 4250 CPI_FEATURES_7_0_EBX(cpi) &=
4217 4251 ~CPUID_INTC_EBX_7_0_BMI2;
4218 4252 CPI_FEATURES_7_0_EBX(cpi) &=
4219 4253 ~CPUID_INTC_EBX_7_0_AVX2;
4220 4254 CPI_FEATURES_7_0_EBX(cpi) &=
4221 4255 ~CPUID_INTC_EBX_7_0_MPX;
4222 4256 CPI_FEATURES_7_0_EBX(cpi) &=
4223 4257 ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4224 4258
4225 4259 CPI_FEATURES_7_0_ECX(cpi) &=
4226 4260 ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4227 4261
4228 4262 CPI_FEATURES_7_0_EDX(cpi) &=
4229 4263 ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4230 4264
4231 4265 xsave_force_disable = B_TRUE;
4232 4266 } else {
4233 4267 VERIFY(is_x86_feature(x86_featureset,
4234 4268 X86FSET_XSAVE) == B_FALSE);
4235 4269 }
4236 4270 }
4237 4271 }
4238 4272 }
4239 4273
4240 4274
4241 4275 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4242 4276 goto pass2_done;
4243 4277
4244 4278 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4245 4279 nmax = NMAX_CPI_EXTD;
4246 4280 /*
4247 4281 * Copy the extended properties, fixing them as we go.
4248 4282 * (We already handled n == 0 and n == 1 in pass 1)
4249 4283 */
4250 4284 iptr = (void *)cpi->cpi_brandstr;
4251 4285 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4252 4286 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4253 4287 (void) __cpuid_insn(cp);
4254 4288 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4255 4289 cp);
4256 4290 switch (n) {
4257 4291 case 2:
4258 4292 case 3:
4259 4293 case 4:
4260 4294 /*
4261 4295 * Extract the brand string
4262 4296 */
4263 4297 *iptr++ = cp->cp_eax;
4264 4298 *iptr++ = cp->cp_ebx;
4265 4299 *iptr++ = cp->cp_ecx;
4266 4300 *iptr++ = cp->cp_edx;
4267 4301 break;
4268 4302 case 5:
4269 4303 switch (cpi->cpi_vendor) {
4270 4304 case X86_VENDOR_AMD:
4271 4305 /*
4272 4306 * The Athlon and Duron were the first
4273 4307 * parts to report the sizes of the
4274 4308 * TLB for large pages. Before then,
4275 4309 * we don't trust the data.
4276 4310 */
4277 4311 if (cpi->cpi_family < 6 ||
4278 4312 (cpi->cpi_family == 6 &&
4279 4313 cpi->cpi_model < 1))
4280 4314 cp->cp_eax = 0;
4281 4315 break;
4282 4316 default:
4283 4317 break;
4284 4318 }
4285 4319 break;
4286 4320 case 6:
4287 4321 switch (cpi->cpi_vendor) {
4288 4322 case X86_VENDOR_AMD:
4289 4323 /*
4290 4324 * The Athlon and Duron were the first
4291 4325 * AMD parts with L2 TLB's.
4292 4326 * Before then, don't trust the data.
4293 4327 */
4294 4328 if (cpi->cpi_family < 6 ||
4295 4329 cpi->cpi_family == 6 &&
4296 4330 cpi->cpi_model < 1)
4297 4331 cp->cp_eax = cp->cp_ebx = 0;
4298 4332 /*
4299 4333 * AMD Duron rev A0 reports L2
4300 4334 * cache size incorrectly as 1K
4301 4335 * when it is really 64K
4302 4336 */
4303 4337 if (cpi->cpi_family == 6 &&
4304 4338 cpi->cpi_model == 3 &&
4305 4339 cpi->cpi_step == 0) {
4306 4340 cp->cp_ecx &= 0xffff;
4307 4341 cp->cp_ecx |= 0x400000;
4308 4342 }
4309 4343 break;
4310 4344 case X86_VENDOR_Cyrix: /* VIA C3 */
4311 4345 /*
4312 4346 * VIA C3 processors are a bit messed
4313 4347 * up w.r.t. encoding cache sizes in %ecx
4314 4348 */
4315 4349 if (cpi->cpi_family != 6)
4316 4350 break;
4317 4351 /*
4318 4352 * model 7 and 8 were incorrectly encoded
4319 4353 *
4320 4354 * xxx is model 8 really broken?
4321 4355 */
4322 4356 if (cpi->cpi_model == 7 ||
4323 4357 cpi->cpi_model == 8)
4324 4358 cp->cp_ecx =
4325 4359 BITX(cp->cp_ecx, 31, 24) << 16 |
4326 4360 BITX(cp->cp_ecx, 23, 16) << 12 |
4327 4361 BITX(cp->cp_ecx, 15, 8) << 8 |
4328 4362 BITX(cp->cp_ecx, 7, 0);
4329 4363 /*
4330 4364 * model 9 stepping 1 has wrong associativity
4331 4365 */
4332 4366 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4333 4367 cp->cp_ecx |= 8 << 12;
4334 4368 break;
4335 4369 case X86_VENDOR_Intel:
4336 4370 /*
4337 4371 * Extended L2 Cache features function.
4338 4372 * First appeared on Prescott.
4339 4373 */
4340 4374 default:
4341 4375 break;
4342 4376 }
4343 4377 break;
4344 4378 default:
4345 4379 break;
4346 4380 }
4347 4381 }
4348 4382
4349 4383 pass2_done:
4350 4384 cpi->cpi_pass = 2;
4351 4385 }
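
Editor's note: the leaf 2 handling in cpuid_pass2() above harvests the non-zero cache-descriptor bytes from %eax/%ebx/%ecx/%edx whenever bit 31 of the corresponding register is clear, skipping the iteration-count byte in %al and capping the result at 15 entries. The userland sketch below mirrors that walk for illustration only; it assumes GCC/Clang's <cpuid.h> helper and is not part of this change.

/*
 * Illustrative sketch: harvest CPUID leaf 2 cache descriptors the same
 * way cpuid_pass2() does, but from userland via <cpuid.h>.
 */
#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
	unsigned int r[4];		/* eax, ebx, ecx, edx */
	unsigned char desc[15];
	int i, j, n = 0;

	if (!__get_cpuid(2, &r[0], &r[1], &r[2], &r[3]))
		return (1);

	for (i = 0; i < 4; i++) {
		const unsigned char *p = (const unsigned char *)&r[i];

		/* Bit 31 set means the register carries no descriptors. */
		if (r[i] & 0x80000000U)
			continue;
		/* Byte 0 of %eax is the iteration count, not a descriptor. */
		for (j = (i == 0) ? 1 : 0; j < 4; j++) {
			if (p[j] != 0 && n < 15)
				desc[n++] = p[j];
		}
	}

	for (i = 0; i < n; i++)
		printf("descriptor 0x%02x\n", desc[i]);
	return (0);
}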
4352 4386
4353 4387 static const char *
4354 4388 intel_cpubrand(const struct cpuid_info *cpi)
4355 4389 {
4356 4390 int i;
4357 4391
4358 4392 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4359 4393 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4360 4394 return ("i486");
4361 4395
4362 4396 switch (cpi->cpi_family) {
4363 4397 case 5:
4364 4398 return ("Intel Pentium(r)");
4365 4399 case 6:
4366 4400 switch (cpi->cpi_model) {
4367 4401 uint_t celeron, xeon;
4368 4402 const struct cpuid_regs *cp;
4369 4403 case 0:
4370 4404 case 1:
4371 4405 case 2:
4372 4406 return ("Intel Pentium(r) Pro");
4373 4407 case 3:
4374 4408 case 4:
4375 4409 return ("Intel Pentium(r) II");
4376 4410 case 6:
4377 4411 return ("Intel Celeron(r)");
4378 4412 case 5:
4379 4413 case 7:
4380 4414 celeron = xeon = 0;
4381 4415 cp = &cpi->cpi_std[2]; /* cache info */
4382 4416
4383 4417 for (i = 1; i < 4; i++) {
4384 4418 uint_t tmp;
4385 4419
4386 4420 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4387 4421 if (tmp == 0x40)
4388 4422 celeron++;
4389 4423 if (tmp >= 0x44 && tmp <= 0x45)
4390 4424 xeon++;
4391 4425 }
4392 4426
4393 4427 for (i = 0; i < 2; i++) {
4394 4428 uint_t tmp;
4395 4429
4396 4430 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4397 4431 if (tmp == 0x40)
4398 4432 celeron++;
4399 4433 else if (tmp >= 0x44 && tmp <= 0x45)
4400 4434 xeon++;
4401 4435 }
4402 4436
4403 4437 for (i = 0; i < 4; i++) {
4404 4438 uint_t tmp;
4405 4439
4406 4440 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4407 4441 if (tmp == 0x40)
4408 4442 celeron++;
4409 4443 else if (tmp >= 0x44 && tmp <= 0x45)
4410 4444 xeon++;
4411 4445 }
4412 4446
4413 4447 for (i = 0; i < 4; i++) {
4414 4448 uint_t tmp;
4415 4449
4416 4450 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4417 4451 if (tmp == 0x40)
4418 4452 celeron++;
4419 4453 else if (tmp >= 0x44 && tmp <= 0x45)
4420 4454 xeon++;
4421 4455 }
4422 4456
4423 4457 if (celeron)
4424 4458 return ("Intel Celeron(r)");
4425 4459 if (xeon)
4426 4460 return (cpi->cpi_model == 5 ?
4427 4461 "Intel Pentium(r) II Xeon(tm)" :
4428 4462 "Intel Pentium(r) III Xeon(tm)");
4429 4463 return (cpi->cpi_model == 5 ?
4430 4464 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4431 4465 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4432 4466 default:
4433 4467 break;
4434 4468 }
4435 4469 default:
4436 4470 break;
4437 4471 }
4438 4472
4439 4473 /* BrandID is present if the field is nonzero */
4440 4474 if (cpi->cpi_brandid != 0) {
4441 4475 static const struct {
4442 4476 uint_t bt_bid;
4443 4477 const char *bt_str;
4444 4478 } brand_tbl[] = {
4445 4479 { 0x1, "Intel(r) Celeron(r)" },
4446 4480 { 0x2, "Intel(r) Pentium(r) III" },
4447 4481 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
4448 4482 { 0x4, "Intel(r) Pentium(r) III" },
4449 4483 { 0x6, "Mobile Intel(r) Pentium(r) III" },
4450 4484 { 0x7, "Mobile Intel(r) Celeron(r)" },
4451 4485 { 0x8, "Intel(r) Pentium(r) 4" },
4452 4486 { 0x9, "Intel(r) Pentium(r) 4" },
4453 4487 { 0xa, "Intel(r) Celeron(r)" },
4454 4488 { 0xb, "Intel(r) Xeon(tm)" },
4455 4489 { 0xc, "Intel(r) Xeon(tm) MP" },
4456 4490 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
4457 4491 { 0xf, "Mobile Intel(r) Celeron(r)" },
4458 4492 { 0x11, "Mobile Genuine Intel(r)" },
4459 4493 { 0x12, "Intel(r) Celeron(r) M" },
4460 4494 { 0x13, "Mobile Intel(r) Celeron(r)" },
4461 4495 { 0x14, "Intel(r) Celeron(r)" },
4462 4496 { 0x15, "Mobile Genuine Intel(r)" },
4463 4497 { 0x16, "Intel(r) Pentium(r) M" },
4464 4498 { 0x17, "Mobile Intel(r) Celeron(r)" }
4465 4499 };
4466 4500 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4467 4501 uint_t sgn;
4468 4502
4469 4503 sgn = (cpi->cpi_family << 8) |
4470 4504 (cpi->cpi_model << 4) | cpi->cpi_step;
4471 4505
4472 4506 for (i = 0; i < btblmax; i++)
4473 4507 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4474 4508 break;
4475 4509 if (i < btblmax) {
4476 4510 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4477 4511 return ("Intel(r) Celeron(r)");
4478 4512 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4479 4513 return ("Intel(r) Xeon(tm) MP");
4480 4514 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4481 4515 return ("Intel(r) Xeon(tm)");
4482 4516 return (brand_tbl[i].bt_str);
4483 4517 }
4484 4518 }
4485 4519
4486 4520 return (NULL);
4487 4521 }
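
Editor's note: the brand-ID special cases in intel_cpubrand() above key off a packed signature of (family << 8) | (model << 4) | step. A small worked example of that packing is below; the values are hypothetical and chosen only to show that family 6, model 0xb, stepping 1 packs to 0x6b1, the signature tested against brand ID 3.

/* Worked example of the packed brand signature used above. */
#include <stdio.h>

static unsigned int
brand_sig(unsigned int family, unsigned int model, unsigned int step)
{
	return ((family << 8) | (model << 4) | step);
}

int
main(void)
{
	/* Family 6, model 0xb, stepping 1 -> 0x6b1 (the Celeron special case). */
	printf("0x%x\n", brand_sig(6, 0xb, 1));
	return (0);
}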
4488 4522
4489 4523 static const char *
4490 4524 amd_cpubrand(const struct cpuid_info *cpi)
4491 4525 {
4492 4526 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4493 4527 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4494 4528 return ("i486 compatible");
4495 4529
4496 4530 switch (cpi->cpi_family) {
4497 4531 case 5:
4498 4532 switch (cpi->cpi_model) {
4499 4533 case 0:
4500 4534 case 1:
4501 4535 case 2:
4502 4536 case 3:
4503 4537 case 4:
4504 4538 case 5:
4505 4539 return ("AMD-K5(r)");
4506 4540 case 6:
4507 4541 case 7:
4508 4542 return ("AMD-K6(r)");
4509 4543 case 8:
4510 4544 return ("AMD-K6(r)-2");
4511 4545 case 9:
4512 4546 return ("AMD-K6(r)-III");
4513 4547 default:
4514 4548 return ("AMD (family 5)");
4515 4549 }
4516 4550 case 6:
4517 4551 switch (cpi->cpi_model) {
4518 4552 case 1:
4519 4553 return ("AMD-K7(tm)");
4520 4554 case 0:
4521 4555 case 2:
4522 4556 case 4:
4523 4557 return ("AMD Athlon(tm)");
4524 4558 case 3:
4525 4559 case 7:
4526 4560 return ("AMD Duron(tm)");
4527 4561 case 6:
4528 4562 case 8:
4529 4563 case 10:
4530 4564 /*
4531 4565 * Use the L2 cache size to distinguish
4532 4566 */
4533 4567 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4534 4568 "AMD Athlon(tm)" : "AMD Duron(tm)");
4535 4569 default:
4536 4570 return ("AMD (family 6)");
4537 4571 }
4538 4572 default:
4539 4573 break;
4540 4574 }
4541 4575
4542 4576 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4543 4577 cpi->cpi_brandid != 0) {
4544 4578 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4545 4579 case 3:
4546 4580 return ("AMD Opteron(tm) UP 1xx");
4547 4581 case 4:
4548 4582 return ("AMD Opteron(tm) DP 2xx");
4549 4583 case 5:
4550 4584 return ("AMD Opteron(tm) MP 8xx");
4551 4585 default:
4552 4586 return ("AMD Opteron(tm)");
4553 4587 }
4554 4588 }
4555 4589
4556 4590 return (NULL);
4557 4591 }
4558 4592
4559 4593 static const char *
4560 4594 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4561 4595 {
4562 4596 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4563 4597 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4564 4598 type == X86_TYPE_CYRIX_486)
4565 4599 return ("i486 compatible");
4566 4600
4567 4601 switch (type) {
4568 4602 case X86_TYPE_CYRIX_6x86:
4569 4603 return ("Cyrix 6x86");
4570 4604 case X86_TYPE_CYRIX_6x86L:
4571 4605 return ("Cyrix 6x86L");
4572 4606 case X86_TYPE_CYRIX_6x86MX:
4573 4607 return ("Cyrix 6x86MX");
4574 4608 case X86_TYPE_CYRIX_GXm:
4575 4609 return ("Cyrix GXm");
4576 4610 case X86_TYPE_CYRIX_MediaGX:
4577 4611 return ("Cyrix MediaGX");
4578 4612 case X86_TYPE_CYRIX_MII:
4579 4613 return ("Cyrix M2");
4580 4614 case X86_TYPE_VIA_CYRIX_III:
4581 4615 return ("VIA Cyrix M3");
4582 4616 default:
4583 4617 /*
4584 4618 * Have another wild guess ..
4585 4619 */
4586 4620 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4587 4621 return ("Cyrix 5x86");
4588 4622 else if (cpi->cpi_family == 5) {
4589 4623 switch (cpi->cpi_model) {
4590 4624 case 2:
4591 4625 return ("Cyrix 6x86"); /* Cyrix M1 */
4592 4626 case 4:
4593 4627 return ("Cyrix MediaGX");
4594 4628 default:
4595 4629 break;
4596 4630 }
4597 4631 } else if (cpi->cpi_family == 6) {
4598 4632 switch (cpi->cpi_model) {
4599 4633 case 0:
4600 4634 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4601 4635 case 5:
4602 4636 case 6:
4603 4637 case 7:
4604 4638 case 8:
4605 4639 case 9:
4606 4640 return ("VIA C3");
4607 4641 default:
4608 4642 break;
4609 4643 }
4610 4644 }
4611 4645 break;
4612 4646 }
4613 4647 return (NULL);
4614 4648 }
4615 4649
4616 4650 /*
4617 4651 * This only gets called in the case that the CPU extended
4618 4652 * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
4619 4653 * aren't available, or contain null bytes for some reason.
4620 4654 */
4621 4655 static void
4622 4656 fabricate_brandstr(struct cpuid_info *cpi)
4623 4657 {
4624 4658 const char *brand = NULL;
4625 4659
4626 4660 switch (cpi->cpi_vendor) {
4627 4661 case X86_VENDOR_Intel:
4628 4662 brand = intel_cpubrand(cpi);
4629 4663 break;
4630 4664 case X86_VENDOR_AMD:
4631 4665 brand = amd_cpubrand(cpi);
4632 4666 break;
4633 4667 case X86_VENDOR_Cyrix:
4634 4668 brand = cyrix_cpubrand(cpi, x86_type);
4635 4669 break;
4636 4670 case X86_VENDOR_NexGen:
4637 4671 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4638 4672 brand = "NexGen Nx586";
4639 4673 break;
4640 4674 case X86_VENDOR_Centaur:
4641 4675 if (cpi->cpi_family == 5)
4642 4676 switch (cpi->cpi_model) {
4643 4677 case 4:
4644 4678 brand = "Centaur C6";
4645 4679 break;
4646 4680 case 8:
4647 4681 brand = "Centaur C2";
4648 4682 break;
4649 4683 case 9:
4650 4684 brand = "Centaur C3";
4651 4685 break;
4652 4686 default:
4653 4687 break;
4654 4688 }
4655 4689 break;
4656 4690 case X86_VENDOR_Rise:
4657 4691 if (cpi->cpi_family == 5 &&
4658 4692 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4659 4693 brand = "Rise mP6";
4660 4694 break;
4661 4695 case X86_VENDOR_SiS:
4662 4696 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4663 4697 brand = "SiS 55x";
4664 4698 break;
4665 4699 case X86_VENDOR_TM:
4666 4700 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4667 4701 brand = "Transmeta Crusoe TM3x00 or TM5x00";
4668 4702 break;
4669 4703 case X86_VENDOR_NSC:
4670 4704 case X86_VENDOR_UMC:
4671 4705 default:
4672 4706 break;
4673 4707 }
4674 4708 if (brand) {
4675 4709 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4676 4710 return;
4677 4711 }
4678 4712
4679 4713 /*
4680 4714 * If all else fails ...
4681 4715 */
4682 4716 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4683 4717 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4684 4718 cpi->cpi_model, cpi->cpi_step);
4685 4719 }
4686 4720
4687 4721 /*
4688 4722 * This routine is called just after kernel memory allocation
4689 4723 * becomes available on cpu0, and as part of mp_startup() on
4690 4724 * the other cpus.
4691 4725 *
4692 4726 * Fixup the brand string, and collect any information from cpuid
4693 4727 * that requires dynamically allocated storage to represent.
4694 4728 */
4695 4729 /*ARGSUSED*/
4696 4730 void
4697 4731 cpuid_pass3(cpu_t *cpu)
4698 4732 {
4699 4733 int i, max, shft, level, size;
4700 4734 struct cpuid_regs regs;
4701 4735 struct cpuid_regs *cp;
4702 4736 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4703 4737
4704 4738 ASSERT(cpi->cpi_pass == 2);
4705 4739
4706 4740 /*
4707 4741 * Deterministic cache parameters
4708 4742 *
4709 4743 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4710 4744 * values that are present are currently defined to be the same. This
4711 4745 * means we can use the same logic to parse it as long as we use the
4712 4746 * appropriate leaf to get the data. If you're updating this, make sure
4713 4747 * you're careful about which vendor supports which aspect.
4714 4748 *
4715 4749 * Take this opportunity to detect the number of threads sharing the
4716 4750 * last level cache, and construct a corresponding cache id. The
4717 4751 * respective cpuid_info members are initialized to the default case of
4718 4752 * "no last level cache sharing".
4719 4753 */
4720 4754 cpi->cpi_ncpu_shr_last_cache = 1;
4721 4755 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4722 4756
4723 4757 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4724 4758 (cpi->cpi_vendor == X86_VENDOR_AMD &&
4725 4759 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4726 4760 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4727 4761 uint32_t leaf;
4728 4762
4729 4763 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4730 4764 leaf = 4;
4731 4765 } else {
4732 4766 leaf = CPUID_LEAF_EXT_1d;
4733 4767 }
4734 4768
4735 4769 /*
4736 4770 * Find the # of elements (size) returned by the leaf and along
4737 4771 * the way detect last level cache sharing details.
4738 4772 */
4739 4773 bzero(&regs, sizeof (regs));
4740 4774 cp = &regs;
4741 4775 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4742 4776 cp->cp_eax = leaf;
4743 4777 cp->cp_ecx = i;
4744 4778
4745 4779 (void) __cpuid_insn(cp);
4746 4780
4747 4781 if (CPI_CACHE_TYPE(cp) == 0)
4748 4782 break;
4749 4783 level = CPI_CACHE_LVL(cp);
4750 4784 if (level > max) {
4751 4785 max = level;
4752 4786 cpi->cpi_ncpu_shr_last_cache =
4753 4787 CPI_NTHR_SHR_CACHE(cp) + 1;
4754 4788 }
4755 4789 }
4756 4790 cpi->cpi_cache_leaf_size = size = i;
4757 4791
4758 4792 /*
4759 4793 * Allocate the cpi_cache_leaves array. The first element
4760 4794 * references the regs for the corresponding leaf with %ecx set
4761 4795 * to 0. This was gathered in cpuid_pass2().
4762 4796 */
4763 4797 if (size > 0) {
4764 4798 cpi->cpi_cache_leaves =
4765 4799 kmem_alloc(size * sizeof (cp), KM_SLEEP);
4766 4800 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4767 4801 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4768 4802 } else {
4769 4803 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4770 4804 }
4771 4805
4772 4806 /*
4773 4807 * Allocate storage to hold the additional regs
4774 4808 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4775 4809 *
4776 4810 * The regs for the leaf, %ecx == 0 has already
4777 4811 * been allocated as indicated above.
4778 4812 */
4779 4813 for (i = 1; i < size; i++) {
4780 4814 cp = cpi->cpi_cache_leaves[i] =
4781 4815 kmem_zalloc(sizeof (regs), KM_SLEEP);
4782 4816 cp->cp_eax = leaf;
4783 4817 cp->cp_ecx = i;
4784 4818
4785 4819 (void) __cpuid_insn(cp);
4786 4820 }
4787 4821 }
4788 4822 /*
4789 4823 * Determine the number of bits needed to represent
4790 4824 * the number of CPUs sharing the last level cache.
4791 4825 *
4792 4826 * Shift off that number of bits from the APIC id to
4793 4827 * derive the cache id.
4794 4828 */
4795 4829 shft = 0;
4796 4830 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4797 4831 shft++;
4798 4832 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4799 4833 }
4800 4834
4801 4835 /*
4802 4836 * Now fixup the brand string
4803 4837 */
4804 4838 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4805 4839 fabricate_brandstr(cpi);
4806 4840 } else {
4807 4841
4808 4842 /*
4809 4843 * If we successfully extracted a brand string from the cpuid
4810 4844 * instruction, clean it up by removing leading spaces and
4811 4845 * similar junk.
4812 4846 */
4813 4847 if (cpi->cpi_brandstr[0]) {
4814 4848 size_t maxlen = sizeof (cpi->cpi_brandstr);
4815 4849 char *src, *dst;
4816 4850
4817 4851 dst = src = (char *)cpi->cpi_brandstr;
4818 4852 src[maxlen - 1] = '\0';
4819 4853 /*
4820 4854 * strip leading spaces
4821 4855 */
4822 4856 while (*src == ' ')
4823 4857 src++;
4824 4858 /*
4825 4859 * Remove any 'Genuine' or "Authentic" prefixes
4826 4860 */
4827 4861 if (strncmp(src, "Genuine ", 8) == 0)
4828 4862 src += 8;
4829 4863 if (strncmp(src, "Authentic ", 10) == 0)
4830 4864 src += 10;
4831 4865
4832 4866 /*
4833 4867 * Now do an in-place copy.
4834 4868 * Map (R) to (r) and (TM) to (tm).
4835 4869 * The era of teletypes is long gone, and there's
4836 4870 * -really- no need to shout.
4837 4871 */
4838 4872 while (*src != '\0') {
4839 4873 if (src[0] == '(') {
4840 4874 if (strncmp(src + 1, "R)", 2) == 0) {
4841 4875 (void) strncpy(dst, "(r)", 3);
4842 4876 src += 3;
4843 4877 dst += 3;
4844 4878 continue;
4845 4879 }
4846 4880 if (strncmp(src + 1, "TM)", 3) == 0) {
4847 4881 (void) strncpy(dst, "(tm)", 4);
4848 4882 src += 4;
4849 4883 dst += 4;
4850 4884 continue;
4851 4885 }
4852 4886 }
4853 4887 *dst++ = *src++;
4854 4888 }
4855 4889 *dst = '\0';
4856 4890
4857 4891 /*
4858 4892 * Finally, remove any trailing spaces
4859 4893 */
4860 4894 while (--dst > cpi->cpi_brandstr)
4861 4895 if (*dst == ' ')
4862 4896 *dst = '\0';
4863 4897 else
4864 4898 break;
4865 4899 } else
4866 4900 fabricate_brandstr(cpi);
4867 4901 }
4868 4902 cpi->cpi_pass = 3;
4869 4903 }
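
Editor's note: cpuid_pass3() above derives the last-level cache id by counting how many bits are needed to cover cpi_ncpu_shr_last_cache and shifting the APIC id right by that amount. A minimal sketch of that derivation follows, using hypothetical input values purely for illustration.

/* Sketch of the last-level cache id derivation in cpuid_pass3(). */
#include <stdio.h>

int
main(void)
{
	unsigned int ncpu_shr = 6;	/* hypothetical: 6 CPUs share the LLC */
	unsigned int apicid = 0x1a;	/* hypothetical APIC id */
	unsigned int i, shft = 0;

	/* Count the bits needed to represent ncpu_shr, rounding up. */
	for (i = 1; i < ncpu_shr; i <<= 1)
		shft++;

	/* 6 sharers need 3 bits, so the cache id is the APIC id >> 3. */
	printf("shft=%u cacheid=0x%x\n", shft, apicid >> shft);
	return (0);
}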
4870 4904
4871 4905 /*
4872 4906 * This routine is called out of bind_hwcap() much later in the life
4873 4907 * of the kernel (post_startup()). The job of this routine is to resolve
4874 4908 * the hardware feature support and kernel support for those features into
4875 4909 * what we're actually going to tell applications via the aux vector.
4876 4910 */
4877 4911 void
4878 4912 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4879 4913 {
4880 4914 struct cpuid_info *cpi;
4881 4915 uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4882 4916
4883 4917 if (cpu == NULL)
4884 4918 cpu = CPU;
4885 4919 cpi = cpu->cpu_m.mcpu_cpi;
4886 4920
4887 4921 ASSERT(cpi->cpi_pass == 3);
4888 4922
4889 4923 if (cpi->cpi_maxeax >= 1) {
4890 4924 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4891 4925 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4892 4926 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4893 4927
4894 4928 *edx = CPI_FEATURES_EDX(cpi);
4895 4929 *ecx = CPI_FEATURES_ECX(cpi);
4896 4930 *ebx = CPI_FEATURES_7_0_EBX(cpi);
4897 4931
4898 4932 /*
4899 4933 * [these require explicit kernel support]
4900 4934 */
4901 4935 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4902 4936 *edx &= ~CPUID_INTC_EDX_SEP;
4903 4937
4904 4938 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4905 4939 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4906 4940 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4907 4941 *edx &= ~CPUID_INTC_EDX_SSE2;
4908 4942
4909 4943 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4910 4944 *edx &= ~CPUID_INTC_EDX_HTT;
4911 4945
4912 4946 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4913 4947 *ecx &= ~CPUID_INTC_ECX_SSE3;
4914 4948
4915 4949 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4916 4950 *ecx &= ~CPUID_INTC_ECX_SSSE3;
4917 4951 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4918 4952 *ecx &= ~CPUID_INTC_ECX_SSE4_1;
4919 4953 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4920 4954 *ecx &= ~CPUID_INTC_ECX_SSE4_2;
4921 4955 if (!is_x86_feature(x86_featureset, X86FSET_AES))
4922 4956 *ecx &= ~CPUID_INTC_ECX_AES;
4923 4957 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4924 4958 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4925 4959 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4926 4960 *ecx &= ~(CPUID_INTC_ECX_XSAVE |
4927 4961 CPUID_INTC_ECX_OSXSAVE);
4928 4962 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4929 4963 *ecx &= ~CPUID_INTC_ECX_AVX;
4930 4964 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4931 4965 *ecx &= ~CPUID_INTC_ECX_F16C;
4932 4966 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4933 4967 *ecx &= ~CPUID_INTC_ECX_FMA;
4934 4968 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4935 4969 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4936 4970 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4937 4971 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4938 4972 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4939 4973 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4940 4974 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4941 4975 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4942 4976 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4943 4977 *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4944 4978
4945 4979 /*
4946 4980 * [no explicit support required beyond x87 fp context]
4947 4981 */
4948 4982 if (!fpu_exists)
4949 4983 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4950 4984
4951 4985 /*
4952 4986 * Now map the supported feature vector to things that we
4953 4987 * think userland will care about.
4954 4988 */
4955 4989 if (*edx & CPUID_INTC_EDX_SEP)
4956 4990 hwcap_flags |= AV_386_SEP;
4957 4991 if (*edx & CPUID_INTC_EDX_SSE)
4958 4992 hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4959 4993 if (*edx & CPUID_INTC_EDX_SSE2)
4960 4994 hwcap_flags |= AV_386_SSE2;
4961 4995 if (*ecx & CPUID_INTC_ECX_SSE3)
4962 4996 hwcap_flags |= AV_386_SSE3;
4963 4997 if (*ecx & CPUID_INTC_ECX_SSSE3)
4964 4998 hwcap_flags |= AV_386_SSSE3;
4965 4999 if (*ecx & CPUID_INTC_ECX_SSE4_1)
4966 5000 hwcap_flags |= AV_386_SSE4_1;
4967 5001 if (*ecx & CPUID_INTC_ECX_SSE4_2)
4968 5002 hwcap_flags |= AV_386_SSE4_2;
4969 5003 if (*ecx & CPUID_INTC_ECX_MOVBE)
4970 5004 hwcap_flags |= AV_386_MOVBE;
4971 5005 if (*ecx & CPUID_INTC_ECX_AES)
4972 5006 hwcap_flags |= AV_386_AES;
4973 5007 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
4974 5008 hwcap_flags |= AV_386_PCLMULQDQ;
4975 5009 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
4976 5010 (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
4977 5011 hwcap_flags |= AV_386_XSAVE;
4978 5012
4979 5013 if (*ecx & CPUID_INTC_ECX_AVX) {
4980 5014 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
4981 5015 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
4982 5016
4983 5017 hwcap_flags |= AV_386_AVX;
4984 5018 if (*ecx & CPUID_INTC_ECX_F16C)
4985 5019 hwcap_flags_2 |= AV_386_2_F16C;
4986 5020 if (*ecx & CPUID_INTC_ECX_FMA)
4987 5021 hwcap_flags_2 |= AV_386_2_FMA;
4988 5022
4989 5023 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
4990 5024 hwcap_flags_2 |= AV_386_2_BMI1;
4991 5025 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
4992 5026 hwcap_flags_2 |= AV_386_2_BMI2;
4993 5027 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
4994 5028 hwcap_flags_2 |= AV_386_2_AVX2;
4995 5029 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
4996 5030 hwcap_flags_2 |= AV_386_2_AVX512F;
4997 5031 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
4998 5032 hwcap_flags_2 |= AV_386_2_AVX512DQ;
4999 5033 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5000 5034 hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5001 5035 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5002 5036 hwcap_flags_2 |= AV_386_2_AVX512PF;
5003 5037 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5004 5038 hwcap_flags_2 |= AV_386_2_AVX512ER;
5005 5039 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5006 5040 hwcap_flags_2 |= AV_386_2_AVX512CD;
5007 5041 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5008 5042 hwcap_flags_2 |= AV_386_2_AVX512BW;
5009 5043 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5010 5044 hwcap_flags_2 |= AV_386_2_AVX512VL;
5011 5045
5012 5046 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5013 5047 hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5014 5048 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5015 5049 hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5016 5050 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5017 5051 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5018 5052
5019 5053 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5020 5054 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5021 5055 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5022 5056 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5023 5057 }
5024 5058 }
5025 5059 if (*ecx & CPUID_INTC_ECX_VMX)
5026 5060 hwcap_flags |= AV_386_VMX;
5027 5061 if (*ecx & CPUID_INTC_ECX_POPCNT)
5028 5062 hwcap_flags |= AV_386_POPCNT;
5029 5063 if (*edx & CPUID_INTC_EDX_FPU)
5030 5064 hwcap_flags |= AV_386_FPU;
5031 5065 if (*edx & CPUID_INTC_EDX_MMX)
5032 5066 hwcap_flags |= AV_386_MMX;
5033 5067
5034 5068 if (*edx & CPUID_INTC_EDX_TSC)
5035 5069 hwcap_flags |= AV_386_TSC;
5036 5070 if (*edx & CPUID_INTC_EDX_CX8)
5037 5071 hwcap_flags |= AV_386_CX8;
5038 5072 if (*edx & CPUID_INTC_EDX_CMOV)
5039 5073 hwcap_flags |= AV_386_CMOV;
5040 5074 if (*ecx & CPUID_INTC_ECX_CX16)
5041 5075 hwcap_flags |= AV_386_CX16;
5042 5076
5043 5077 if (*ecx & CPUID_INTC_ECX_RDRAND)
5044 5078 hwcap_flags_2 |= AV_386_2_RDRAND;
5045 5079 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5046 5080 hwcap_flags_2 |= AV_386_2_ADX;
5047 5081 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5048 5082 hwcap_flags_2 |= AV_386_2_RDSEED;
5049 5083 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5050 5084 hwcap_flags_2 |= AV_386_2_SHA;
5051 5085 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5052 5086 hwcap_flags_2 |= AV_386_2_FSGSBASE;
5053 5087 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5054 5088 hwcap_flags_2 |= AV_386_2_CLWB;
5055 5089 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5056 5090 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5057 5091
5058 5092 }
5059 5093 /*
5060 5094 * Check a few miscellaneous features.
5061 5095 */
5062 5096 if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5063 5097 hwcap_flags_2 |= AV_386_2_CLZERO;
5064 5098
5065 5099 if (cpi->cpi_xmaxeax < 0x80000001)
5066 5100 goto pass4_done;
5067 5101
5068 5102 switch (cpi->cpi_vendor) {
5069 5103 struct cpuid_regs cp;
5070 5104 uint32_t *edx, *ecx;
5071 5105
5072 5106 case X86_VENDOR_Intel:
5073 5107 /*
5074 5108 * Seems like Intel duplicated what was necessary
5075 5109 * here to make the initial crop of 64-bit OS's work.
5076 5110 * Hopefully, those are the only "extended" bits
5077 5111 * they'll add.
5078 5112 */
5079 5113 /*FALLTHROUGH*/
5080 5114
5081 5115 case X86_VENDOR_AMD:
5082 5116 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5083 5117 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5084 5118
5085 5119 *edx = CPI_FEATURES_XTD_EDX(cpi);
5086 5120 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5087 5121
5088 5122 /*
5089 5123 * [these features require explicit kernel support]
5090 5124 */
5091 5125 switch (cpi->cpi_vendor) {
5092 5126 case X86_VENDOR_Intel:
5093 5127 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5094 5128 *edx &= ~CPUID_AMD_EDX_TSCP;
5095 5129 break;
5096 5130
5097 5131 case X86_VENDOR_AMD:
5098 5132 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5099 5133 *edx &= ~CPUID_AMD_EDX_TSCP;
5100 5134 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5101 5135 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5102 5136 break;
5103 5137
5104 5138 default:
5105 5139 break;
5106 5140 }
5107 5141
5108 5142 /*
5109 5143 * [no explicit support required beyond
5110 5144 * x87 fp context and exception handlers]
5111 5145 */
5112 5146 if (!fpu_exists)
5113 5147 *edx &= ~(CPUID_AMD_EDX_MMXamd |
5114 5148 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5115 5149
5116 5150 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5117 5151 *edx &= ~CPUID_AMD_EDX_NX;
5118 5152 #if !defined(__amd64)
5119 5153 *edx &= ~CPUID_AMD_EDX_LM;
5120 5154 #endif
5121 5155 /*
5122 5156 * Now map the supported feature vector to
5123 5157 * things that we think userland will care about.
5124 5158 */
5125 5159 #if defined(__amd64)
5126 5160 if (*edx & CPUID_AMD_EDX_SYSC)
5127 5161 hwcap_flags |= AV_386_AMD_SYSC;
5128 5162 #endif
5129 5163 if (*edx & CPUID_AMD_EDX_MMXamd)
5130 5164 hwcap_flags |= AV_386_AMD_MMX;
5131 5165 if (*edx & CPUID_AMD_EDX_3DNow)
5132 5166 hwcap_flags |= AV_386_AMD_3DNow;
5133 5167 if (*edx & CPUID_AMD_EDX_3DNowx)
5134 5168 hwcap_flags |= AV_386_AMD_3DNowx;
5135 5169 if (*ecx & CPUID_AMD_ECX_SVM)
5136 5170 hwcap_flags |= AV_386_AMD_SVM;
5137 5171
5138 5172 switch (cpi->cpi_vendor) {
5139 5173 case X86_VENDOR_AMD:
5140 5174 if (*edx & CPUID_AMD_EDX_TSCP)
5141 5175 hwcap_flags |= AV_386_TSCP;
5142 5176 if (*ecx & CPUID_AMD_ECX_AHF64)
5143 5177 hwcap_flags |= AV_386_AHF;
5144 5178 if (*ecx & CPUID_AMD_ECX_SSE4A)
5145 5179 hwcap_flags |= AV_386_AMD_SSE4A;
5146 5180 if (*ecx & CPUID_AMD_ECX_LZCNT)
5147 5181 hwcap_flags |= AV_386_AMD_LZCNT;
5148 5182 if (*ecx & CPUID_AMD_ECX_MONITORX)
5149 5183 hwcap_flags_2 |= AV_386_2_MONITORX;
5150 5184 break;
5151 5185
5152 5186 case X86_VENDOR_Intel:
5153 5187 if (*edx & CPUID_AMD_EDX_TSCP)
5154 5188 hwcap_flags |= AV_386_TSCP;
5155 5189 if (*ecx & CPUID_AMD_ECX_LZCNT)
5156 5190 hwcap_flags |= AV_386_AMD_LZCNT;
5157 5191 /*
5158 5192 * Aarrgh.
5159 5193 * Intel uses a different bit in the same word.
5160 5194 */
5161 5195 if (*ecx & CPUID_INTC_ECX_AHF64)
5162 5196 hwcap_flags |= AV_386_AHF;
5163 5197 break;
5164 5198
5165 5199 default:
5166 5200 break;
5167 5201 }
5168 5202 break;
5169 5203
5170 5204 case X86_VENDOR_TM:
5171 5205 cp.cp_eax = 0x80860001;
5172 5206 (void) __cpuid_insn(&cp);
5173 5207 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5174 5208 break;
5175 5209
5176 5210 default:
5177 5211 break;
5178 5212 }
5179 5213
5180 5214 pass4_done:
5181 5215 cpi->cpi_pass = 4;
5182 5216 if (hwcap_out != NULL) {
5183 5217 hwcap_out[0] = hwcap_flags;
5184 5218 hwcap_out[1] = hwcap_flags_2;
5185 5219 }
5186 5220 }
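
Editor's note: the two hwcap words assembled by cpuid_pass4() above are what userland ultimately sees through the auxiliary vector. As a hedged illustration (not part of this change), an illumos userland program can inspect them with getisax(3C); the exact header layout below is an assumption and may differ on other builds.

/* Hedged userland sketch: read the hwcap words that cpuid_pass4() feeds the aux vector. */
#include <sys/auxv.h>
#include <sys/auxv_386.h>
#include <stdio.h>

int
main(void)
{
	uint32_t hw[2] = { 0, 0 };

	(void) getisax(hw, 2);

	if (hw[0] & AV_386_SSE2)		/* first hwcap word */
		printf("SSE2 advertised\n");
	if (hw[1] & AV_386_2_RDSEED)		/* second hwcap word */
		printf("RDSEED advertised\n");
	return (0);
}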
5187 5221
5188 5222
5189 5223 /*
5190 5224 * Simulate the cpuid instruction using the data we previously
5191 5225 * captured about this CPU. We try our best to return the truth
5192 5226 * about the hardware, independently of kernel support.
5193 5227 */
5194 5228 uint32_t
5195 5229 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5196 5230 {
5197 5231 struct cpuid_info *cpi;
5198 5232 struct cpuid_regs *xcp;
5199 5233
5200 5234 if (cpu == NULL)
5201 5235 cpu = CPU;
5202 5236 cpi = cpu->cpu_m.mcpu_cpi;
5203 5237
5204 5238 ASSERT(cpuid_checkpass(cpu, 3));
5205 5239
5206 5240 /*
5207 5241 * CPUID data is cached in two separate places: cpi_std for standard
5208 5242 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5209 5243 */
5210 5244 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5211 5245 xcp = &cpi->cpi_std[cp->cp_eax];
5212 5246 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5213 5247 cp->cp_eax <= cpi->cpi_xmaxeax &&
5214 5248 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5215 5249 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5216 5250 } else {
5217 5251 /*
5218 5252 * The caller is asking for data from an input parameter which
5219 5253 * the kernel has not cached. In this case we go fetch from
5220 5254 * the hardware and return the data directly to the user.
5221 5255 */
5222 5256 return (__cpuid_insn(cp));
5223 5257 }
5224 5258
5225 5259 cp->cp_eax = xcp->cp_eax;
5226 5260 cp->cp_ebx = xcp->cp_ebx;
5227 5261 cp->cp_ecx = xcp->cp_ecx;
5228 5262 cp->cp_edx = xcp->cp_edx;
5229 5263 return (cp->cp_eax);
5230 5264 }
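
Editor's note: a caller that wants the cached view of a leaf simply fills in cp_eax and lets cpuid_insn() decide whether to serve it from cpi_std/cpi_extd or fall back to the hardware. The in-kernel caller below is hypothetical and assumes the same kernel headers as this file; it is a sketch, not code from this change.

/* Hypothetical in-kernel caller of cpuid_insn(), for illustration only. */
static uint32_t
example_read_leaf1(cpu_t *cpu)
{
	struct cpuid_regs cp = { 0 };

	cp.cp_eax = 1;			/* standard leaf 1: signature and features */
	(void) cpuid_insn(cpu, &cp);	/* served from cpi_std[1] once pass 3 is done */
	return (cp.cp_edx);		/* e.g. the EDX feature word */
}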
5231 5265
5232 5266 int
5233 5267 cpuid_checkpass(cpu_t *cpu, int pass)
5234 5268 {
5235 5269 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5236 5270 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5237 5271 }
5238 5272
5239 5273 int
5240 5274 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5241 5275 {
5242 5276 ASSERT(cpuid_checkpass(cpu, 3));
5243 5277
5244 5278 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5245 5279 }
5246 5280
5247 5281 int
5248 5282 cpuid_is_cmt(cpu_t *cpu)
5249 5283 {
5250 5284 if (cpu == NULL)
5251 5285 cpu = CPU;
5252 5286
5253 5287 ASSERT(cpuid_checkpass(cpu, 1));
5254 5288
5255 5289 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5256 5290 }
5257 5291
5258 5292 /*
5259 5293 * AMD and Intel both implement the 64-bit variant of the syscall
5260 5294 * instruction (syscallq), so if there's -any- support for syscall,
5261 5295 * cpuid currently says "yes, we support this".
5262 5296 *
5263 5297 * However, Intel decided to -not- implement the 32-bit variant of the
5264 5298 * syscall instruction, so we provide a predicate to allow our caller
5265 5299 * to test that subtlety here.
5266 5300 *
5267 5301 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5268 5302 * even in the case where the hardware would in fact support it.
5269 5303 */
5270 5304 /*ARGSUSED*/
5271 5305 int
5272 5306 cpuid_syscall32_insn(cpu_t *cpu)
5273 5307 {
5274 5308 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5275 5309
5276 5310 #if !defined(__xpv)
5277 5311 if (cpu == NULL)
5278 5312 cpu = CPU;
5279 5313
5280 5314 /*CSTYLED*/
5281 5315 {
5282 5316 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5283 5317
5284 5318 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5285 5319 cpi->cpi_xmaxeax >= 0x80000001 &&
5286 5320 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5287 5321 return (1);
5288 5322 }
5289 5323 #endif
5290 5324 return (0);
5291 5325 }
5292 5326
5293 5327 int
5294 5328 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5295 5329 {
5296 5330 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5297 5331
5298 5332 static const char fmt[] =
5299 5333 "x86 (%s %X family %d model %d step %d clock %d MHz)";
5300 5334 static const char fmt_ht[] =
5301 5335 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5302 5336
5303 5337 ASSERT(cpuid_checkpass(cpu, 1));
5304 5338
5305 5339 if (cpuid_is_cmt(cpu))
5306 5340 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5307 5341 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5308 5342 cpi->cpi_family, cpi->cpi_model,
5309 5343 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5310 5344 return (snprintf(s, n, fmt,
5311 5345 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5312 5346 cpi->cpi_family, cpi->cpi_model,
5313 5347 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5314 5348 }
5315 5349
5316 5350 const char *
5317 5351 cpuid_getvendorstr(cpu_t *cpu)
5318 5352 {
5319 5353 ASSERT(cpuid_checkpass(cpu, 1));
5320 5354 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5321 5355 }
5322 5356
5323 5357 uint_t
5324 5358 cpuid_getvendor(cpu_t *cpu)
5325 5359 {
5326 5360 ASSERT(cpuid_checkpass(cpu, 1));
5327 5361 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5328 5362 }
5329 5363
5330 5364 uint_t
5331 5365 cpuid_getfamily(cpu_t *cpu)
5332 5366 {
5333 5367 ASSERT(cpuid_checkpass(cpu, 1));
5334 5368 return (cpu->cpu_m.mcpu_cpi->cpi_family);
5335 5369 }
5336 5370
5337 5371 uint_t
5338 5372 cpuid_getmodel(cpu_t *cpu)
5339 5373 {
5340 5374 ASSERT(cpuid_checkpass(cpu, 1));
5341 5375 return (cpu->cpu_m.mcpu_cpi->cpi_model);
5342 5376 }
5343 5377
5344 5378 uint_t
5345 5379 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5346 5380 {
5347 5381 ASSERT(cpuid_checkpass(cpu, 1));
5348 5382 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5349 5383 }
5350 5384
5351 5385 uint_t
5352 5386 cpuid_get_ncore_per_chip(cpu_t *cpu)
5353 5387 {
5354 5388 ASSERT(cpuid_checkpass(cpu, 1));
5355 5389 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5356 5390 }
5357 5391
5358 5392 uint_t
5359 5393 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5360 5394 {
5361 5395 ASSERT(cpuid_checkpass(cpu, 2));
5362 5396 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5363 5397 }
5364 5398
5365 5399 id_t
5366 5400 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5367 5401 {
5368 5402 ASSERT(cpuid_checkpass(cpu, 2));
5369 5403 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5370 5404 }
5371 5405
5372 5406 uint_t
5373 5407 cpuid_getstep(cpu_t *cpu)
5374 5408 {
5375 5409 ASSERT(cpuid_checkpass(cpu, 1));
5376 5410 return (cpu->cpu_m.mcpu_cpi->cpi_step);
5377 5411 }
5378 5412
5379 5413 uint_t
5380 5414 cpuid_getsig(struct cpu *cpu)
5381 5415 {
5382 5416 ASSERT(cpuid_checkpass(cpu, 1));
5383 5417 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5384 5418 }
5385 5419
5386 5420 uint32_t
5387 5421 cpuid_getchiprev(struct cpu *cpu)
5388 5422 {
5389 5423 ASSERT(cpuid_checkpass(cpu, 1));
5390 5424 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5391 5425 }
5392 5426
5393 5427 const char *
5394 5428 cpuid_getchiprevstr(struct cpu *cpu)
5395 5429 {
5396 5430 ASSERT(cpuid_checkpass(cpu, 1));
5397 5431 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5398 5432 }
5399 5433
5400 5434 uint32_t
5401 5435 cpuid_getsockettype(struct cpu *cpu)
5402 5436 {
5403 5437 ASSERT(cpuid_checkpass(cpu, 1));
5404 5438 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5405 5439 }
5406 5440
5407 5441 const char *
5408 5442 cpuid_getsocketstr(cpu_t *cpu)
5409 5443 {
5410 5444 static const char *socketstr = NULL;
5411 5445 struct cpuid_info *cpi;
5412 5446
5413 5447 ASSERT(cpuid_checkpass(cpu, 1));
5414 5448 cpi = cpu->cpu_m.mcpu_cpi;
5415 5449
5416 5450 /* Assume that socket types are the same across the system */
5417 5451 if (socketstr == NULL)
5418 5452 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5419 5453 cpi->cpi_model, cpi->cpi_step);
5420 5454
5421 5455
5422 5456 return (socketstr);
5423 5457 }
5424 5458
5425 5459 int
5426 5460 cpuid_get_chipid(cpu_t *cpu)
5427 5461 {
5428 5462 ASSERT(cpuid_checkpass(cpu, 1));
5429 5463
5430 5464 if (cpuid_is_cmt(cpu))
5431 5465 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5432 5466 return (cpu->cpu_id);
5433 5467 }
5434 5468
5435 5469 id_t
5436 5470 cpuid_get_coreid(cpu_t *cpu)
5437 5471 {
5438 5472 ASSERT(cpuid_checkpass(cpu, 1));
5439 5473 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5440 5474 }
5441 5475
5442 5476 int
5443 5477 cpuid_get_pkgcoreid(cpu_t *cpu)
5444 5478 {
5445 5479 ASSERT(cpuid_checkpass(cpu, 1));
5446 5480 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5447 5481 }
5448 5482
5449 5483 int
5450 5484 cpuid_get_clogid(cpu_t *cpu)
5451 5485 {
5452 5486 ASSERT(cpuid_checkpass(cpu, 1));
5453 5487 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5454 5488 }
5455 5489
5456 5490 int
5457 5491 cpuid_get_cacheid(cpu_t *cpu)
5458 5492 {
5459 5493 ASSERT(cpuid_checkpass(cpu, 1));
5460 5494 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5461 5495 }
5462 5496
5463 5497 uint_t
5464 5498 cpuid_get_procnodeid(cpu_t *cpu)
5465 5499 {
5466 5500 ASSERT(cpuid_checkpass(cpu, 1));
5467 5501 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5468 5502 }
5469 5503
5470 5504 uint_t
5471 5505 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5472 5506 {
5473 5507 ASSERT(cpuid_checkpass(cpu, 1));
5474 5508 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5475 5509 }
5476 5510
5477 5511 uint_t
5478 5512 cpuid_get_compunitid(cpu_t *cpu)
5479 5513 {
5480 5514 ASSERT(cpuid_checkpass(cpu, 1));
5481 5515 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5482 5516 }
5483 5517
5484 5518 uint_t
5485 5519 cpuid_get_cores_per_compunit(cpu_t *cpu)
5486 5520 {
5487 5521 ASSERT(cpuid_checkpass(cpu, 1));
5488 5522 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5489 5523 }
5490 5524
5491 5525 /*ARGSUSED*/
5492 5526 int
5493 5527 cpuid_have_cr8access(cpu_t *cpu)
5494 5528 {
5495 5529 #if defined(__amd64)
5496 5530 return (1);
5497 5531 #else
5498 5532 struct cpuid_info *cpi;
5499 5533
5500 5534 ASSERT(cpu != NULL);
5501 5535 cpi = cpu->cpu_m.mcpu_cpi;
5502 5536 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5503 5537 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5504 5538 return (1);
5505 5539 return (0);
5506 5540 #endif
5507 5541 }
5508 5542
5509 5543 uint32_t
5510 5544 cpuid_get_apicid(cpu_t *cpu)
5511 5545 {
5512 5546 ASSERT(cpuid_checkpass(cpu, 1));
5513 5547 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5514 5548 return (UINT32_MAX);
5515 5549 } else {
5516 5550 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5517 5551 }
5518 5552 }
5519 5553
5520 5554 void
5521 5555 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5522 5556 {
5523 5557 struct cpuid_info *cpi;
5524 5558
5525 5559 if (cpu == NULL)
5526 5560 cpu = CPU;
5527 5561 cpi = cpu->cpu_m.mcpu_cpi;
5528 5562
5529 5563 ASSERT(cpuid_checkpass(cpu, 1));
5530 5564
5531 5565 if (pabits)
5532 5566 *pabits = cpi->cpi_pabits;
5533 5567 if (vabits)
5534 5568 *vabits = cpi->cpi_vabits;
5535 5569 }
5536 5570
5537 5571 size_t
5538 5572 cpuid_get_xsave_size()
5539 5573 {
5540 5574 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5541 5575 sizeof (struct xsave_state)));
5542 5576 }
5543 5577
5544 5578 /*
5545 5579 * Return true if the CPUs on this system require 'pointer clearing' for the
5546 5580 * floating point error pointer exception handling. In the past, this has been
5547 5581 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5548 5582 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5549 5583 * feature bit and is reflected in the cpi_fp_amd_save member.
5550 5584 */
5551 5585 boolean_t
5552 5586 cpuid_need_fp_excp_handling()
5553 5587 {
5554 5588 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5555 5589 cpuid_info0.cpi_fp_amd_save != 0);
5556 5590 }
5557 5591
5558 5592 /*
5559 5593 * Returns the number of data TLB entries for a corresponding
5560 5594 * pagesize. If it can't be computed, or isn't known, the
5561 5595 * routine returns zero. If you ask about an architecturally
5562 5596 * impossible pagesize, the routine will panic (so that the
5563 5597 * hat implementor knows that things are inconsistent.)
5564 5598 */
5565 5599 uint_t
5566 5600 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5567 5601 {
5568 5602 struct cpuid_info *cpi;
5569 5603 uint_t dtlb_nent = 0;
5570 5604
5571 5605 if (cpu == NULL)
5572 5606 cpu = CPU;
5573 5607 cpi = cpu->cpu_m.mcpu_cpi;
5574 5608
5575 5609 ASSERT(cpuid_checkpass(cpu, 1));
5576 5610
5577 5611 /*
5578 5612 * Check the L2 TLB info
5579 5613 */
5580 5614 if (cpi->cpi_xmaxeax >= 0x80000006) {
5581 5615 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5582 5616
5583 5617 switch (pagesize) {
5584 5618
5585 5619 case 4 * 1024:
5586 5620 /*
5587 5621 * All zero in the top 16 bits of the register
5588 5622 * indicates a unified TLB. Size is in low 16 bits.
5589 5623 */
5590 5624 if ((cp->cp_ebx & 0xffff0000) == 0)
5591 5625 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5592 5626 else
5593 5627 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5594 5628 break;
5595 5629
5596 5630 case 2 * 1024 * 1024:
5597 5631 if ((cp->cp_eax & 0xffff0000) == 0)
5598 5632 dtlb_nent = cp->cp_eax & 0x0000ffff;
5599 5633 else
5600 5634 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5601 5635 break;
5602 5636
5603 5637 default:
5604 5638 panic("unknown L2 pagesize");
5605 5639 /*NOTREACHED*/
5606 5640 }
5607 5641 }
5608 5642
5609 5643 if (dtlb_nent != 0)
5610 5644 return (dtlb_nent);
5611 5645
5612 5646 /*
5613 5647 * No L2 TLB support for this size, try L1.
5614 5648 */
5615 5649 if (cpi->cpi_xmaxeax >= 0x80000005) {
5616 5650 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5617 5651
5618 5652 switch (pagesize) {
5619 5653 case 4 * 1024:
5620 5654 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5621 5655 break;
5622 5656 case 2 * 1024 * 1024:
5623 5657 dtlb_nent = BITX(cp->cp_eax, 23, 16);
5624 5658 break;
5625 5659 default:
5626 5660 panic("unknown L1 d-TLB pagesize");
5627 5661 /*NOTREACHED*/
5628 5662 }
5629 5663 }
5630 5664
5631 5665 return (dtlb_nent);
5632 5666 }
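
Editor's note: the 0x80000006 decoding in cpuid_get_dtlb_nent() above distinguishes a unified L2 TLB (top 16 bits of the register all zero, count in the low 16 bits) from a split one (data-TLB count in bits 27:16). The userland sketch below repeats that decode for the 4K page size; it assumes GCC/Clang's <cpuid.h> and is illustrative only.

/* Illustrative decode of the L2 4K dTLB entry count from CPUID 0x80000006. */
#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx, nent;

	if (!__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx))
		return (1);

	if ((ebx & 0xffff0000U) == 0)		/* unified L2 TLB, size in low 16 bits */
		nent = ebx & 0x0000ffffU;
	else					/* split TLB: data entries in bits 27:16 */
		nent = (ebx >> 16) & 0x0fffU;

	printf("L2 4K dTLB entries: %u\n", nent);
	return (0);
}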
5633 5667
5634 5668 /*
5635 5669 * Return 0 if the erratum is not present or not applicable, positive
5636 5670 * if it is, and negative if the status of the erratum is unknown.
5637 5671 *
5638 5672 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5639 5673 * Processors" #25759, Rev 3.57, August 2005
5640 5674 */
5641 5675 int
5642 5676 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5643 5677 {
5644 5678 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5645 5679 uint_t eax;
5646 5680
5647 5681 /*
5648 5682 * Bail out if this CPU isn't an AMD CPU, or if it's
5649 5683 * a legacy (32-bit) AMD CPU.
5650 5684 */
5651 5685 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5652 5686 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5653 5687 cpi->cpi_family == 6) {
5654 5688 return (0);
5655 5689 }
5656 5690
5657 5691 eax = cpi->cpi_std[1].cp_eax;
5658 5692
5659 5693 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
5660 5694 #define SH_B3(eax) (eax == 0xf51)
5661 5695 #define B(eax) (SH_B0(eax) || SH_B3(eax))
5662 5696
5663 5697 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
5664 5698
5665 5699 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5666 5700 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5667 5701 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
5668 5702 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5669 5703
5670 5704 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5671 5705 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
5672 5706 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
5673 5707 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5674 5708
5675 5709 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5676 5710 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
5677 5711 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
5678 5712 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
5679 5713 #define BH_E4(eax) (eax == 0x20fb1)
5680 5714 #define SH_E5(eax) (eax == 0x20f42)
5681 5715 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
5682 5716 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
5683 5717 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5684 5718 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5685 5719 DH_E6(eax) || JH_E6(eax))
5686 5720
5687 5721 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5688 5722 #define DR_B0(eax) (eax == 0x100f20)
5689 5723 #define DR_B1(eax) (eax == 0x100f21)
5690 5724 #define DR_BA(eax) (eax == 0x100f2a)
5691 5725 #define DR_B2(eax) (eax == 0x100f22)
5692 5726 #define DR_B3(eax) (eax == 0x100f23)
5693 5727 #define RB_C0(eax) (eax == 0x100f40)
5694 5728
5695 5729 switch (erratum) {
5696 5730 case 1:
5697 5731 return (cpi->cpi_family < 0x10);
5698 5732 case 51: /* what does the asterisk mean? */
5699 5733 return (B(eax) || SH_C0(eax) || CG(eax));
5700 5734 case 52:
5701 5735 return (B(eax));
5702 5736 case 57:
5703 5737 return (cpi->cpi_family <= 0x11);
5704 5738 case 58:
5705 5739 return (B(eax));
5706 5740 case 60:
5707 5741 return (cpi->cpi_family <= 0x11);
5708 5742 case 61:
5709 5743 case 62:
5710 5744 case 63:
5711 5745 case 64:
5712 5746 case 65:
5713 5747 case 66:
5714 5748 case 68:
5715 5749 case 69:
5716 5750 case 70:
5717 5751 case 71:
5718 5752 return (B(eax));
5719 5753 case 72:
5720 5754 return (SH_B0(eax));
5721 5755 case 74:
5722 5756 return (B(eax));
5723 5757 case 75:
5724 5758 return (cpi->cpi_family < 0x10);
5725 5759 case 76:
5726 5760 return (B(eax));
5727 5761 case 77:
5728 5762 return (cpi->cpi_family <= 0x11);
5729 5763 case 78:
5730 5764 return (B(eax) || SH_C0(eax));
5731 5765 case 79:
5732 5766 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5733 5767 case 80:
5734 5768 case 81:
5735 5769 case 82:
5736 5770 return (B(eax));
5737 5771 case 83:
5738 5772 return (B(eax) || SH_C0(eax) || CG(eax));
5739 5773 case 85:
5740 5774 return (cpi->cpi_family < 0x10);
5741 5775 case 86:
5742 5776 return (SH_C0(eax) || CG(eax));
5743 5777 case 88:
5744 5778 #if !defined(__amd64)
5745 5779 return (0);
5746 5780 #else
5747 5781 return (B(eax) || SH_C0(eax));
5748 5782 #endif
5749 5783 case 89:
5750 5784 return (cpi->cpi_family < 0x10);
5751 5785 case 90:
5752 5786 return (B(eax) || SH_C0(eax) || CG(eax));
5753 5787 case 91:
5754 5788 case 92:
5755 5789 return (B(eax) || SH_C0(eax));
5756 5790 case 93:
5757 5791 return (SH_C0(eax));
5758 5792 case 94:
5759 5793 return (B(eax) || SH_C0(eax) || CG(eax));
5760 5794 case 95:
5761 5795 #if !defined(__amd64)
5762 5796 return (0);
5763 5797 #else
5764 5798 return (B(eax) || SH_C0(eax));
5765 5799 #endif
5766 5800 case 96:
5767 5801 return (B(eax) || SH_C0(eax) || CG(eax));
5768 5802 case 97:
5769 5803 case 98:
5770 5804 return (SH_C0(eax) || CG(eax));
5771 5805 case 99:
5772 5806 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5773 5807 case 100:
5774 5808 return (B(eax) || SH_C0(eax));
5775 5809 case 101:
5776 5810 case 103:
5777 5811 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5778 5812 case 104:
5779 5813 return (SH_C0(eax) || CG(eax) || D0(eax));
5780 5814 case 105:
5781 5815 case 106:
5782 5816 case 107:
5783 5817 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5784 5818 case 108:
5785 5819 return (DH_CG(eax));
5786 5820 case 109:
5787 5821 return (SH_C0(eax) || CG(eax) || D0(eax));
5788 5822 case 110:
5789 5823 return (D0(eax) || EX(eax));
5790 5824 case 111:
5791 5825 return (CG(eax));
5792 5826 case 112:
5793 5827 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5794 5828 case 113:
5795 5829 return (eax == 0x20fc0);
5796 5830 case 114:
5797 5831 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5798 5832 case 115:
5799 5833 return (SH_E0(eax) || JH_E1(eax));
5800 5834 case 116:
5801 5835 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5802 5836 case 117:
5803 5837 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5804 5838 case 118:
5805 5839 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5806 5840 JH_E6(eax));
5807 5841 case 121:
5808 5842 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5809 5843 case 122:
5810 5844 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5811 5845 case 123:
5812 5846 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5813 5847 case 131:
5814 5848 return (cpi->cpi_family < 0x10);
5815 5849 case 6336786:
5816 5850
5817 5851 /*
5818 5852 * Test for AdvPowerMgmtInfo.TscPStateInvariant
5819 5853 * if this is a K8 family or newer processor. We're testing for
5820 5854 * this 'erratum' to determine whether or not we have a constant
5821 5855 * TSC.
5822 5856 *
5823 5857 * Our current fix for this is to disable the C1-Clock ramping.
5824 5858 * However, this doesn't work on newer processor families nor
5825 5859 * does it work when virtualized as those devices don't exist.
5826 5860 */
5827 5861 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5828 5862 return (0);
5829 5863 }
5830 5864
5831 5865 if (CPI_FAMILY(cpi) == 0xf) {
5832 5866 struct cpuid_regs regs;
5833 5867 regs.cp_eax = 0x80000007;
5834 5868 (void) __cpuid_insn(&regs);
5835 5869 return (!(regs.cp_edx & 0x100));
5836 5870 }
5837 5871 return (0);
5838 5872 case 6323525:
5839 5873 /*
5840 5874 * This erratum (K8 #147) is not present on family 10 and newer.
5841 5875 */
5842 5876 if (cpi->cpi_family >= 0x10) {
5843 5877 return (0);
5844 5878 }
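/*
 * The expression below reconstructs a (family << 8) | model value from
 * the base and extended family/model fields of eax; having already
 * excluded family 0x10 and newer above, the < 0xf40 test covers parts
 * below family 0xf, model 0x40.
 */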
5845 5879 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5846 5880 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5847 5881
5848 5882 case 6671130:
5849 5883 /*
5850 5884 * check for processors (pre-Shanghai) that do not provide
5851 5885 * optimal management of 1gb ptes in their tlb.
5852 5886 */
5853 5887 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5854 5888
5855 5889 case 298:
5856 5890 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5857 5891 DR_B2(eax) || RB_C0(eax));
5858 5892
5859 5893 case 721:
5860 5894 #if defined(__amd64)
5861 5895 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5862 5896 #else
5863 5897 return (0);
5864 5898 #endif
5865 5899
5866 5900 default:
5867 5901 return (-1);
5868 5902
5869 5903 }
5870 5904 }
5871 5905
5872 5906 /*
5873 5907 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5874 5908 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5875 5909 */
5876 5910 int
5877 5911 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5878 5912 {
5879 5913 struct cpuid_info *cpi;
5880 5914 uint_t osvwid;
5881 5915 static int osvwfeature = -1;
5882 5916 uint64_t osvwlength;
5883 5917
5884 5918
5885 5919 cpi = cpu->cpu_m.mcpu_cpi;
5886 5920
5887 5921 /* confirm OSVW supported */
5888 5922 if (osvwfeature == -1) {
5889 5923 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5890 5924 } else {
5891 5925 /* assert that osvw feature setting is consistent on all cpus */
5892 5926 ASSERT(osvwfeature ==
5893 5927 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5894 5928 }
5895 5929 if (!osvwfeature)
5896 5930 return (-1);
5897 5931
5898 5932 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5899 5933
5900 5934 switch (erratum) {
5901 5935 case 298: /* osvwid is 0 */
5902 5936 osvwid = 0;
5903 5937 if (osvwlength <= (uint64_t)osvwid) {
5904 5938 /* osvwid 0 is unknown */
5905 5939 return (-1);
5906 5940 }
5907 5941
5908 5942 /*
5909 5943 * Check the OSVW STATUS MSR to determine the state
5910 5944 * of the erratum where:
5911 5945 * 0 - fixed by HW
5912 5946 * 1 - BIOS has applied the workaround when BIOS
5913 5947 * workaround is available. (Or for other errata,
5914 5948 * OS workaround is required.)
5915 5949 * For a value of 1, caller will confirm that the
5916 5950 * erratum 298 workaround has indeed been applied by BIOS.
5917 5951 *
5918 5952 * A 1 may be set in cpus that have a HW fix
5919 5953 * in a mixed cpu system. Regarding erratum 298:
5920 5954 * In a multiprocessor platform, the workaround above
5921 5955 * should be applied to all processors regardless of
5922 5956 * silicon revision when an affected processor is
5923 5957 * present.
5924 5958 */
5925 5959
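/*
 * Each OSVW status MSR carries OSVW_ID_CNT_PER_MSR status bits, so the
 * MSR offset is (osvwid / OSVW_ID_CNT_PER_MSR) and the bit within that
 * MSR is (osvwid % OSVW_ID_CNT_PER_MSR).
 */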
5926 5960 return (rdmsr(MSR_AMD_OSVW_STATUS +
5927 5961 (osvwid / OSVW_ID_CNT_PER_MSR)) &
5928 5962 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5929 5963
5930 5964 default:
5931 5965 return (-1);
5932 5966 }
5933 5967 }
5934 5968
5935 5969 static const char assoc_str[] = "associativity";
5936 5970 static const char line_str[] = "line-size";
5937 5971 static const char size_str[] = "size";
5938 5972
5939 5973 static void
5940 5974 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5941 5975 uint32_t val)
5942 5976 {
5943 5977 char buf[128];
5944 5978
5945 5979 /*
5946 5980 * ndi_prop_update_int() is used because it is desirable for
5947 5981 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
5948 5982 */
5949 5983 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5950 5984 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5951 5985 }
5952 5986
5953 5987 /*
5954 5988 * Intel-style cache/tlb description
5955 5989 *
5956 5990 * Standard cpuid level 2 gives a randomly ordered
5957 5991 * selection of tags that index into a table that describes
5958 5992 * cache and tlb properties.
5959 5993 */
5960 5994
5961 5995 static const char l1_icache_str[] = "l1-icache";
5962 5996 static const char l1_dcache_str[] = "l1-dcache";
5963 5997 static const char l2_cache_str[] = "l2-cache";
5964 5998 static const char l3_cache_str[] = "l3-cache";
5965 5999 static const char itlb4k_str[] = "itlb-4K";
5966 6000 static const char dtlb4k_str[] = "dtlb-4K";
5967 6001 static const char itlb2M_str[] = "itlb-2M";
5968 6002 static const char itlb4M_str[] = "itlb-4M";
5969 6003 static const char dtlb4M_str[] = "dtlb-4M";
5970 6004 static const char dtlb24_str[] = "dtlb0-2M-4M";
5971 6005 static const char itlb424_str[] = "itlb-4K-2M-4M";
5972 6006 static const char itlb24_str[] = "itlb-2M-4M";
5973 6007 static const char dtlb44_str[] = "dtlb-4K-4M";
5974 6008 static const char sl1_dcache_str[] = "sectored-l1-dcache";
5975 6009 static const char sl2_cache_str[] = "sectored-l2-cache";
5976 6010 static const char itrace_str[] = "itrace-cache";
5977 6011 static const char sl3_cache_str[] = "sectored-l3-cache";
5978 6012 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
5979 6013
5980 6014 static const struct cachetab {
5981 6015 uint8_t ct_code;
5982 6016 uint8_t ct_assoc;
5983 6017 uint16_t ct_line_size;
5984 6018 size_t ct_size;
5985 6019 const char *ct_label;
5986 6020 } intel_ctab[] = {
5987 6021 /*
5988 6022 * maintain descending order!
5989 6023 *
5990 6024 * Codes ignored - Reason
5991 6025 * ----------------------
5992 6026 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
5993 6027 * f0H/f1H - Currently we do not interpret prefetch size by design
5994 6028 */
5995 6029 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
5996 6030 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
5997 6031 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
5998 6032 { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
5999 6033 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6000 6034 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6001 6035 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6002 6036 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6003 6037 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6004 6038 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6005 6039 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6006 6040 { 0xd0, 4, 64, 512*1024, l3_cache_str},
6007 6041 { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6008 6042 { 0xc0, 4, 0, 8, dtlb44_str },
6009 6043 { 0xba, 4, 0, 64, dtlb4k_str },
6010 6044 { 0xb4, 4, 0, 256, dtlb4k_str },
6011 6045 { 0xb3, 4, 0, 128, dtlb4k_str },
6012 6046 { 0xb2, 4, 0, 64, itlb4k_str },
6013 6047 { 0xb0, 4, 0, 128, itlb4k_str },
6014 6048 { 0x87, 8, 64, 1024*1024, l2_cache_str},
6015 6049 { 0x86, 4, 64, 512*1024, l2_cache_str},
6016 6050 { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6017 6051 { 0x84, 8, 32, 1024*1024, l2_cache_str},
6018 6052 { 0x83, 8, 32, 512*1024, l2_cache_str},
6019 6053 { 0x82, 8, 32, 256*1024, l2_cache_str},
6020 6054 { 0x80, 8, 64, 512*1024, l2_cache_str},
6021 6055 { 0x7f, 2, 64, 512*1024, l2_cache_str},
6022 6056 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6023 6057 { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6024 6058 { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6025 6059 { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6026 6060 { 0x79, 8, 64, 128*1024, sl2_cache_str},
6027 6061 { 0x78, 8, 64, 1024*1024, l2_cache_str},
6028 6062 { 0x73, 8, 0, 64*1024, itrace_str},
6029 6063 { 0x72, 8, 0, 32*1024, itrace_str},
6030 6064 { 0x71, 8, 0, 16*1024, itrace_str},
6031 6065 { 0x70, 8, 0, 12*1024, itrace_str},
6032 6066 { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6033 6067 { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6034 6068 { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6035 6069 { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6036 6070 { 0x5d, 0, 0, 256, dtlb44_str},
6037 6071 { 0x5c, 0, 0, 128, dtlb44_str},
6038 6072 { 0x5b, 0, 0, 64, dtlb44_str},
6039 6073 { 0x5a, 4, 0, 32, dtlb24_str},
6040 6074 { 0x59, 0, 0, 16, dtlb4k_str},
6041 6075 { 0x57, 4, 0, 16, dtlb4k_str},
6042 6076 { 0x56, 4, 0, 16, dtlb4M_str},
6043 6077 { 0x55, 0, 0, 7, itlb24_str},
6044 6078 { 0x52, 0, 0, 256, itlb424_str},
6045 6079 { 0x51, 0, 0, 128, itlb424_str},
6046 6080 { 0x50, 0, 0, 64, itlb424_str},
6047 6081 { 0x4f, 0, 0, 32, itlb4k_str},
6048 6082 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6049 6083 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6050 6084 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6051 6085 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6052 6086 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6053 6087 { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6054 6088 { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6055 6089 { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6056 6090 { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6057 6091 { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6058 6092 { 0x44, 4, 32, 1024*1024, l2_cache_str},
6059 6093 { 0x43, 4, 32, 512*1024, l2_cache_str},
6060 6094 { 0x42, 4, 32, 256*1024, l2_cache_str},
6061 6095 { 0x41, 4, 32, 128*1024, l2_cache_str},
6062 6096 { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6063 6097 { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6064 6098 { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6065 6099 { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6066 6100 { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6067 6101 { 0x39, 4, 64, 128*1024, sl2_cache_str},
6068 6102 { 0x30, 8, 64, 32*1024, l1_icache_str},
6069 6103 { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6070 6104 { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6071 6105 { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6072 6106 { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6073 6107 { 0x22, 4, 64, 512*1024, sl3_cache_str},
6074 6108 { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6075 6109 { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6076 6110 { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6077 6111 { 0x0b, 4, 0, 4, itlb4M_str},
6078 6112 { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6079 6113 { 0x08, 4, 32, 16*1024, l1_icache_str},
6080 6114 { 0x06, 4, 32, 8*1024, l1_icache_str},
6081 6115 { 0x05, 4, 0, 32, dtlb4M_str},
6082 6116 { 0x04, 4, 0, 8, dtlb4M_str},
6083 6117 { 0x03, 4, 0, 64, dtlb4k_str},
6084 6118 { 0x02, 4, 0, 2, itlb4M_str},
6085 6119 { 0x01, 4, 0, 32, itlb4k_str},
6086 6120 { 0 }
6087 6121 };
6088 6122
6089 6123 static const struct cachetab cyrix_ctab[] = {
6090 6124 { 0x70, 4, 0, 32, "tlb-4K" },
6091 6125 { 0x80, 4, 16, 16*1024, "l1-cache" },
6092 6126 { 0 }
6093 6127 };
6094 6128
6095 6129 /*
6096 6130 * Search a cache table for a matching entry
6097 6131 */
6098 6132 static const struct cachetab *
6099 6133 find_cacheent(const struct cachetab *ct, uint_t code)
6100 6134 {
6101 6135 if (code != 0) {
6102 6136 for (; ct->ct_code != 0; ct++)
6103 6137 if (ct->ct_code <= code)
6104 6138 break;
6105 6139 if (ct->ct_code == code)
6106 6140 return (ct);
6107 6141 }
6108 6142 return (NULL);
6109 6143 }
6110 6144
6111 6145 /*
6112 6146 * Populate cachetab entry with L2 or L3 cache-information using
6113 6147 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6114 6148 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6115 6149 * information is found.
6116 6150 */
6117 6151 static int
6118 6152 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6119 6153 {
6120 6154 uint32_t level, i;
6121 6155 int ret = 0;
6122 6156
6123 6157 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6124 6158 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6125 6159
6126 6160 if (level == 2 || level == 3) {
6127 6161 ct->ct_assoc =
6128 6162 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6129 6163 ct->ct_line_size =
6130 6164 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
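/*
 * Leaf 4 reports each of these fields minus one, so the total size
 * computed below is ways * partitions * line size * sets, with
 * cp_ecx + 1 giving the number of sets.
 */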
6131 6165 ct->ct_size = ct->ct_assoc *
6132 6166 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6133 6167 ct->ct_line_size *
6134 6168 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6135 6169
6136 6170 if (level == 2) {
6137 6171 ct->ct_label = l2_cache_str;
6138 6172 } else if (level == 3) {
6139 6173 ct->ct_label = l3_cache_str;
6140 6174 }
6141 6175 ret = 1;
6142 6176 }
6143 6177 }
6144 6178
6145 6179 return (ret);
6146 6180 }
6147 6181
6148 6182 /*
6149 6183 * Walk the cacheinfo descriptor, applying 'func' to every valid element
6150 6184 * The walk is terminated if the walker returns non-zero.
6151 6185 */
6152 6186 static void
6153 6187 intel_walk_cacheinfo(struct cpuid_info *cpi,
6154 6188 void *arg, int (*func)(void *, const struct cachetab *))
6155 6189 {
6156 6190 const struct cachetab *ct;
6157 6191 struct cachetab des_49_ct, des_b1_ct;
6158 6192 uint8_t *dp;
6159 6193 int i;
6160 6194
6161 6195 if ((dp = cpi->cpi_cacheinfo) == NULL)
6162 6196 return;
6163 6197 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6164 6198 /*
6165 6199 * For overloaded descriptor 0x49 we use cpuid function 4
6166 6200 * if supported by the current processor, to create
6167 6201 * cache information.
6168 6202 * For overloaded descriptor 0xb1 we use X86_PAE flag
6169 6203 * to disambiguate the cache information.
6170 6204 */
6171 6205 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6172 6206 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6173 6207 ct = &des_49_ct;
6174 6208 } else if (*dp == 0xb1) {
6175 6209 des_b1_ct.ct_code = 0xb1;
6176 6210 des_b1_ct.ct_assoc = 4;
6177 6211 des_b1_ct.ct_line_size = 0;
6178 6212 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6179 6213 des_b1_ct.ct_size = 8;
6180 6214 des_b1_ct.ct_label = itlb2M_str;
6181 6215 } else {
6182 6216 des_b1_ct.ct_size = 4;
6183 6217 des_b1_ct.ct_label = itlb4M_str;
6184 6218 }
6185 6219 ct = &des_b1_ct;
6186 6220 } else {
6187 6221 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6188 6222 continue;
6189 6223 }
6190 6224 }
6191 6225
6192 6226 if (func(arg, ct) != 0) {
6193 6227 break;
6194 6228 }
6195 6229 }
6196 6230 }
6197 6231
6198 6232 /*
6199 6233 * (Like the Intel one, except for Cyrix CPUs)
6200 6234 */
6201 6235 static void
6202 6236 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6203 6237 void *arg, int (*func)(void *, const struct cachetab *))
6204 6238 {
6205 6239 const struct cachetab *ct;
6206 6240 uint8_t *dp;
6207 6241 int i;
6208 6242
6209 6243 if ((dp = cpi->cpi_cacheinfo) == NULL)
6210 6244 return;
6211 6245 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6212 6246 /*
6213 6247 * Search Cyrix-specific descriptor table first ..
6214 6248 */
6215 6249 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6216 6250 if (func(arg, ct) != 0)
6217 6251 break;
6218 6252 continue;
6219 6253 }
6220 6254 /*
6221 6255 * .. else fall back to the Intel one
6222 6256 */
6223 6257 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6224 6258 if (func(arg, ct) != 0)
6225 6259 break;
6226 6260 continue;
6227 6261 }
6228 6262 }
6229 6263 }
6230 6264
6231 6265 /*
6232 6266 * A cacheinfo walker that adds associativity, line-size, and size properties
6233 6267 * to the devinfo node it is passed as an argument.
6234 6268 */
6235 6269 static int
6236 6270 add_cacheent_props(void *arg, const struct cachetab *ct)
6237 6271 {
6238 6272 dev_info_t *devi = arg;
6239 6273
6240 6274 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6241 6275 if (ct->ct_line_size != 0)
6242 6276 add_cache_prop(devi, ct->ct_label, line_str,
6243 6277 ct->ct_line_size);
6244 6278 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6245 6279 return (0);
6246 6280 }
6247 6281
6248 6282
6249 6283 static const char fully_assoc[] = "fully-associative?";
6250 6284
6251 6285 /*
6252 6286 * AMD style cache/tlb description
6253 6287 *
6254 6288 * Extended functions 5 and 6 directly describe properties of
6255 6289 * tlbs and various cache levels.
6256 6290 */
6257 6291 static void
6258 6292 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6259 6293 {
6260 6294 switch (assoc) {
6261 6295 case 0: /* reserved; ignore */
6262 6296 break;
6263 6297 default:
6264 6298 add_cache_prop(devi, label, assoc_str, assoc);
6265 6299 break;
6266 6300 case 0xff:
6267 6301 add_cache_prop(devi, label, fully_assoc, 1);
6268 6302 break;
6269 6303 }
6270 6304 }
6271 6305
6272 6306 static void
6273 6307 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6274 6308 {
6275 6309 if (size == 0)
6276 6310 return;
6277 6311 add_cache_prop(devi, label, size_str, size);
6278 6312 add_amd_assoc(devi, label, assoc);
6279 6313 }
6280 6314
6281 6315 static void
6282 6316 add_amd_cache(dev_info_t *devi, const char *label,
6283 6317 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6284 6318 {
6285 6319 if (size == 0 || line_size == 0)
6286 6320 return;
6287 6321 add_amd_assoc(devi, label, assoc);
6288 6322 /*
6289 6323 * Most AMD parts have a sectored cache. Multiple cache lines are
6290 6324 * associated with each tag. A sector consists of all cache lines
6291 6325 * associated with a tag. For example, the AMD K6-III has a sector
6292 6326 * size of 2 cache lines per tag.
6293 6327 */
6294 6328 if (lines_per_tag != 0)
6295 6329 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6296 6330 add_cache_prop(devi, label, line_str, line_size);
6297 6331 add_cache_prop(devi, label, size_str, size * 1024);
6298 6332 }
6299 6333
6300 6334 static void
6301 6335 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6302 6336 {
6303 6337 switch (assoc) {
6304 6338 case 0: /* off */
6305 6339 break;
6306 6340 case 1:
6307 6341 case 2:
6308 6342 case 4:
6309 6343 add_cache_prop(devi, label, assoc_str, assoc);
6310 6344 break;
6311 6345 case 6:
6312 6346 add_cache_prop(devi, label, assoc_str, 8);
6313 6347 break;
6314 6348 case 8:
6315 6349 add_cache_prop(devi, label, assoc_str, 16);
6316 6350 break;
6317 6351 case 0xf:
6318 6352 add_cache_prop(devi, label, fully_assoc, 1);
6319 6353 break;
6320 6354 default: /* reserved; ignore */
6321 6355 break;
6322 6356 }
6323 6357 }
6324 6358
6325 6359 static void
6326 6360 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6327 6361 {
6328 6362 if (size == 0 || assoc == 0)
6329 6363 return;
6330 6364 add_amd_l2_assoc(devi, label, assoc);
6331 6365 add_cache_prop(devi, label, size_str, size);
6332 6366 }
6333 6367
6334 6368 static void
6335 6369 add_amd_l2_cache(dev_info_t *devi, const char *label,
6336 6370 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6337 6371 {
6338 6372 if (size == 0 || assoc == 0 || line_size == 0)
6339 6373 return;
6340 6374 add_amd_l2_assoc(devi, label, assoc);
6341 6375 if (lines_per_tag != 0)
6342 6376 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6343 6377 add_cache_prop(devi, label, line_str, line_size);
6344 6378 add_cache_prop(devi, label, size_str, size * 1024);
6345 6379 }
6346 6380
6347 6381 static void
6348 6382 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6349 6383 {
6350 6384 struct cpuid_regs *cp;
6351 6385
6352 6386 if (cpi->cpi_xmaxeax < 0x80000005)
6353 6387 return;
6354 6388 cp = &cpi->cpi_extd[5];
6355 6389
6356 6390 /*
6357 6391 * 4M/2M L1 TLB configuration
6358 6392 *
6359 6393 * We report the size for 2M pages because AMD uses two
6360 6394 * TLB entries for one 4M page.
6361 6395 */
6362 6396 add_amd_tlb(devi, "dtlb-2M",
6363 6397 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6364 6398 add_amd_tlb(devi, "itlb-2M",
6365 6399 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6366 6400
6367 6401 /*
6368 6402 * 4K L1 TLB configuration
6369 6403 */
6370 6404
6371 6405 switch (cpi->cpi_vendor) {
6372 6406 uint_t nentries;
6373 6407 case X86_VENDOR_TM:
6374 6408 if (cpi->cpi_family >= 5) {
6375 6409 /*
6376 6410 * Crusoe processors have 256 TLB entries, but
6377 6411 * cpuid data format constrains them to only
6378 6412 * reporting 255 of them.
6379 6413 */
6380 6414 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6381 6415 nentries = 256;
6382 6416 /*
6383 6417 * Crusoe processors also have a unified TLB
6384 6418 */
6385 6419 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6386 6420 nentries);
6387 6421 break;
6388 6422 }
6389 6423 /*FALLTHROUGH*/
6390 6424 default:
6391 6425 add_amd_tlb(devi, itlb4k_str,
6392 6426 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6393 6427 add_amd_tlb(devi, dtlb4k_str,
6394 6428 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6395 6429 break;
6396 6430 }
6397 6431
6398 6432 /*
6399 6433 * data L1 cache configuration
6400 6434 */
6401 6435
6402 6436 add_amd_cache(devi, l1_dcache_str,
6403 6437 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6404 6438 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6405 6439
6406 6440 /*
6407 6441 * code L1 cache configuration
6408 6442 */
6409 6443
6410 6444 add_amd_cache(devi, l1_icache_str,
6411 6445 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6412 6446 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6413 6447
6414 6448 if (cpi->cpi_xmaxeax < 0x80000006)
6415 6449 return;
6416 6450 cp = &cpi->cpi_extd[6];
6417 6451
6418 6452 /* Check for a unified L2 TLB for large pages */
6419 6453
6420 6454 if (BITX(cp->cp_eax, 31, 16) == 0)
6421 6455 add_amd_l2_tlb(devi, "l2-tlb-2M",
6422 6456 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6423 6457 else {
6424 6458 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6425 6459 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6426 6460 add_amd_l2_tlb(devi, "l2-itlb-2M",
6427 6461 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6428 6462 }
6429 6463
6430 6464 /* Check for a unified L2 TLB for 4K pages */
6431 6465
6432 6466 if (BITX(cp->cp_ebx, 31, 16) == 0) {
6433 6467 add_amd_l2_tlb(devi, "l2-tlb-4K",
6434 6468 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6435 6469 } else {
6436 6470 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6437 6471 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6438 6472 add_amd_l2_tlb(devi, "l2-itlb-4K",
6439 6473 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6440 6474 }
6441 6475
6442 6476 add_amd_l2_cache(devi, l2_cache_str,
6443 6477 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6444 6478 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6445 6479 }
6446 6480
6447 6481 /*
6448 6482 * There are two basic ways that the x86 world describes its cache
6449 6483 * and tlb architecture - Intel's way and AMD's way.
6450 6484 *
6451 6485 * Return which flavor of cache architecture we should use
6452 6486 */
6453 6487 static int
6454 6488 x86_which_cacheinfo(struct cpuid_info *cpi)
6455 6489 {
6456 6490 switch (cpi->cpi_vendor) {
6457 6491 case X86_VENDOR_Intel:
6458 6492 if (cpi->cpi_maxeax >= 2)
6459 6493 return (X86_VENDOR_Intel);
6460 6494 break;
6461 6495 case X86_VENDOR_AMD:
6462 6496 /*
6463 6497 * The K5 model 1 was the first part from AMD that reported
6464 6498 * cache sizes via extended cpuid functions.
6465 6499 */
6466 6500 if (cpi->cpi_family > 5 ||
6467 6501 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6468 6502 return (X86_VENDOR_AMD);
6469 6503 break;
6470 6504 case X86_VENDOR_TM:
6471 6505 if (cpi->cpi_family >= 5)
6472 6506 return (X86_VENDOR_AMD);
6473 6507 /*FALLTHROUGH*/
6474 6508 default:
6475 6509 /*
6476 6510 * If they have extended CPU data for 0x80000005
6477 6511 * then we assume they have AMD-format cache
6478 6512 * information.
6479 6513 *
6480 6514 * If not, and the vendor happens to be Cyrix,
6481 6515 * then try our Cyrix-specific handler.
6482 6516 *
6483 6517 * If we're not Cyrix, then assume we're using Intel's
6484 6518 * table-driven format instead.
6485 6519 */
6486 6520 if (cpi->cpi_xmaxeax >= 0x80000005)
6487 6521 return (X86_VENDOR_AMD);
6488 6522 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6489 6523 return (X86_VENDOR_Cyrix);
6490 6524 else if (cpi->cpi_maxeax >= 2)
6491 6525 return (X86_VENDOR_Intel);
6492 6526 break;
6493 6527 }
6494 6528 return (-1);
6495 6529 }
6496 6530
6497 6531 void
6498 6532 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6499 6533 struct cpuid_info *cpi)
6500 6534 {
6501 6535 dev_info_t *cpu_devi;
6502 6536 int create;
6503 6537
6504 6538 cpu_devi = (dev_info_t *)dip;
6505 6539
6506 6540 /* device_type */
6507 6541 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6508 6542 "device_type", "cpu");
6509 6543
6510 6544 /* reg */
6511 6545 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6512 6546 "reg", cpu_id);
6513 6547
6514 6548 /* cpu-mhz, and clock-frequency */
6515 6549 if (cpu_freq > 0) {
6516 6550 long long mul;
6517 6551
6518 6552 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6519 6553 "cpu-mhz", cpu_freq);
6520 6554 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6521 6555 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6522 6556 "clock-frequency", (int)mul);
6523 6557 }
6524 6558
6525 6559 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6526 6560 return;
6527 6561 }
6528 6562
6529 6563 /* vendor-id */
6530 6564 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6531 6565 "vendor-id", cpi->cpi_vendorstr);
6532 6566
6533 6567 if (cpi->cpi_maxeax == 0) {
6534 6568 return;
6535 6569 }
6536 6570
6537 6571 /*
6538 6572 * family, model, and step
6539 6573 */
6540 6574 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6541 6575 "family", CPI_FAMILY(cpi));
6542 6576 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6543 6577 "cpu-model", CPI_MODEL(cpi));
6544 6578 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6545 6579 "stepping-id", CPI_STEP(cpi));
6546 6580
6547 6581 /* type */
6548 6582 switch (cpi->cpi_vendor) {
6549 6583 case X86_VENDOR_Intel:
6550 6584 create = 1;
6551 6585 break;
6552 6586 default:
6553 6587 create = 0;
6554 6588 break;
6555 6589 }
6556 6590 if (create)
6557 6591 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6558 6592 "type", CPI_TYPE(cpi));
6559 6593
6560 6594 /* ext-family */
6561 6595 switch (cpi->cpi_vendor) {
6562 6596 case X86_VENDOR_Intel:
6563 6597 case X86_VENDOR_AMD:
6564 6598 create = cpi->cpi_family >= 0xf;
6565 6599 break;
6566 6600 default:
6567 6601 create = 0;
6568 6602 break;
6569 6603 }
6570 6604 if (create)
6571 6605 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6572 6606 "ext-family", CPI_FAMILY_XTD(cpi));
6573 6607
6574 6608 /* ext-model */
6575 6609 switch (cpi->cpi_vendor) {
6576 6610 case X86_VENDOR_Intel:
6577 6611 create = IS_EXTENDED_MODEL_INTEL(cpi);
6578 6612 break;
6579 6613 case X86_VENDOR_AMD:
6580 6614 create = CPI_FAMILY(cpi) == 0xf;
6581 6615 break;
6582 6616 default:
6583 6617 create = 0;
6584 6618 break;
6585 6619 }
6586 6620 if (create)
6587 6621 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6588 6622 "ext-model", CPI_MODEL_XTD(cpi));
6589 6623
6590 6624 /* generation */
6591 6625 switch (cpi->cpi_vendor) {
6592 6626 case X86_VENDOR_AMD:
6593 6627 /*
6594 6628 * AMD K5 model 1 was the first part to support this
6595 6629 */
6596 6630 create = cpi->cpi_xmaxeax >= 0x80000001;
6597 6631 break;
6598 6632 default:
6599 6633 create = 0;
6600 6634 break;
6601 6635 }
6602 6636 if (create)
6603 6637 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6604 6638 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6605 6639
6606 6640 /* brand-id */
6607 6641 switch (cpi->cpi_vendor) {
6608 6642 case X86_VENDOR_Intel:
6609 6643 /*
6610 6644 * brand id first appeared on Pentium III Xeon model 8,
6611 6645 * and Celeron model 8 processors and Opteron
6612 6646 */
6613 6647 create = cpi->cpi_family > 6 ||
6614 6648 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6615 6649 break;
6616 6650 case X86_VENDOR_AMD:
6617 6651 create = cpi->cpi_family >= 0xf;
6618 6652 break;
6619 6653 default:
6620 6654 create = 0;
6621 6655 break;
6622 6656 }
6623 6657 if (create && cpi->cpi_brandid != 0) {
6624 6658 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6625 6659 "brand-id", cpi->cpi_brandid);
6626 6660 }
6627 6661
6628 6662 /* chunks, and apic-id */
6629 6663 switch (cpi->cpi_vendor) {
6630 6664 /*
6631 6665 * first available on Pentium IV and Opteron (K8)
6632 6666 */
6633 6667 case X86_VENDOR_Intel:
6634 6668 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6635 6669 break;
6636 6670 case X86_VENDOR_AMD:
6637 6671 create = cpi->cpi_family >= 0xf;
6638 6672 break;
6639 6673 default:
6640 6674 create = 0;
6641 6675 break;
6642 6676 }
6643 6677 if (create) {
6644 6678 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6645 6679 "chunks", CPI_CHUNKS(cpi));
6646 6680 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6647 6681 "apic-id", cpi->cpi_apicid);
6648 6682 if (cpi->cpi_chipid >= 0) {
6649 6683 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6650 6684 "chip#", cpi->cpi_chipid);
6651 6685 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6652 6686 "clog#", cpi->cpi_clogid);
6653 6687 }
6654 6688 }
6655 6689
6656 6690 /* cpuid-features */
6657 6691 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6658 6692 "cpuid-features", CPI_FEATURES_EDX(cpi));
6659 6693
6660 6694
6661 6695 /* cpuid-features-ecx */
6662 6696 switch (cpi->cpi_vendor) {
6663 6697 case X86_VENDOR_Intel:
6664 6698 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6665 6699 break;
6666 6700 case X86_VENDOR_AMD:
6667 6701 create = cpi->cpi_family >= 0xf;
6668 6702 break;
6669 6703 default:
6670 6704 create = 0;
6671 6705 break;
6672 6706 }
6673 6707 if (create)
6674 6708 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6675 6709 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6676 6710
6677 6711 /* ext-cpuid-features */
6678 6712 switch (cpi->cpi_vendor) {
6679 6713 case X86_VENDOR_Intel:
6680 6714 case X86_VENDOR_AMD:
6681 6715 case X86_VENDOR_Cyrix:
6682 6716 case X86_VENDOR_TM:
6683 6717 case X86_VENDOR_Centaur:
6684 6718 create = cpi->cpi_xmaxeax >= 0x80000001;
6685 6719 break;
6686 6720 default:
6687 6721 create = 0;
6688 6722 break;
6689 6723 }
6690 6724 if (create) {
6691 6725 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6692 6726 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6693 6727 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6694 6728 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6695 6729 }
6696 6730
6697 6731 /*
6698 6732 * Brand String first appeared in Intel Pentium IV, AMD K5
6699 6733 * model 1, and Cyrix GXm. On earlier models we try and
6700 6734 * simulate something similar .. so this string should always
6701 6735 * say -something- about the processor, however lame.
6702 6736 */
6703 6737 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6704 6738 "brand-string", cpi->cpi_brandstr);
6705 6739
6706 6740 /*
6707 6741 * Finally, cache and tlb information
6708 6742 */
6709 6743 switch (x86_which_cacheinfo(cpi)) {
6710 6744 case X86_VENDOR_Intel:
6711 6745 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6712 6746 break;
6713 6747 case X86_VENDOR_Cyrix:
6714 6748 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6715 6749 break;
6716 6750 case X86_VENDOR_AMD:
6717 6751 amd_cache_info(cpi, cpu_devi);
6718 6752 break;
6719 6753 default:
6720 6754 break;
6721 6755 }
6722 6756 }
6723 6757
6724 6758 struct l2info {
6725 6759 int *l2i_csz;
6726 6760 int *l2i_lsz;
6727 6761 int *l2i_assoc;
6728 6762 int l2i_ret;
6729 6763 };
6730 6764
6731 6765 /*
6732 6766 * A cacheinfo walker that fetches the size, line-size and associativity
6733 6767 * of the L2 cache
6734 6768 */
6735 6769 static int
6736 6770 intel_l2cinfo(void *arg, const struct cachetab *ct)
6737 6771 {
6738 6772 struct l2info *l2i = arg;
6739 6773 int *ip;
6740 6774
6741 6775 if (ct->ct_label != l2_cache_str &&
6742 6776 ct->ct_label != sl2_cache_str)
6743 6777 return (0); /* not an L2 -- keep walking */
6744 6778
6745 6779 if ((ip = l2i->l2i_csz) != NULL)
6746 6780 *ip = ct->ct_size;
6747 6781 if ((ip = l2i->l2i_lsz) != NULL)
6748 6782 *ip = ct->ct_line_size;
6749 6783 if ((ip = l2i->l2i_assoc) != NULL)
6750 6784 *ip = ct->ct_assoc;
6751 6785 l2i->l2i_ret = ct->ct_size;
6752 6786 return (1); /* was an L2 -- terminate walk */
6753 6787 }
6754 6788
6755 6789 /*
6756 6790 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6757 6791 *
6758 6792 * Unlike the associativity for the L1 cache and tlb where the 8 bit
6759 6793 * value is the associativity, the associativity for the L2 cache and
6760 6794 * tlb is encoded in the following table. The 4 bit L2 value serves as
6761 6795 * an index into the amd_afd[] array to determine the associativity.
6762 6796 * -1 is undefined. 0 is fully associative.
6763 6797 */
6764 6798
6765 6799 static int amd_afd[] =
6766 6800 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
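/*
 * For example, a raw field value of 6 maps to amd_afd[6] == 8 (8-way) and
 * 0xf maps to amd_afd[15] == 0 (fully associative); amd_l2cacheinfo() below
 * indexes this array with BITX(cp->cp_ecx, 15, 12).
 */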
6767 6801
6768 6802 static void
6769 6803 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6770 6804 {
6771 6805 struct cpuid_regs *cp;
6772 6806 uint_t size, assoc;
6773 6807 int i;
6774 6808 int *ip;
6775 6809
6776 6810 if (cpi->cpi_xmaxeax < 0x80000006)
6777 6811 return;
6778 6812 cp = &cpi->cpi_extd[6];
6779 6813
6780 6814 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6781 6815 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6782 6816 uint_t cachesz = size * 1024;
6783 6817 assoc = amd_afd[i];
6784 6818
6785 6819 ASSERT(assoc != -1);
6786 6820
6787 6821 if ((ip = l2i->l2i_csz) != NULL)
6788 6822 *ip = cachesz;
6789 6823 if ((ip = l2i->l2i_lsz) != NULL)
6790 6824 *ip = BITX(cp->cp_ecx, 7, 0);
6791 6825 if ((ip = l2i->l2i_assoc) != NULL)
6792 6826 *ip = assoc;
6793 6827 l2i->l2i_ret = cachesz;
6794 6828 }
6795 6829 }
6796 6830
6797 6831 int
6798 6832 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6799 6833 {
6800 6834 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6801 6835 struct l2info __l2info, *l2i = &__l2info;
6802 6836
6803 6837 l2i->l2i_csz = csz;
6804 6838 l2i->l2i_lsz = lsz;
6805 6839 l2i->l2i_assoc = assoc;
6806 6840 l2i->l2i_ret = -1;
6807 6841
6808 6842 switch (x86_which_cacheinfo(cpi)) {
6809 6843 case X86_VENDOR_Intel:
6810 6844 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6811 6845 break;
6812 6846 case X86_VENDOR_Cyrix:
6813 6847 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6814 6848 break;
6815 6849 case X86_VENDOR_AMD:
6816 6850 amd_l2cacheinfo(cpi, l2i);
6817 6851 break;
6818 6852 default:
6819 6853 break;
6820 6854 }
6821 6855 return (l2i->l2i_ret);
6822 6856 }
6823 6857
6824 6858 #if !defined(__xpv)
6825 6859
6826 6860 uint32_t *
6827 6861 cpuid_mwait_alloc(cpu_t *cpu)
6828 6862 {
6829 6863 uint32_t *ret;
6830 6864 size_t mwait_size;
6831 6865
6832 6866 ASSERT(cpuid_checkpass(CPU, 2));
6833 6867
6834 6868 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6835 6869 if (mwait_size == 0)
6836 6870 return (NULL);
6837 6871
6838 6872 /*
6839 6873 * kmem_alloc() returns cache line size aligned data for mwait_size
6840 6874 * allocations. mwait_size is currently cache line sized. Neither
6841 6875 * of these implementation details is guaranteed to be true in the
6842 6876 * future.
6843 6877 *
6844 6878 * First try allocating mwait_size as kmem_alloc() currently returns
6845 6879 * correctly aligned memory. If kmem_alloc() does not return
6846 6880 * mwait_size aligned memory, then use mwait_size ROUNDUP.
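 * For example, with a (power-of-two) 64-byte mwait_size, the fallback
 * allocates 128 bytes and rounds the pointer up with P2ROUNDUP, which
 * always yields an aligned, mwait_size-long region inside the buffer.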
6847 6881 *
6848 6882 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6849 6883 * decide to free this memory.
6850 6884 */
6851 6885 ret = kmem_zalloc(mwait_size, KM_SLEEP);
6852 6886 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6853 6887 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6854 6888 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6855 6889 *ret = MWAIT_RUNNING;
6856 6890 return (ret);
6857 6891 } else {
6858 6892 kmem_free(ret, mwait_size);
6859 6893 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6860 6894 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6861 6895 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6862 6896 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6863 6897 *ret = MWAIT_RUNNING;
6864 6898 return (ret);
6865 6899 }
6866 6900 }
6867 6901
6868 6902 void
6869 6903 cpuid_mwait_free(cpu_t *cpu)
6870 6904 {
6871 6905 if (cpu->cpu_m.mcpu_cpi == NULL) {
6872 6906 return;
6873 6907 }
6874 6908
6875 6909 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6876 6910 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6877 6911 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6878 6912 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6879 6913 }
6880 6914
6881 6915 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6882 6916 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6883 6917 }
6884 6918
6885 6919 void
6886 6920 patch_tsc_read(int flag)
6887 6921 {
6888 6922 size_t cnt;
6889 6923
6890 6924 switch (flag) {
6891 6925 case TSC_NONE:
6892 6926 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6893 6927 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6894 6928 break;
6895 6929 case TSC_RDTSC_MFENCE:
6896 6930 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6897 6931 (void) memcpy((void *)tsc_read,
6898 6932 (void *)&_tsc_mfence_start, cnt);
6899 6933 break;
6900 6934 case TSC_RDTSC_LFENCE:
6901 6935 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6902 6936 (void) memcpy((void *)tsc_read,
6903 6937 (void *)&_tsc_lfence_start, cnt);
6904 6938 break;
6905 6939 case TSC_TSCP:
6906 6940 cnt = &_tscp_end - &_tscp_start;
6907 6941 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6908 6942 break;
6909 6943 default:
6910 6944 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6911 6945 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
6912 6946 break;
6913 6947 }
6914 6948 tsc_type = flag;
6915 6949 }
6916 6950
6917 6951 int
6918 6952 cpuid_deep_cstates_supported(void)
6919 6953 {
6920 6954 struct cpuid_info *cpi;
6921 6955 struct cpuid_regs regs;
6922 6956
6923 6957 ASSERT(cpuid_checkpass(CPU, 1));
6924 6958
6925 6959 cpi = CPU->cpu_m.mcpu_cpi;
6926 6960
6927 6961 if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6928 6962 return (0);
6929 6963
6930 6964 switch (cpi->cpi_vendor) {
6931 6965 case X86_VENDOR_Intel:
6932 6966 if (cpi->cpi_xmaxeax < 0x80000007)
6933 6967 return (0);
6934 6968
6935 6969 /*
6936 6970 * Does the TSC run at a constant rate in all ACPI C-states?
6937 6971 */
6938 6972 regs.cp_eax = 0x80000007;
6939 6973 (void) __cpuid_insn(&regs);
6940 6974 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6941 6975
6942 6976 default:
6943 6977 return (0);
6944 6978 }
6945 6979 }
6946 6980
6947 6981 #endif /* !__xpv */
6948 6982
6949 6983 void
6950 6984 post_startup_cpu_fixups(void)
6951 6985 {
6952 6986 #ifndef __xpv
6953 6987 /*
6954 6988 * Some AMD processors support C1E state. Entering this state will
6955 6989 * cause the local APIC timer to stop, which we can't deal with at
6956 6990 * this time.
6957 6991 */
6958 6992 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6959 6993 on_trap_data_t otd;
6960 6994 uint64_t reg;
6961 6995
6962 6996 if (!on_trap(&otd, OT_DATA_ACCESS)) {
6963 6997 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6964 6998 /* Disable C1E state if it is enabled by BIOS */
6965 6999 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
6966 7000 AMD_ACTONCMPHALT_MASK) {
6967 7001 reg &= ~(AMD_ACTONCMPHALT_MASK <<
6968 7002 AMD_ACTONCMPHALT_SHIFT);
6969 7003 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
6970 7004 }
6971 7005 }
6972 7006 no_trap();
6973 7007 }
6974 7008 #endif /* !__xpv */
6975 7009 }
6976 7010
6977 7011 void
6978 7012 enable_pcid(void)
6979 7013 {
6980 7014 if (x86_use_pcid == -1)
6981 7015 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
6982 7016
6983 7017 if (x86_use_invpcid == -1) {
6984 7018 x86_use_invpcid = is_x86_feature(x86_featureset,
6985 7019 X86FSET_INVPCID);
6986 7020 }
6987 7021
6988 7022 if (!x86_use_pcid)
6989 7023 return;
6990 7024
6991 7025 /*
6992 7026 * Intel says that on setting PCIDE, it immediately starts using the PCID
6993 7027 * bits; better make sure there's nothing there.
6994 7028 */
6995 7029 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
6996 7030
6997 7031 setcr4(getcr4() | CR4_PCIDE);
6998 7032 }
6999 7033
7000 7034 /*
7001 7035 * Setup necessary registers to enable XSAVE feature on this processor.
7002 7036 * This function needs to be called early enough, so that no xsave/xrstor
7003 7037 * ops will execute on the processor before the MSRs are properly set up.
7004 7038 *
7005 7039 * Current implementation has the following assumption:
7006 7040 * - cpuid_pass1() is done, so that X86 features are known.
7007 7041 * - fpu_probe() is done, so that fp_save_mech is chosen.
7008 7042 */
7009 7043 void
7010 7044 xsave_setup_msr(cpu_t *cpu)
7011 7045 {
7012 7046 ASSERT(fp_save_mech == FP_XSAVE);
7013 7047 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7014 7048
7015 7049 /* Enable OSXSAVE in CR4. */
7016 7050 setcr4(getcr4() | CR4_OSXSAVE);
7017 7051 /*
7018 7052 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7019 7053 * correct value.
7020 7054 */
7021 7055 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7022 7056 setup_xfem();
7023 7057 }
7024 7058
7025 7059 /*
7026 7060 * Starting with the Westmere processor the local
7027 7061 * APIC timer will continue running in all C-states,
7028 7062 * including the deepest C-states.
7029 7063 */
7030 7064 int
7031 7065 cpuid_arat_supported(void)
7032 7066 {
7033 7067 struct cpuid_info *cpi;
7034 7068 struct cpuid_regs regs;
7035 7069
7036 7070 ASSERT(cpuid_checkpass(CPU, 1));
7037 7071 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7038 7072
7039 7073 cpi = CPU->cpu_m.mcpu_cpi;
7040 7074
7041 7075 switch (cpi->cpi_vendor) {
7042 7076 case X86_VENDOR_Intel:
7043 7077 /*
7044 7078 * Always-running Local APIC Timer is
7045 7079 * indicated by CPUID.6.EAX[2].
7046 7080 */
7047 7081 if (cpi->cpi_maxeax >= 6) {
7048 7082 regs.cp_eax = 6;
7049 7083 (void) cpuid_insn(NULL, &regs);
7050 7084 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7051 7085 } else {
7052 7086 return (0);
7053 7087 }
7054 7088 default:
7055 7089 return (0);
7056 7090 }
7057 7091 }
7058 7092
7059 7093 /*
7060 7094 * Check support for Intel ENERGY_PERF_BIAS feature
7061 7095 */
7062 7096 int
7063 7097 cpuid_iepb_supported(struct cpu *cp)
7064 7098 {
7065 7099 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7066 7100 struct cpuid_regs regs;
7067 7101
7068 7102 ASSERT(cpuid_checkpass(cp, 1));
7069 7103
7070 7104 if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7071 7105 !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7072 7106 return (0);
7073 7107 }
7074 7108
7075 7109 /*
7076 7110 * Intel ENERGY_PERF_BIAS MSR is indicated by
7077 7111 * capability bit CPUID.6.ECX.3
7078 7112 */
7079 7113 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7080 7114 return (0);
7081 7115
7082 7116 regs.cp_eax = 0x6;
7083 7117 (void) cpuid_insn(NULL, &regs);
7084 7118 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7085 7119 }
7086 7120
7087 7121 /*
7088 7122 * Check support for TSC deadline timer
7089 7123 *
7090 7124 * TSC deadline timer provides a superior software programming
7091 7125 * model over local APIC timer that eliminates "time drifts".
7092 7126 * Instead of specifying a relative time, software specifies an
7093 7127 * absolute time as the target at which the processor should
7094 7128 * generate a timer event.
7095 7129 */
7096 7130 int
7097 7131 cpuid_deadline_tsc_supported(void)
7098 7132 {
7099 7133 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7100 7134 struct cpuid_regs regs;
7101 7135
7102 7136 ASSERT(cpuid_checkpass(CPU, 1));
7103 7137 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7104 7138
7105 7139 switch (cpi->cpi_vendor) {
7106 7140 case X86_VENDOR_Intel:
7107 7141 if (cpi->cpi_maxeax >= 1) {
7108 7142 regs.cp_eax = 1;
7109 7143 (void) cpuid_insn(NULL, &regs);
7110 7144 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7111 7145 } else {
7112 7146 return (0);
7113 7147 }
7114 7148 default:
7115 7149 return (0);
7116 7150 }
7117 7151 }
7118 7152
7119 7153 #if defined(__amd64) && !defined(__xpv)
7120 7154 /*
7121 7155 * Patch in versions of bcopy for high performance Intel Nhm processors
7122 7156 * and later...
7123 7157 */
7124 7158 void
7125 7159 patch_memops(uint_t vendor)
7126 7160 {
7127 7161 size_t cnt, i;
7128 7162 caddr_t to, from;
7129 7163
7130 7164 if ((vendor == X86_VENDOR_Intel) &&
7131 7165 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7132 7166 cnt = &bcopy_patch_end - &bcopy_patch_start;
7133 7167 to = &bcopy_ck_size;
7134 7168 from = &bcopy_patch_start;
7135 7169 for (i = 0; i < cnt; i++) {
7136 7170 *to++ = *from++;
7137 7171 }
7138 7172 }
7139 7173 }
7140 7174 #endif /* __amd64 && !__xpv */
7141 7175
7142 7176 /*
7143 7177 * We're being asked to tell the system how many bits are required to represent
7144 7178 * the various thread and strand IDs. While it's tempting to derive this based
7145 7179 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7146 7180 * correct. Instead, this needs to be based on the number of bits that the APIC
7147 7181 * allows for these different configurations. We only update these to a larger
7148 7182 * value if we find one.
7149 7183 */
7150 7184 void
7151 7185 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7152 7186 {
7153 7187 struct cpuid_info *cpi;
7154 7188
7155 7189 VERIFY(cpuid_checkpass(CPU, 1));
7156 7190 cpi = cpu->cpu_m.mcpu_cpi;
7157 7191
7158 7192 if (cpi->cpi_ncore_bits > *core_nbits) {
7159 7193 *core_nbits = cpi->cpi_ncore_bits;
7160 7194 }
7161 7195
7162 7196 if (cpi->cpi_nthread_bits > *strand_nbits) {
7163 7197 *strand_nbits = cpi->cpi_nthread_bits;
7164 7198 }
7165 7199 }
7166 7200
7167 7201 void
7168 7202 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7169 7203 {
7170 7204 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7171 7205 struct cpuid_regs cp;
7172 7206
7173 7207 /*
7174 7208 * Reread the CPUID portions that we need for various security
7175 7209 * information.
7176 7210 */
7177 7211 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7178 7212 /*
7179 7213 * Check if we now have leaf 7 available to us.
7180 7214 */
7181 7215 if (cpi->cpi_maxeax < 7) {
7182 7216 bzero(&cp, sizeof (cp));
7183 7217 cp.cp_eax = 0;
7184 7218 cpi->cpi_maxeax = __cpuid_insn(&cp);
7185 7219 if (cpi->cpi_maxeax < 7)
7186 7220 return;
7187 7221 }
7188 7222
7189 7223 bzero(&cp, sizeof (cp));
7190 7224 cp.cp_eax = 7;
7191 7225 cp.cp_ecx = 0;
7192 7226 (void) __cpuid_insn(&cp);
7193 7227 cpi->cpi_std[7] = cp;
7194 7228 } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7195 7229 /* No xcpuid support */
7196 7230 if (cpi->cpi_family < 5 ||
7197 7231 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7198 7232 return;
7199 7233
7200 7234 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7201 7235 bzero(&cp, sizeof (cp));
7202 7236 cp.cp_eax = CPUID_LEAF_EXT_0;
7203 7237 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7204 7238 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7205 7239 return;
7206 7240 }
7207 7241 }
7208 7242
7209 7243 bzero(&cp, sizeof (cp));
7210 7244 cp.cp_eax = CPUID_LEAF_EXT_8;
7211 7245 (void) __cpuid_insn(&cp);
7212 7246 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7213 7247 cpi->cpi_extd[8] = cp;
7214 7248 } else {
7215 7249 /*
7216 7250 * Nothing to do here. Return an empty set which has already
7217 7251 * been zeroed for us.
7218 7252 */
7219 7253 return;
7220 7254 }
7221 7255 cpuid_scan_security(cpu, fset);
7222 7256 }
7223 7257
7224 7258 /* ARGSUSED */
7225 7259 static int
7226 7260 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7227 7261 {
7228 7262 uchar_t *fset;
7229 7263 boolean_t first_pass = (boolean_t)arg1;
7230 7264
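/* arg0 points at an NCPU-sized array of feature sets; take this CPU's slot */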
7231 7265 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7232 7266 if (first_pass && CPU->cpu_id != 0)
7233 7267 return (0);
7234 7268 if (!first_pass && CPU->cpu_id == 0)
7235 7269 return (0);
7236 7270 cpuid_pass_ucode(CPU, fset);
7237 7271
7238 7272 return (0);
7239 7273 }
7240 7274
7241 7275 /*
7242 7276 * After a microcode update where the version has changed, we need to
7243 7277 * rescan CPUID. To do this we check every CPU to make sure that they have the
7244 7278 * same microcode. Then we perform a cross call to all such CPUs. It's the
7245 7279 * caller's job to make sure that no one else can end up doing an update while
7246 7280 * this is going on.
7247 7281 *
7248 7282 * We assume that the system is microcode capable if we're called.
7249 7283 */
7250 7284 void
7251 7285 cpuid_post_ucodeadm(void)
7252 7286 {
7253 7287 uint32_t rev;
7254 7288 int i;
7255 7289 struct cpu *cpu;
7256 7290 cpuset_t cpuset;
7257 7291 void *argdata;
7258 7292 uchar_t *f0;
7259 7293
7260 7294 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7261 7295
7262 7296 mutex_enter(&cpu_lock);
7263 7297 cpu = cpu_get(0);
7264 7298 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7265 7299 CPUSET_ONLY(cpuset, 0);
7266 7300 for (i = 1; i < max_ncpus; i++) {
7267 7301 if ((cpu = cpu_get(i)) == NULL)
7268 7302 continue;
7269 7303
7270 7304 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7271 7305 panic("post microcode update CPU %d has differing "
7272 7306 "microcode revision (%u) from CPU 0 (%u)",
7273 7307 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7274 7308 }
7275 7309 CPUSET_ADD(cpuset, i);
7276 7310 }
7277 7311
7278 7312 /*
7279 7313 * We do the cross calls in two passes. The first pass is only for the
7280 7314 * boot CPU. The second pass is for all of the other CPUs. This allows
7281 7315 * the boot CPU to go through and change behavior related to patching or
7282 7316 * whether or not Enhanced IBRS needs to be enabled and then allow all
7283 7317 * other CPUs to follow suit.
7284 7318 */
7285 7319 kpreempt_disable();
7286 7320 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7287 7321 cpuid_post_ucodeadm_xc);
7288 7322 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7289 7323 cpuid_post_ucodeadm_xc);
7290 7324 kpreempt_enable();
7291 7325
7292 7326 /*
7293 7327 * OK, now look at each CPU and see if their feature sets are equal.
7294 7328 */
7295 7329 f0 = argdata;
7296 7330 for (i = 1; i < max_ncpus; i++) {
7297 7331 uchar_t *fset;
7298 7332 if (!CPU_IN_SET(cpuset, i))
7299 7333 continue;
7300 7334
7301 7335 fset = (uchar_t *)((uintptr_t)argdata +
7302 7336 sizeof (x86_featureset) * i);
7303 7337
7304 7338 if (!compare_x86_featureset(f0, fset)) {
7305 7339 panic("Post microcode update CPU %d has "
7306 7340 "differing security feature (%p) set from CPU 0 "
7307 7341 "(%p), not appending to feature set", i,
7308 7342 (void *)fset, (void *)f0);
7309 7343 }
7310 7344 }
7311 7345
7312 7346 mutex_exit(&cpu_lock);
7313 7347
7314 7348 for (i = 0; i < NUM_X86_FEATURES; i++) {
7315 7349 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7316 7350 x86_feature_names[i]);
7317 7351 if (is_x86_feature(f0, i)) {
7318 7352 add_x86_feature(x86_featureset, i);
7319 7353 }
7320 7354 }
7321 7355 kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7322 7356 }