11967 need TAA mitigation
Portions contributed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Dan McDonald <danmcd@joyent.com>
--- old/usr/src/uts/i86pc/os/cpuid.c
+++ new/usr/src/uts/i86pc/os/cpuid.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 26 */
27 27 /*
28 28 * Copyright (c) 2010, Intel Corporation.
29 29 * All rights reserved.
30 30 */
31 31 /*
32 32 * Portions Copyright 2009 Advanced Micro Devices, Inc.
33 33 */
34 34 /*
35 - * Copyright 2019 Joyent, Inc.
35 + * Copyright 2020 Joyent, Inc.
36 36 */
37 37
38 38 /*
39 39 * CPU Identification logic
40 40 *
41 41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 42 * with the identification of CPUs, their features, and their topologies. More
43 43 * specifically, this file helps drive the following:
44 44 *
45 45 * 1. Enumeration of features of the processor which are used by the kernel to
46 46 * determine what features to enable or disable. These may be instruction set
47 47 * enhancements or features that we use.
48 48 *
49 49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 50 * will be told about through the auxiliary vector.
51 51 *
52 52 * 3. Understanding the physical topology of the CPU such as the number of
  53   53   * caches, how many cores it has, whether or not it supports simultaneous
  54   54   * multi-threading (SMT), etc.
55 55 *
56 56 * ------------------------
57 57 * CPUID History and Basics
58 58 * ------------------------
59 59 *
60 60 * The cpuid instruction was added by Intel roughly around the time that the
61 61 * original Pentium was introduced. The purpose of cpuid was to tell in a
62 62 * programmatic fashion information about the CPU that previously was guessed
63 63 * at. For example, an important part of cpuid is that we can know what
64 64 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 65 * #UD, so this method allows a program (whether a user program or the kernel)
66 66 * to determine what exists without crashing or getting a SIGILL. Of course,
67 67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 68 * name shows up first in cpuid for a reason.
69 69 *
70 70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71 71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 72 * its own meaning. The different leaves are broken down into different regions:
73 73 *
74 74 * [ 0, 7fffffff ] This region is called the 'basic'
75 75 * region. This region is generally defined
76 76 * by Intel, though some of the original
77 77 * portions have different meanings based
78 78 * on the manufacturer. These days, Intel
79 79 * adds most new features to this region.
80 80 * AMD adds non-Intel compatible
81 81 * information in the third, extended
82 82 * region. Intel uses this for everything
83 83 * including ISA extensions, CPU
84 84 * features, cache information, topology,
85 85 * and more.
86 86 *
87 87 * There is a hole carved out of this
88 88 * region which is reserved for
89 89 * hypervisors.
90 90 *
91 91 * [ 40000000, 4fffffff ] This region, which is found in the
92 92 * middle of the previous region, is
93 93 * explicitly promised to never be used by
94 94 * CPUs. Instead, it is used by hypervisors
95 95 * to communicate information about
96 96 * themselves to the operating system. The
97 97 * values and details are unique for each
98 98 * hypervisor.
99 99 *
100 100 * [ 80000000, ffffffff ] This region is called the 'extended'
101 101 * region. Some of the low leaves mirror
102 102 * parts of the basic leaves. This region
103 103 * has generally been used by AMD for
104 104 * various extensions. For example, AMD-
105 105 * specific information about caches,
106 106 * features, and topology are found in this
107 107 * region.
108 108 *
109 109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110 110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111 111 * the ranges, one of the primary things returned is the maximum valid leaf in
112 112 * that range. This allows for discovery of what range of CPUID is valid.
113 113 *
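As a rough illustration of the mechanics above, the following user-level sketch (GCC-style inline assembly, not the kernel's own cpuid accessors) issues the instruction and reads back the maximum basic and extended leaves:

    #include <stdio.h>
    #include <stdint.h>

    /* Issue cpuid for the given leaf/sub-leaf and return all four registers. */
    static void
    cpuid_leaf(uint32_t leaf, uint32_t subleaf, uint32_t *eax, uint32_t *ebx,
        uint32_t *ecx, uint32_t *edx)
    {
            __asm__ __volatile__("cpuid"
                : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                : "a" (leaf), "c" (subleaf));
    }

    int
    main(void)
    {
            uint32_t eax, ebx, ecx, edx;

            /* Leaf 0: %eax comes back with the maximum valid basic leaf. */
            cpuid_leaf(0, 0, &eax, &ebx, &ecx, &edx);
            (void) printf("max basic leaf: 0x%x\n", eax);

            /* Leaf 0x80000000: %eax has the maximum valid extended leaf. */
            cpuid_leaf(0x80000000U, 0, &eax, &ebx, &ecx, &edx);
            (void) printf("max extended leaf: 0x%x\n", eax);
            return (0);
    }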
114 114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 120 * an invalid extended leaf will return the information for leaf 3.
121 121 *
122 122 * Some leaves are broken down into sub-leaves. This means that the value
123 123 * depends on both the leaf asked for in %eax and a secondary register. For
124 124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 125 * additional information. Or when getting topology information in leaf 0xb, the
 126  126   * initial value in %ecx changes which level of the topology you are
 127  127   * getting information about.
128 128 *
129 129 * cpuid values are always kept to 32 bits regardless of whether or not the
130 130 * program is in 64-bit mode. When executing in 64-bit mode, the upper
 131  131   * 32 bits of the register are always set to zero so that the values are the
132 132 * same regardless of execution mode.
133 133 *
134 134 * ----------------------
135 135 * Identifying Processors
136 136 * ----------------------
137 137 *
138 138 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 139 * 0. Leaf 0 contains the processor's vendor information. This is done by
140 140 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141 141 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
142 142 *
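For illustration, a minimal user-level sketch of reading the vendor string follows (not the kernel's own code); note that the 12 characters come back in %ebx, %edx, %ecx order:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint32_t eax, regs[3];
            char vendor[13];

            /* Leaf 0: the vendor string is spread across %ebx, %edx, %ecx. */
            __asm__ __volatile__("cpuid"
                : "=a" (eax), "=b" (regs[0]), "=d" (regs[1]), "=c" (regs[2])
                : "a" (0));
            (void) memcpy(vendor, regs, sizeof (regs));
            vendor[12] = '\0';
            (void) printf("vendor: %s\n", vendor);
            return (0);
    }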
143 143 * From there, a processor is identified by a combination of three different
144 144 * values:
145 145 *
146 146 * 1. Family
147 147 * 2. Model
148 148 * 3. Stepping
149 149 *
150 150 * Each vendor uses the family and model to uniquely identify a processor. The
151 151 * way that family and model are changed depends on the vendor. For example,
 152  152   * Intel has been using family 0x6 for almost all of their processors since the
153 153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 154 * identify the exact processor. Different models are often used for the client
155 155 * (consumer) and server parts. Even though each processor often has major
156 156 * architectural differences, they still are considered the same family by
157 157 * Intel.
158 158 *
159 159 * On the other hand, each major AMD architecture generally has its own family.
 160  160   * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
 161  161   * family, the model number is used to help identify specific processors.
162 162 *
163 163 * The stepping is used to refer to a revision of a specific microprocessor. The
164 164 * term comes from equipment used to produce masks that are used to create
165 165 * integrated circuits.
166 166 *
 167  167   * The information is present in leaf 1, %eax. In technical documentation you
 168  168   * will see the terms extended model and extended family. The original family,
 169  169   * model, and stepping fields were each 4 bits wide. If the base family is 0xf,
 170  170   * then the extended family and extended model fields, which occupy previously
 171  171   * reserved bits, come into play: the extended family is added to the base
 172  172   * family, and the extended model bits are prepended above the base model bits.
173 173 *
174 174 * When we process this information, we store the full family, model, and
175 175 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176 176 * cpi_step, respectively. Whenever you are performing comparisons with the
177 177 * family, model, and stepping, you should use these members and not the raw
178 178 * values from cpuid. If you must use the raw values from cpuid directly, you
179 179 * must make sure that you add the extended model and family to the base model
180 180 * and family.
181 181 *
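As a hedged sketch of the arithmetic described above (not the kernel's actual routine), the effective family, model, and stepping can be derived from the raw leaf 1 %eax value roughly as follows; Intel also consults the extended model when the base family is 0x6:

    #include <stdint.h>

    typedef struct {
            uint32_t family;
            uint32_t model;
            uint32_t stepping;
    } fms_t;

    /* Fold the extended family/model fields of leaf 1 %eax into the base values. */
    static fms_t
    decode_fms(uint32_t eax)
    {
            fms_t f;
            uint32_t base_family = (eax >> 8) & 0xf;
            uint32_t base_model = (eax >> 4) & 0xf;
            uint32_t ext_family = (eax >> 20) & 0xff;
            uint32_t ext_model = (eax >> 16) & 0xf;

            f.stepping = eax & 0xf;
            f.family = base_family;
            f.model = base_model;

            if (base_family == 0xf)
                    f.family += ext_family;
            if (base_family == 0xf || base_family == 0x6)
                    f.model += ext_model << 4;
            return (f);
    }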
182 182 * In general, we do not use information about the family, model, and stepping
183 183 * to determine whether or not a feature is present; that is generally driven by
184 184 * specific leaves. However, when something we care about on the processor is
185 185 * not considered 'architectural' meaning that it is specific to a set of
186 186 * processors and not promised in the architecture model to be consistent from
187 187 * generation to generation, then we will fall back on this information. The
188 188 * most common cases where this comes up is when we have to workaround errata in
189 189 * the processor, are dealing with processor-specific features such as CPU
190 190 * performance counters, or we want to provide additional information for things
191 191 * such as fault management.
192 192 *
193 193 * While processors also do have a brand string, which is the name that people
194 194 * are familiar with when buying the processor, they are not meant for
195 195 * programmatic consumption. That is what the family, model, and stepping are
196 196 * for.
197 197 *
198 198 * ------------
199 199 * CPUID Passes
200 200 * ------------
201 201 *
202 202 * As part of performing feature detection, we break this into several different
203 203 * passes. The passes are as follows:
204 204 *
205 205 * Pass 0 This is a primordial pass done in locore.s to deal with
206 206 * Cyrix CPUs that don't support cpuid. The reality is that
207 207 * we likely don't run on them any more, but there is still
208 208 * logic for handling them.
209 209 *
210 210 * Pass 1 This is the primary pass and is responsible for doing a
211 211 * large number of different things:
212 212 *
 213  213   * 1. Determining which vendor manufactured the CPU and
 214  214   * determining the family, model, and stepping information.
215 215 *
216 216 * 2. Gathering a large number of feature flags to
 217  217   * determine which features the CPU supports and which
218 218 * indicate things that we need to do other work in the OS
219 219 * to enable. Features detected this way are added to the
220 220 * x86_featureset which can be queried to
221 221 * determine what we should do. This includes processing
222 222 * all of the basic and extended CPU features that we care
223 223 * about.
224 224 *
225 225 * 3. Determining the CPU's topology. This includes
226 226 * information about how many cores and threads are present
227 227 * in the package. It also is responsible for figuring out
228 228 * which logical CPUs are potentially part of the same core
229 229 * and what other resources they might share. For more
230 230 * information see the 'Topology' section.
231 231 *
232 232 * 4. Determining the set of CPU security-specific features
233 233 * that we need to worry about and determine the
234 234 * appropriate set of workarounds.
235 235 *
236 236 * Pass 1 on the boot CPU occurs before KMDB is started.
237 237 *
238 238 * Pass 2 The second pass is done after startup(). Here, we check
239 239 * other miscellaneous features. Most of this is gathering
240 240 * additional basic and extended features that we'll use in
241 241 * later passes or for debugging support.
242 242 *
243 243 * Pass 3 The third pass occurs after the kernel memory allocator
244 244 * has been fully initialized. This gathers information
245 245 * where we might need dynamic memory available for our
246 246 * uses. This includes several varying width leaves that
247 247 * have cache information and the processor's brand string.
248 248 *
249 249 * Pass 4 The fourth and final normal pass is performed after the
250 250 * kernel has brought most everything online. This is
251 251 * invoked from post_startup(). In this pass, we go through
252 252 * the set of features that we have enabled and turn that
253 253 * into the hardware auxiliary vector features that
254 254 * userland receives. This is used by userland, primarily
255 255 * by the run-time link-editor (RTLD), though userland
256 256 * software could also refer to it directly.
257 257 *
258 258 * Microcode After a microcode update, we do a selective rescan of
259 259 * the cpuid leaves to determine what features have
260 260 * changed. Microcode updates can provide more details
261 261 * about security related features to deal with issues like
262 262 * Spectre and L1TF. On occasion, vendors have violated
263 263 * their contract and removed bits. However, we don't try
264 264 * to detect that because that puts us in a situation that
 265  265   * we really can't deal with. As such, the only things we
266 266 * rescan are security related features today. See
267 267 * cpuid_pass_ucode().
268 268 *
269 269 * All of the passes (except pass 0) are run on all CPUs. However, for the most
270 270 * part we only care about what the boot CPU says about this information and use
271 271 * the other CPUs as a rough guide to sanity check that we have the same feature
272 272 * set.
273 273 *
274 274 * We do not support running multiple logical CPUs with disjoint, let alone
275 275 * different, feature sets.
276 276 *
277 277 * ------------------
278 278 * Processor Topology
279 279 * ------------------
280 280 *
281 281 * One of the important things that we need to do is to understand the topology
282 282 * of the underlying processor. When we say topology in this case, we're trying
283 283 * to understand the relationship between the logical CPUs that the operating
284 284 * system sees and the underlying physical layout. Different logical CPUs may
285 285 * share different resources which can have important consequences for the
286 286 * performance of the system. For example, they may share caches, execution
287 287 * units, and more.
288 288 *
289 289 * The topology of the processor changes from generation to generation and
290 290 * vendor to vendor. Along with that, different vendors use different
291 291 * terminology, and the operating system itself uses occasionally overlapping
292 292 * terminology. It's important to understand what this topology looks like so
293 293 * one can understand the different things that we try to calculate and
294 294 * determine.
295 295 *
296 296 * To get started, let's talk about a little bit of terminology that we've used
297 297 * so far, is used throughout this file, and is fairly generic across multiple
298 298 * vendors:
299 299 *
300 300 * CPU
301 301 * A central processing unit (CPU) refers to a logical and/or virtual
302 302 * entity that the operating system can execute instructions on. The
303 303 * underlying resources for this CPU may be shared between multiple
304 304 * entities; however, to the operating system it is a discrete unit.
305 305 *
306 306 * PROCESSOR and PACKAGE
307 307 *
308 308 * Generally, when we use the term 'processor' on its own, we are referring
309 309 * to the physical entity that one buys and plugs into a board. However,
310 310 * because processor has been overloaded and one might see it used to mean
311 311 * multiple different levels, we will instead use the term 'package' for
312 312 * the rest of this file. The term package comes from the electrical
313 313 * engineering side and refers to the physical entity that encloses the
314 314 * electronics inside. Strictly speaking the package can contain more than
315 315 * just the CPU, for example, on many processors it may also have what's
 316  316   * called an 'integrated graphics processing unit (GPU)'. Because the
317 317 * package can encapsulate multiple units, it is the largest physical unit
318 318 * that we refer to.
319 319 *
320 320 * SOCKET
321 321 *
 322  322   * A socket refers to a unit on a system board (generally the motherboard)
323 323 * that can receive a package. A single package, or processor, is plugged
324 324 * into a single socket. A system may have multiple sockets. Often times,
325 325 * the term socket is used interchangeably with package and refers to the
 326  326   * electrical component that has been plugged in, and not the receptacle itself.
327 327 *
328 328 * CORE
329 329 *
330 330 * A core refers to the physical instantiation of a CPU, generally, with a
331 331 * full set of hardware resources available to it. A package may contain
332 332 * multiple cores inside of it or it may just have a single one. A
333 333 * processor with more than one core is often referred to as 'multi-core'.
334 334 * In illumos, we will use the feature X86FSET_CMP to refer to a system
335 335 * that has 'multi-core' processors.
336 336 *
337 337 * A core may expose a single logical CPU to the operating system, or it
338 338 * may expose multiple CPUs, which we call threads, defined below.
339 339 *
340 340 * Some resources may still be shared by cores in the same package. For
341 341 * example, many processors will share the level 3 cache between cores.
342 342 * Some AMD generations share hardware resources between cores. For more
343 343 * information on that see the section 'AMD Topology'.
344 344 *
345 345 * THREAD and STRAND
346 346 *
 347  347   * In this file, generally a thread refers to a hardware resource and not
348 348 * the operating system's logical abstraction. A thread is always exposed
349 349 * as an independent logical CPU to the operating system. A thread belongs
350 350 * to a specific core. A core may have more than one thread. When that is
351 351 * the case, the threads that are part of the same core are often referred
352 352 * to as 'siblings'.
353 353 *
354 354 * When multiple threads exist, this is generally referred to as
355 355 * simultaneous multi-threading (SMT). When Intel introduced this in their
356 356 * processors they called it hyper-threading (HT). When multiple threads
357 357 * are active in a core, they split the resources of the core. For example,
358 358 * two threads may share the same set of hardware execution units.
359 359 *
360 360 * The operating system often uses the term 'strand' to refer to a thread.
361 361 * This helps disambiguate it from the software concept.
362 362 *
363 363 * CHIP
364 364 *
365 365 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
366 366 * base meaning, it is used to refer to a single integrated circuit, which
367 367 * may or may not be the only thing in the package. In illumos, when you
368 368 * see the term 'chip' it is almost always referring to the same thing as
369 369 * the 'package'. However, many vendors may use chip to refer to one of
370 370 * many integrated circuits that have been placed in the package. As an
371 371 * example, see the subsequent definition.
372 372 *
373 373 * To try and keep things consistent, we will only use chip when referring
374 374 * to the entire integrated circuit package, with the exception of the
375 375 * definition of multi-chip module (because it is in the name) and use the
376 376 * term 'die' when we want the more general, potential sub-component
377 377 * definition.
378 378 *
379 379 * DIE
380 380 *
381 381 * A die refers to an integrated circuit. Inside of the package there may
382 382 * be a single die or multiple dies. This is sometimes called a 'chip' in
383 383 * vendor's parlance, but in this file, we use the term die to refer to a
384 384 * subcomponent.
385 385 *
386 386 * MULTI-CHIP MODULE
387 387 *
388 388 * A multi-chip module (MCM) refers to putting multiple distinct chips that
389 389 * are connected together in the same package. When a multi-chip design is
390 390 * used, generally each chip is manufactured independently and then joined
391 391 * together in the package. For example, on AMD's Zen microarchitecture
392 392 * (family 0x17), the package contains several dies (the second meaning of
393 393 * chip from above) that are connected together.
394 394 *
395 395 * CACHE
396 396 *
397 397 * A cache is a part of the processor that maintains copies of recently
398 398 * accessed memory. Caches are split into levels and then into types.
399 399 * Commonly there are one to three levels, called level one, two, and
400 400 * three. The lower the level, the smaller it is, the closer it is to the
401 401 * execution units of the CPU, and the faster it is to access. The layout
402 402 * and design of the cache come in many different flavors, consult other
403 403 * resources for a discussion of those.
404 404 *
405 405 * Caches are generally split into two types, the instruction and data
406 406 * cache. The caches contain what their names suggest, the instruction
407 407 * cache has executable program text, while the data cache has all other
408 408 * memory that the processor accesses. As of this writing, data is kept
409 409 * coherent between all of the caches on x86, so if one modifies program
410 410 * text before it is executed, that will be in the data cache, and the
411 411 * instruction cache will be synchronized with that change when the
412 412 * processor actually executes those instructions. This coherency also
413 413 * covers the fact that data could show up in multiple caches.
414 414 *
415 415 * Generally, the lowest level caches are specific to a core. However, the
 416  416   * last level cache is shared between some number of cores. The number of
417 417 * CPUs sharing this last level cache is important. This has implications
418 418 * for the choices that the scheduler makes, as accessing memory that might
419 419 * be in a remote cache after thread migration can be quite expensive.
420 420 *
421 421 * Sometimes, the word cache is abbreviated with a '$', because in US
422 422 * English the word cache is pronounced the same as cash. So L1D$ refers to
423 423 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
424 424 * in the rest of this theory statement for clarity.
425 425 *
426 426 * MEMORY CONTROLLER
427 427 *
428 428 * The memory controller is a component that provides access to DRAM. Each
429 429 * memory controller can access a set number of DRAM channels. Each channel
430 430 * can have a number of DIMMs (sticks of memory) associated with it. A
431 431 * given package may have more than one memory controller. The association
432 432 * of the memory controller to a group of cores is important as it is
433 433 * cheaper to access memory on the controller that you are associated with.
434 434 *
435 435 * NUMA
436 436 *
437 437 * NUMA or non-uniform memory access, describes a way that systems are
438 438 * built. On x86, any processor core can address all of the memory in the
 439  439   * system. However, when using multiple sockets or possibly within a
440 440 * multi-chip module, some of that memory is physically closer and some of
441 441 * it is further. Memory that is further away is more expensive to access.
442 442 * Consider the following image of multiple sockets with memory:
443 443 *
444 444 * +--------+ +--------+
445 445 * | DIMM A | +----------+ +----------+ | DIMM D |
446 446 * +--------+-+ | | | | +-+------+-+
447 447 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
448 448 * +--------+-+ | | | | +-+------+-+
449 449 * | DIMM C | +----------+ +----------+ | DIMM F |
450 450 * +--------+ +--------+
451 451 *
452 452 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
453 453 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
454 454 * access DIMMs A-C and more expensive to access D-F as it has to go
455 455 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
456 456 * D-F are cheaper than A-C. While the socket form is the most common, when
457 457 * using multi-chip modules, this can also sometimes occur. For another
458 458 * example of this that's more involved, see the AMD topology section.
459 459 *
460 460 *
461 461 * Intel Topology
462 462 * --------------
463 463 *
 464  464   * Most Intel processors since Nehalem (as of this writing the current gen
 465  465   * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
466 466 * the package is a single monolithic die. MCMs currently aren't used. Most
467 467 * parts have three levels of caches, with the L3 cache being shared between
468 468 * all of the cores on the package. The L1/L2 cache is generally specific to
469 469 * an individual core. The following image shows at a simplified level what
470 470 * this looks like. The memory controller is commonly part of something called
 471  471   * the 'Uncore', which used to consist of separate physical chips that were not
 472  472   * part of the package, but are now part of the same chip.
473 473 *
474 474 * +-----------------------------------------------------------------------+
475 475 * | Package |
476 476 * | +-------------------+ +-------------------+ +-------------------+ |
477 477 * | | Core | | Core | | Core | |
478 478 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
479 479 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
480 480 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
481 481 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
482 482 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
483 483 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
484 484 * | | +--------------+ | | +--------------+ | | +--------------+ | |
485 485 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
486 486 * | | +--------------+ | | +--------------+ | | +--------------+ | |
487 487 * | +-------------------+ +-------------------+ +-------------------+ |
488 488 * | +-------------------------------------------------------------------+ |
489 489 * | | Shared L3 Cache | |
490 490 * | +-------------------------------------------------------------------+ |
491 491 * | +-------------------------------------------------------------------+ |
492 492 * | | Memory Controller | |
493 493 * | +-------------------------------------------------------------------+ |
494 494 * +-----------------------------------------------------------------------+
495 495 *
496 496 * A side effect of this current architecture is that what we care about from a
 497  497   * scheduling and topology perspective is simplified. In general we care about
498 498 * understanding which logical CPUs are part of the same core and socket.
499 499 *
500 500 * To determine the relationship between threads and cores, Intel initially used
501 501 * the identifier in the advanced programmable interrupt controller (APIC). They
502 502 * also added cpuid leaf 4 to give additional information about the number of
503 503 * threads and CPUs in the processor. With the addition of x2apic (which
504 504 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
505 505 * additional cpuid topology leaf 0xB was added.
506 506 *
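As an illustrative sketch (not the kernel's implementation), leaf 0xB sub-leaf 0 reports the current logical CPU's x2APIC ID in %edx and, in the low five bits of %eax, how far to shift that ID right to obtain the core-level identifier:

    #include <stdint.h>

    /* Derive the core-level ID from leaf 0xB, sub-leaf 0 register values. */
    static uint32_t
    x2apic_core_id(uint32_t eax, uint32_t edx)
    {
            uint32_t smt_shift = eax & 0x1f;    /* bits to shift past the SMT level */

            return (edx >> smt_shift);          /* %edx is the x2APIC ID */
    }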
507 507 * AMD Topology
508 508 * ------------
509 509 *
510 510 * When discussing AMD topology, we want to break this into three distinct
511 511 * generations of topology. There's the basic topology that has been used in
512 512 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
513 513 * with family 0x15 (Bulldozer), and there's the topology that was introduced
514 514 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
515 515 * talking about.
516 516 *
517 517 * Until the introduction of family 0x17 (Zen), AMD did not implement something
518 518 * that they considered SMT. Whether or not the AMD processors have SMT
519 519 * influences many things including scheduling and reliability, availability,
520 520 * and serviceability (RAS) features.
521 521 *
522 522 * NODE
523 523 *
524 524 * AMD uses the term node to refer to a die that contains a number of cores
525 525 * and I/O resources. Depending on the processor family and model, more
526 526 * than one node can be present in the package. When there is more than one
527 527 * node this indicates a multi-chip module. Usually each node has its own
528 528 * access to memory and I/O devices. This is important and generally
529 529 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
530 530 * result, we track this relationship in the operating system.
531 531 *
532 532 * In processors with an L3 cache, the L3 cache is generally shared across
533 533 * the entire node, though the way this is carved up varies from generation
534 534 * to generation.
535 535 *
536 536 * BULLDOZER
537 537 *
538 538 * Starting with the Bulldozer family (0x15) and continuing until the
539 539 * introduction of the Zen microarchitecture, AMD introduced the idea of a
540 540 * compute unit. In a compute unit, two traditional cores share a number of
541 541 * hardware resources. Critically, they share the FPU, L1 instruction
542 542 * cache, and the L2 cache. Several compute units were then combined inside
543 543 * of a single node. Because the integer execution units, L1 data cache,
544 544 * and some other resources were not shared between the cores, AMD never
545 545 * considered this to be SMT.
546 546 *
547 547 * ZEN
548 548 *
 549  549   * The Zen family (0x17) uses a multi-chip module (MCM) design; the module
550 550 * is called Zeppelin. These modules are similar to the idea of nodes used
551 551 * previously. Each of these nodes has two DRAM channels which all of the
552 552 * cores in the node can access uniformly. These nodes are linked together
553 553 * in the package, creating a NUMA environment.
554 554 *
555 555 * The Zeppelin die itself contains two different 'core complexes'. Each
556 556 * core complex consists of four cores which each have two threads, for a
557 557 * total of 8 logical CPUs per complex. Unlike other generations,
558 558 * where all the logical CPUs in a given node share the L3 cache, here each
559 559 * core complex has its own shared L3 cache.
560 560 *
561 561 * A further thing that we need to consider is that in some configurations,
562 562 * particularly with the Threadripper line of processors, not every die
563 563 * actually has its memory controllers wired up to actual memory channels.
564 564 * This means that some cores have memory attached to them and others
565 565 * don't.
566 566 *
567 567 * To put Zen in perspective, consider the following images:
568 568 *
569 569 * +--------------------------------------------------------+
570 570 * | Core Complex |
571 571 * | +-------------------+ +-------------------+ +---+ |
572 572 * | | Core +----+ | | Core +----+ | | | |
573 573 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
574 574 * | | | Thread | +----+ | | | Thread | +----+ | | | |
575 575 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
576 576 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
577 577 * | | +--------+ +--+ | | +--------+ +--+ | | | |
578 578 * | +-------------------+ +-------------------+ | C | |
579 579 * | +-------------------+ +-------------------+ | a | |
580 580 * | | Core +----+ | | Core +----+ | | c | |
581 581 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
582 582 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
583 583 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
584 584 * | | | Thread | |L1| | | | Thread | |L1| | | | |
585 585 * | | +--------+ +--+ | | +--------+ +--+ | | | |
586 586 * | +-------------------+ +-------------------+ +---+ |
587 587 * | |
588 588 * +--------------------------------------------------------+
589 589 *
590 590 * This first image represents a single Zen core complex that consists of four
591 591 * cores.
592 592 *
593 593 *
594 594 * +--------------------------------------------------------+
595 595 * | Zeppelin Die |
596 596 * | +--------------------------------------------------+ |
597 597 * | | I/O Units (PCIe, SATA, USB, etc.) | |
598 598 * | +--------------------------------------------------+ |
599 599 * | HH |
600 600 * | +-----------+ HH +-----------+ |
601 601 * | | | HH | | |
602 602 * | | Core |==========| Core | |
603 603 * | | Complex |==========| Complex | |
604 604 * | | | HH | | |
605 605 * | +-----------+ HH +-----------+ |
606 606 * | HH |
607 607 * | +--------------------------------------------------+ |
608 608 * | | Memory Controller | |
609 609 * | +--------------------------------------------------+ |
610 610 * | |
611 611 * +--------------------------------------------------------+
612 612 *
 613  613   * This image represents a single Zeppelin Die. Note how both core complexes
 614  614   * are connected to the same memory controller and I/O units. While each core
615 615 * complex has its own L3 cache as seen in the first image, they both have
616 616 * uniform access to memory.
617 617 *
618 618 *
619 619 * PP PP
620 620 * PP PP
621 621 * +----------PP---------------------PP---------+
622 622 * | PP PP |
623 623 * | +-----------+ +-----------+ |
624 624 * | | | | | |
625 625 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
626 626 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
627 627 * | | | | | |
628 628 * | +-----------+ooo ...+-----------+ |
629 629 * | HH ooo ... HH |
630 630 * | HH oo.. HH |
631 631 * | HH ..oo HH |
632 632 * | HH ... ooo HH |
633 633 * | +-----------+... ooo+-----------+ |
634 634 * | | | | | |
635 635 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
636 636 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
637 637 * | | | | | |
638 638 * | +-----------+ +-----------+ |
639 639 * | PP PP |
640 640 * +----------PP---------------------PP---------+
641 641 * PP PP
642 642 * PP PP
643 643 *
644 644 * This image represents a single Zen package. In this example, it has four
645 645 * Zeppelin dies, though some configurations only have a single one. In this
646 646 * example, each die is directly connected to the next. Also, each die is
647 647 * represented as being connected to memory by the 'M' character and connected
648 648 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
649 649 * die is made up of two core complexes, we have multiple different NUMA
650 650 * domains that we care about for these systems.
651 651 *
652 652 * CPUID LEAVES
653 653 *
654 654 * There are a few different CPUID leaves that we can use to try and understand
655 655 * the actual state of the world. As part of the introduction of family 0xf, AMD
656 656 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
657 657 * processors that are in the system. Because families before Zen didn't have
658 658 * SMT, this was always the number of cores that were in the system. However, it
659 659 * should always be thought of as the number of logical threads to be consistent
660 660 * between generations. In addition we also get the size of the APIC ID that is
661 661 * used to represent the number of logical processors. This is important for
662 662 * deriving topology information.
663 663 *
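A hedged sketch of decoding the relevant %ecx fields of leaf 0x80000008 (illustrative only; the kernel has additional handling):

    #include <stdint.h>

    /* Extract the logical CPU count and APIC ID size from leaf 0x80000008 %ecx. */
    static void
    decode_80000008(uint32_t ecx, uint32_t *nthreads, uint32_t *apic_id_bits)
    {
            /* %ecx[7:0] is the number of logical processors minus one. */
            *nthreads = (ecx & 0xff) + 1;

            /*
             * %ecx[15:12] is the number of APIC ID bits used to number logical
             * processors in the package; zero means derive it from the count.
             */
            *apic_id_bits = (ecx >> 12) & 0xf;
    }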
664 664 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
665 665 * bit between Bulldozer and later families, but it is quite useful in
666 666 * determining the topology information. Because this information has changed
667 667 * across family generations, it's worth calling out what these mean
668 668 * explicitly. The registers have the following meanings:
669 669 *
670 670 * %eax The APIC ID. The entire register is defined to have a 32-bit
671 671 * APIC ID, even though on systems without x2apic support, it will
672 672 * be limited to 8 bits.
673 673 *
674 674 * %ebx On Bulldozer-era systems this contains information about the
675 675 * number of cores that are in a compute unit (cores that share
676 676 * resources). It also contains a per-package compute unit ID that
677 677 * identifies which compute unit the logical CPU is a part of.
678 678 *
679 679 * On Zen-era systems this instead contains the number of threads
680 680 * per core and the ID of the core that the logical CPU is a part
681 681 * of. Note, this ID is unique only to the package, it is not
682 682 * globally unique across the entire system.
683 683 *
684 684 * %ecx This contains the number of nodes that exist in the package. It
685 685 * also contains an ID that identifies which node the logical CPU
686 686 * is a part of.
687 687 *
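To make the Zen-era layout concrete, here is an illustrative decoding of these registers (field positions per the Zen definition above; Bulldozer-era parts lay out %ebx differently, and this is not the kernel's own code):

    #include <stdint.h>

    typedef struct {
            uint32_t apic_id;            /* %eax: full 32-bit APIC ID */
            uint32_t threads_per_core;   /* %ebx[15:8] holds threads - 1 */
            uint32_t core_id;            /* %ebx[7:0], unique within the package */
            uint32_t nodes_per_pkg;      /* %ecx[10:8] holds nodes - 1 */
            uint32_t node_id;            /* %ecx[7:0] */
    } amd_topo_t;

    static amd_topo_t
    decode_8000001e(uint32_t eax, uint32_t ebx, uint32_t ecx)
    {
            amd_topo_t t;

            t.apic_id = eax;
            t.threads_per_core = ((ebx >> 8) & 0xff) + 1;
            t.core_id = ebx & 0xff;
            t.nodes_per_pkg = ((ecx >> 8) & 0x7) + 1;
            t.node_id = ecx & 0xff;
            return (t);
    }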
688 688 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
689 689 * cache layout to determine which logical CPUs are sharing which caches.
690 690 *
691 691 * illumos Topology
692 692 * ----------------
693 693 *
694 694 * Based on the above we synthesize the information into several different
695 695 * variables that we store in the 'struct cpuid_info'. We'll go into the details
696 696 * of what each member is supposed to represent and their uniqueness. In
697 697 * general, there are two levels of uniqueness that we care about. We care about
698 698 * an ID that is globally unique. That means that it will be unique across all
699 699 * entities in the system. For example, the default logical CPU ID is globally
700 700 * unique. On the other hand, there is some information that we only care about
701 701 * being unique within the context of a single package / socket. Here are the
702 702 * variables that we keep track of and their meaning.
703 703 *
 704  704   * Several of the values that represent an identifier, with the exception
705 705 * of cpi_apicid, are allowed to be synthetic.
706 706 *
707 707 *
708 708 * cpi_apicid
709 709 *
710 710 * This is the value of the CPU's APIC id. This should be the full 32-bit
711 711 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
712 712 * APIC ID. This value is globally unique between all logical CPUs across
713 713 * all packages. This is usually required by the APIC.
714 714 *
715 715 * cpi_chipid
716 716 *
717 717 * This value indicates the ID of the package that the logical CPU is a
718 718 * part of. This value is allowed to be synthetic. It is usually derived by
719 719 * taking the CPU's APIC ID and determining how many bits are used to
720 720 * represent CPU cores in the package. All logical CPUs that are part of
721 721 * the same package must have the same value.
722 722 *
723 723 * cpi_coreid
724 724 *
725 725 * This represents the ID of a CPU core. Two logical CPUs should only have
726 726 * the same cpi_coreid value if they are part of the same core. These
727 727 * values may be synthetic. On systems that support SMT, this value is
728 728 * usually derived from the APIC ID, otherwise it is often synthetic and
729 729 * just set to the value of the cpu_id in the cpu_t.
730 730 *
731 731 * cpi_pkgcoreid
732 732 *
733 733 * This is similar to the cpi_coreid in that logical CPUs that are part of
734 734 * the same core should have the same ID. The main difference is that these
735 735 * values are only required to be unique to a given socket.
736 736 *
737 737 * cpi_clogid
738 738 *
739 739 * This represents the logical ID of a logical CPU. This value should be
740 740 * unique within a given socket for each logical CPU. This is allowed to be
 741  741   * synthetic, though it is usually based off of the CPU's apic ID. The
 742  742   * broader system expects that logical CPUs that are part of the same
 743  743   * core have contiguous numbers. For example, if there were two threads per
 744  744   * core, then the IDs of two sibling CPUs divided by two should be equal,
 745  745   * with one ID even and the other odd. Thus, IDs 4 and 5
 746  746   * indicate two logical CPUs that are part of the same core. But IDs 5 and
 747  747   * 6 represent two logical CPUs that are part of different cores.
748 748 *
749 749 * While it is common for the cpi_coreid and the cpi_clogid to be derived
750 750 * from the same source, strictly speaking, they don't have to be and the
751 751 * two values should be considered logically independent. One should not
752 752 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
753 753 * some kind of relationship. While this is tempting, we've seen cases on
754 754 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
755 755 *
756 756 * cpi_ncpu_per_chip
757 757 *
758 758 * This value indicates the total number of logical CPUs that exist in the
759 759 * physical package. Critically, this is not the number of logical CPUs
760 760 * that exist for just the single core.
761 761 *
762 762 * This value should be the same for all logical CPUs in the same package.
763 763 *
764 764 * cpi_ncore_per_chip
765 765 *
766 766 * This value indicates the total number of physical CPU cores that exist
767 767 * in the package. The system compares this value with cpi_ncpu_per_chip to
768 768 * determine if simultaneous multi-threading (SMT) is enabled. When
769 769 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
770 770 * the X86FSET_HTT feature is not set. If this value is greater than one,
 771  771   * then we consider the processor to have the feature X86FSET_CMP, to
772 772 * indicate that there is support for more than one core.
773 773 *
774 774 * This value should be the same for all logical CPUs in the same package.
775 775 *
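A simplified sketch of the relationship just described (the real logic in this file has many vendor- and leaf-specific cases):

    #include <stdint.h>
    #include <stdbool.h>

    /* More logical CPUs than cores in a package implies SMT (X86FSET_HTT). */
    static bool
    has_smt(uint32_t ncpu_per_chip, uint32_t ncore_per_chip)
    {
            return (ncpu_per_chip > ncore_per_chip);
    }

    /* More than one core in a package implies X86FSET_CMP. */
    static bool
    is_multicore(uint32_t ncore_per_chip)
    {
            return (ncore_per_chip > 1);
    }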
776 776 * cpi_procnodes_per_pkg
777 777 *
778 778 * This value indicates the number of 'nodes' that exist in the package.
779 779 * When processors are actually a multi-chip module, this represents the
780 780 * number of such modules that exist in the package. Currently, on Intel
781 781 * based systems this member is always set to 1.
782 782 *
783 783 * This value should be the same for all logical CPUs in the same package.
784 784 *
785 785 * cpi_procnodeid
786 786 *
787 787 * This value indicates the ID of the node that the logical CPU is a part
788 788 * of. All logical CPUs that are in the same node must have the same value
789 789 * here. This value must be unique across all of the packages in the
790 790 * system. On Intel based systems, this is currently set to the value in
791 791 * cpi_chipid because there is only one node.
792 792 *
793 793 * cpi_cores_per_compunit
794 794 *
795 795 * This value indicates the number of cores that are part of a compute
796 796 * unit. See the AMD topology section for this. This member only has real
797 797 * meaning currently for AMD Bulldozer family processors. For all other
798 798 * processors, this should currently be set to 1.
799 799 *
800 800 * cpi_compunitid
801 801 *
802 802 * This indicates the compute unit that the logical CPU belongs to. For
803 803 * processors without AMD Bulldozer-style compute units this should be set
804 804 * to the value of cpi_coreid.
805 805 *
806 806 * cpi_ncpu_shr_last_cache
807 807 *
808 808 * This indicates the number of logical CPUs that are sharing the same last
809 809 * level cache. This value should be the same for all CPUs that are sharing
810 810 * that cache. The last cache refers to the cache that is closest to memory
811 811 * and furthest away from the CPU.
812 812 *
813 813 * cpi_last_lvl_cacheid
814 814 *
815 815 * This indicates the ID of the last cache that the logical CPU uses. This
816 816 * cache is often shared between multiple logical CPUs and is the cache
817 817 * that is closest to memory and furthest away from the CPU. This value
818 818 * should be the same for a group of logical CPUs only if they actually
819 819 * share the same last level cache. IDs should not overlap between
820 820 * packages.
821 821 *
822 822 * cpi_ncore_bits
823 823 *
824 824 * This indicates the number of bits that are required to represent all of
825 825 * the cores in the system. As cores are derived based on their APIC IDs,
826 826 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
827 827 * this value to be larger than the actual number of IDs that are present
828 828 * in the system. This is used to size tables by the CMI framework. It is
829 829 * only filled in for Intel and AMD CPUs.
830 830 *
831 831 * cpi_nthread_bits
832 832 *
833 833 * This indicates the number of bits required to represent all of the IDs
834 834 * that cover the logical CPUs that exist on a given core. It's OK for this
835 835 * value to be larger than the actual number of IDs that are present in the
836 836 * system. This is used to size tables by the CMI framework. It is
837 837 * only filled in for Intel and AMD CPUs.
838 838 *
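As a hypothetical sketch of how these bit widths relate to the IDs above (not the exact derivation the kernel performs), an APIC ID can be split into package, core, and thread portions like so:

    #include <stdint.h>

    /* Split an APIC ID into package-, core-, and thread-level identifiers. */
    static void
    decompose_apicid(uint32_t apicid, uint32_t ncore_bits, uint32_t nthread_bits,
        uint32_t *chipid, uint32_t *pkgcoreid, uint32_t *clogid)
    {
            *chipid = apicid >> (ncore_bits + nthread_bits);
            *pkgcoreid = (apicid >> nthread_bits) & ((1U << ncore_bits) - 1);
            *clogid = apicid & ((1U << (ncore_bits + nthread_bits)) - 1);
    }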
839 839 * -----------
840 840 * Hypervisors
841 841 * -----------
842 842 *
843 843 * If trying to manage the differences between vendors wasn't bad enough, it can
844 844 * get worse thanks to our friend hardware virtualization. Hypervisors are given
845 845 * the ability to interpose on all cpuid instructions and change them to suit
846 846 * their purposes. In general, this is necessary as the hypervisor wants to be
847 847 * able to present a more uniform set of features or not necessarily give the
848 848 * guest operating system kernel knowledge of all features so it can be
849 849 * more easily migrated between systems.
850 850 *
851 851 * When it comes to trying to determine topology information, this can be a
852 852 * double edged sword. When a hypervisor doesn't actually implement a cpuid
 853  853   * leaf, it'll often return all zeros. Because of that, you'll often see various
 854  854   * checks scattered about that verify fields are non-zero before we assume we can
 855  855   * use them.
856 856 *
857 857 * When it comes to topology information, the hypervisor is often incentivized
858 858 * to lie to you about topology. This is because it doesn't always actually
859 859 * guarantee that topology at all. The topology path we take in the system
860 860 * depends on how the CPU advertises itself. If it advertises itself as an Intel
861 861 * or AMD CPU, then we basically do our normal path. However, when they don't
862 862 * use an actual vendor, then that usually turns into multiple one-core CPUs
863 863 * that we enumerate that are often on different sockets. The actual behavior
864 864 * depends greatly on what the hypervisor actually exposes to us.
865 865 *
866 866 * --------------------
867 867 * Exposing Information
868 868 * --------------------
869 869 *
870 870 * We expose CPUID information in three different forms in the system.
871 871 *
872 872 * The first is through the x86_featureset variable. This is used in conjunction
873 873 * with the is_x86_feature() function. This is queried by x86-specific functions
874 874 * to determine which features are or aren't present in the system and to make
875 875 * decisions based upon them. For example, users of this include everything from
876 876 * parts of the system dedicated to reliability, availability, and
877 877 * serviceability (RAS), to making decisions about how to handle security
878 878 * mitigations, to various x86-specific drivers. General purpose or
879 879 * architecture independent drivers should never be calling this function.
880 880 *
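A sketch of the typical consumer pattern, assuming kernel context and using X86FSET_AVX purely as an example flag:

    #include <sys/x86_archext.h>

    void
    example_avx_setup(void)
    {
            /* Only perform AVX-specific work if the boot CPU advertised it. */
            if (is_x86_feature(x86_featureset, X86FSET_AVX)) {
                    /* AVX-dependent setup would go here. */
            }
    }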
881 881 * The second means is through the auxiliary vector. The auxiliary vector is a
882 882 * series of tagged data that the kernel passes down to a user program when it
883 883 * begins executing. This information is used to indicate to programs what
884 884 * instruction set extensions are present. For example, information about the
885 885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 886 * since user programs cannot make use of it. However, things like the AVX
887 887 * instruction sets are. Programs use this information to make run-time
888 888 * decisions about what features they should use. As an example, the run-time
889 889 * link-editor (rtld) can relocate different functions depending on the hardware
890 890 * support available.
891 891 *
892 892 * The final form is through a series of accessor functions that all have the
893 893 * form cpuid_get*. This is used by a number of different subsystems in the
894 894 * kernel to determine more detailed information about what we're running on,
895 895 * topology information, etc. Some of these subsystems include processor groups
 896  896   * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 897 * microcode, and performance monitoring. These functions all ASSERT that the
898 898 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 899 * are rearranged, then this needs to be adjusted.
900 900 *
901 901 * -----------------------------------------------
902 902 * Speculative Execution CPU Side Channel Security
903 903 * -----------------------------------------------
904 904 *
905 905 * With the advent of the Spectre and Meltdown attacks which exploit speculative
 906  906   * execution in the CPU to create side channels, there have been a number of
907 907 * different attacks and corresponding issues that the operating system needs to
 908  908   * mitigate against. The following is a common, but not exhaustive, list of
 909  909   * issues that we know about and for which we have done some work, or need to do
 910  910   * more work, in the system to mitigate:
911 911 *
912 912 * - Spectre v1
913 913 * - swapgs (Spectre v1 variant)
914 914 * - Spectre v2
915 915 * - Meltdown (Spectre v3)
916 916 * - Rogue Register Read (Spectre v3a)
917 917 * - Speculative Store Bypass (Spectre v4)
918 918 * - ret2spec, SpectreRSB
919 919 * - L1 Terminal Fault (L1TF)
920 920 * - Microarchitectural Data Sampling (MDS)
921 921 *
922 922 * Each of these requires different sets of mitigations and has different attack
923 923 * surfaces. For the most part, this discussion is about protecting the kernel
924 924 * from non-kernel executing environments such as user processes and hardware
925 925 * virtual machines. Unfortunately, there are a number of user vs. user
926 926 * scenarios that exist with these. The rest of this section will describe the
927 927 * overall approach that the system has taken to address these as well as their
928 928 * shortcomings. Unfortunately, not all of the above have been handled today.
929 929 *
930 930 * SPECTRE v2, ret2spec, SpectreRSB
931 931 *
932 932 * The second variant of the spectre attack focuses on performing branch target
933 933 * injection. This generally impacts indirect call instructions in the system.
934 934 * There are three different ways to mitigate this issue that are commonly
935 935 * described today:
936 936 *
937 937 * 1. Using Indirect Branch Restricted Speculation (IBRS).
938 938 * 2. Using Retpolines and RSB Stuffing
939 939 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
940 940 *
941 941 * IBRS uses a feature added to microcode to restrict speculation, among other
942 942 * things. This form of mitigation has not been used as it has been generally
943 943 * seen as too expensive and requires reactivation upon various transitions in
944 944 * the system.
945 945 *
946 946 * As a less impactful alternative to IBRS, retpolines were developed by
947 947 * Google. These basically require one to replace indirect calls with a specific
948 948 * trampoline that will cause speculation to fail and break the attack.
949 949 * Retpolines require compiler support. We always build with retpolines in the
950 950 * external thunk mode. This means that a traditional indirect call is replaced
951 951 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
952 952 * of this is that all indirect function calls are performed through a register.
953 953 *
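To illustrate what external-thunk mode means in practice, consider the following sketch (the compiler flag mentioned, e.g. gcc's -mindirect-branch=thunk-extern, and the function names are examples, not a description of the build system):

    /* A made-up indirect call to show the transformation; not kernel code. */
    typedef void (*handler_t)(void);

    void
    dispatch(handler_t h)
    {
            /*
             * Without retpolines this compiles to an indirect call such as:
             *         call    *%rdi
             * Built with an external-thunk retpoline flag (e.g. gcc's
             * -mindirect-branch=thunk-extern), the compiler instead emits:
             *         call    __x86_indirect_thunk_rdi
             * so every indirect call goes through a register and a single,
             * patchable thunk.
             */
            h();
    }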
954 954 * We have to use a common external location of the thunk and not inline it into
 955  955   * the callsite so that we can have a single place to patch these functions.
956 956 * As it turns out, we actually have three different forms of retpolines that
957 957 * exist in the system:
958 958 *
959 959 * 1. A full retpoline
960 960 * 2. An AMD-specific optimized retpoline
961 961 * 3. A no-op version
962 962 *
963 963 * The first one is used in the general case. The second one is used if we can
964 964 * determine that we're on an AMD system and we can successfully toggle the
965 965 * lfence serializing MSR that exists on the platform. Basically with this
966 966 * present, an lfence is sufficient and we don't need to do anywhere near as
967 967 * complicated a dance to successfully use retpolines.
968 968 *
969 969 * The third form described above is the most curious. It turns out that the way
970 970 * that retpolines are implemented is that they rely on how speculation is
971 971 * performed on a 'ret' instruction. Intel has continued to optimize this
972 972 * process (which is partly why we need to have return stack buffer stuffing,
973 973 * but more on that in a bit) and in processors starting with Cascade Lake
974 974 * on the server side, it's dangerous to rely on retpolines. Instead, a new
975 975 * mechanism has been introduced called Enhanced IBRS (EIBRS).
976 976 *
977 977 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
978 978 * physical core. However, if this is the case, we don't want to use retpolines
979 979 * any more. Therefore if EIBRS is present, we end up turning each retpoline
980 980 * function (called a thunk) into a jmp instruction. This means that we're still
981 981 * paying the cost of an extra jump to the external thunk, but it gives us
982 982 * flexibility and the ability to have a single kernel image that works across a
983 983 * wide variety of systems and hardware features.
984 984 *
985 985 * Unfortunately, this alone is insufficient. First, Skylake systems have
 986  986   * additional speculation for the Return Stack Buffer (RSB), which is used to
 987  987   * predict returns from calls and which retpolines take advantage of. However,
988 988 * this problem is not just limited to Skylake and is actually more pernicious.
989 989 * The SpectreRSB paper introduces several more problems that can arise with
990 990 * dealing with this. The RSB can be poisoned just like the indirect branch
991 991 * predictor. This means that one needs to clear the RSB when transitioning
992 992 * between two different privilege domains. Some examples include:
993 993 *
994 994 * - Switching between two different user processes
995 995 * - Going between user land and the kernel
996 996 * - Returning to the kernel from a hardware virtual machine
997 997 *
998 998 * Mitigating this involves combining a couple of different things. The first is
999 999 * SMEP (supervisor mode execution protection) which was introduced in Ivy
1000 1000 * Bridge. When an RSB entry refers to a user address and we're executing in the
1001 1001 * kernel, speculation through it will be stopped when SMEP is enabled. This
1002 1002 * protects against a number of the different cases that we would normally be
1003 1003 * worried about such as when we enter the kernel from user land.
1004 1004 *
1005 1005   * To protect against additional manipulation of the RSB from other contexts
1006 1006 * such as a non-root VMX context attacking the kernel we first look to enhanced
1007 1007 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1008 1008 * need to do to protect the kernel at this time.
1009 1009 *
1010 1010 * On CPUs without EIBRS we need to manually overwrite the contents of the
1011 1011 * return stack buffer. We do this through the x86_rsb_stuff() function.
1012 1012 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1013 1013 * disabled when enhanced IBRS is present because Intel claims on such systems
1014 1014 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1015 1015 * to user attacks via the RSB.
1016 1016 *
1017 1017 * If SMEP is not present, then we would have to stuff the RSB every time we
1018 1018 * transitioned from user mode to the kernel, which isn't very practical right
1019 1019 * now.
1020 1020 *
1021 1021 * To fully mitigate user to user and vmx to vmx attacks from these classes of
1022 1022 * issues, we would also need to allow them to opt into performing an Indirect
1023 1023 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1024 1024 *
1025 1025 * By default, the system will enable RSB stuffing and the required variant of
1026 1026 * retpolines and store that information in the x86_spectrev2_mitigation value.
1027 1027 * This will be evaluated after a microcode update as well, though it is
1028 1028 * expected that microcode updates will not take away features. This may mean
1029 1029 * that a late loaded microcode may not end up in the optimal configuration
1030 1030 * (though this should be rare).
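 *
 * As a rough sketch only (not the actual selection code), the decision
 * described above amounts to something like the following, using the
 * x86_spectrev2_mitigation_t values defined later in this file. The
 * eibrs_present() and amd_lfence_is_serializing() helpers here are purely
 * illustrative stand-ins for the real detection logic:
 *
 *	x86_spectrev2_mitigation_t v2mit;
 *
 *	if (x86_disable_spectrev2 != 0)
 *		v2mit = X86_SPECTREV2_DISABLED;
 *	else if (eibrs_present())
 *		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
 *	else if (cpi->cpi_vendor == X86_VENDOR_AMD &&
 *	    amd_lfence_is_serializing())
 *		v2mit = X86_SPECTREV2_RETPOLINE_AMD;
 *	else
 *		v2mit = X86_SPECTREV2_RETPOLINE;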
1031 1031 *
1032 1032 * Currently we do not build kmdb with retpolines or perform any additional side
1033 1033 * channel security mitigations for it. One complication with kmdb is that it
1034 1034 * requires its own retpoline thunks and it would need to adjust itself based on
1035 1035 * what the kernel does. The threat model of kmdb is more limited and therefore
1036 1036 * it may make more sense to investigate using prediction barriers as the whole
1037 1037 * system is only executing a single instruction at a time while in kmdb.
1038 1038 *
1039 1039 * SPECTRE v1, v4
1040 1040 *
1041 1041 * The v1 and v4 variants of spectre are not currently mitigated in the
1042 1042 * system and require other classes of changes to occur in the code.
1043 1043 *
1044 1044 * SPECTRE v1 (SWAPGS VARIANT)
1045 1045 *
1046 1046 * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1047 1047 * can generally affect any branch-dependent code. The swapgs issue is one
1048 1048 * variant of this. If we are coming in from userspace, we can have code like
1049 1049 * this:
1050 1050 *
1051 1051 * cmpw $KCS_SEL, REGOFF_CS(%rsp)
1052 1052 * je 1f
1053 1053 * movq $0, REGOFF_SAVFP(%rsp)
1054 1054 * swapgs
1055 1055 * 1:
1056 1056 * movq %gs:CPU_THREAD, %rax
1057 1057 *
1058 1058 * If an attacker can cause a mis-speculation of the branch here, we could skip
1059 1059 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1060 1060 * load. If subsequent code can act as the usual Spectre cache gadget, this
1061 1061 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1062 1062 * any use of the %gs override.
1063 1063 *
1064 1064 * The other case is also an issue: if we're coming into a trap from kernel
1065 1065 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1066 1066 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1067 1067 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1068 1068 * case, and the fix is the same in both cases (an lfence at the branch target
1069 1069 * 1: in this example), we'll just do it unconditionally.
1070 1070 *
1071 1071 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1072 1072 * harder for user-space to actually set a useful %gsbase value: although it's
1073 1073 * not clear, it might still be feasible via lwp_setprivate(), so we
1074 1074 * mitigate anyway.
1075 1075 *
1076 1076 * MELTDOWN
1077 1077 *
1078 1078 * Meltdown, or spectre v3, allowed a user process to read any data in their
1079 1079 * address space regardless of whether or not the page tables in question
1080 1080 * allowed the user to have the ability to read them. The solution to meltdown
1081 1081 * is kernel page table isolation. In this world, there are two page tables that
1082 1082 * are used for a process, one in user land and one in the kernel. To implement
1083 1083 * this we use per-CPU page tables and switch between the user and kernel
1084 1084 * variants when entering and exiting the kernel. For more information about
1085 1085 * this process and how the trampolines work, please see the big theory
1086 1086 * statements and additional comments in:
1087 1087 *
1088 1088 * - uts/i86pc/ml/kpti_trampolines.s
1089 1089 * - uts/i86pc/vm/hat_i86.c
1090 1090 *
1091 1091 * While Meltdown only impacted Intel systems, and some Intel systems have
1092 1092 * Meltdown (also called Rogue Data Cache Load) fixed, we always have
1093 1093 * kernel page table isolation enabled. While this may at first seem weird, an
1094 1094 * important thing to remember is that you can't speculatively read an address
1095 1095 * if it's never in your page table at all. Having user processes without kernel
1096 1096 * pages present provides us with an important layer of defense in the kernel
1097 1097 * against any other side channel attacks that exist and have yet to be
1098 1098 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1099 1099 * default, no matter the x86 system.
1100 1100 *
1101 1101 * L1 TERMINAL FAULT
1102 1102 *
1103 1103 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1104 1104 * execution uses page table entries. Effectively, it is two different problems.
1105 1105 * The first is that it ignores the not present bit in the page table entries
1106 1106 * when performing speculative execution. This means that something can
1107 1107 * speculatively read the listed physical address if it's present in the L1
1108 1108 * cache under certain conditions (see Intel's documentation for the full set of
1109 1109 * conditions). Secondly, this can be used to bypass hardware virtualization
1110 1110 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1111 1111 * instructions.
1112 1112 *
1113 1113 * For the non-hardware virtualized case, this is relatively easy to deal with.
1114 1114 * We must make sure that all unmapped pages have an address of zero. This means
1115 1115 * that they could read the first 4k of physical memory; however, we never use
1116 1116 * that first page in the operating system and always skip putting it in our
1117 1117 * memory map, even if firmware tells us we can use it in our memory map. While
1118 1118 * other systems try to put extra metadata in the address and reserved bits,
1119 1119 * which led to this being problematic in those cases, we do not.
1120 1120 *
1121 1121 * For hardware virtual machines things are more complicated. Because they can
1122 1122 * construct their own page tables, it isn't hard for them to perform this
1123 1123 * attack against any physical address. The one wrinkle is that this physical
1124 1124 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1125 1125 * to flush the L1 data cache. We wrap this up in the function
1126 1126 * spec_uarch_flush(). This function is also used in the mitigation of
1127 1127 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1128 1128 * hypervisors such as KVM or bhyve are responsible for performing this before
1129 1129 * entering the guest.
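 *
 * For reference, the architectural interface for that flush (as defined by
 * Intel; the constant names used in the implementation may differ) is a
 * write of bit 0 of the IA32_FLUSH_CMD MSR (0x10b), roughly:
 *
 *	wrmsr(0x10b, 1ULL << 0);	(request an L1 data cache flush)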
1130 1130 *
1131 1131 * Because this attack takes place in the L1 cache, there's another wrinkle
1132 1132 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1133 1133 * designs. This means that when a thread enters a hardware virtualized context
1134 1134 * and flushes the L1 data cache, the other thread on the processor may then go
1135 1135 * ahead and put new data in it that can be potentially attacked. While one
1136 1136 * solution is to disable SMT on the system, another option that is available is
1137 1137 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1138 1138 * goes through and makes sure that if a HVM is being scheduled on one thread,
1139 1139 * then the thing on the other thread is from the same hardware virtual machine.
1140 1140 * If an interrupt comes in or the guest exits to the broader system, then the
1141 1141 * other SMT thread will be kicked out.
1142 1142 *
1143 1143 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1144 1144 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1145 1145 * perform L1TF related mitigations.
1146 1146 *
1147 1147 * MICROARCHITECTURAL DATA SAMPLING
1148 1148 *
1149 1149 * Microarchitectural data sampling (MDS) is a combination of four discrete
1150 1150 * but related vulnerabilities affecting various parts of the CPU's
1151 1151 * microarchitectural implementation around load, store, and fill buffers.
1152 1152 * Specifically it is made up of the following subcomponents:
1153 1153 *
1154 1154 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1155 1155 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1156 1156 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1157 1157 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1158 1158 *
1159 1159 * To begin addressing these, Intel has introduced another feature in microcode
1160 1160 * called MD_CLEAR. This overloads the verw instruction so that executing it in
1161 1161 * a particular way flushes the state of the affected buffers. The L1TF L1D
1162 1162 * flush mechanism is also updated when this microcode is present to flush
1163 1163 * this state.
1164 1164 *
1165 1165 * Primarily we need to flush this state whenever we transition from the kernel
1166 1166 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1167 1167 * little bit different. Here the structures are statically sized when a logical
1168 1168 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1169 1169 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1170 1170 * mwait, or another ACPI method. To perform these flushes, we call
1171 1171 * x86_md_clear() at all of these transition points.
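 *
 * With the MD_CLEAR microcode loaded, the flush itself is simply a verw
 * executed against any valid selector. A minimal sketch of the idea (not the
 * actual x86_md_clear() implementation) looks like:
 *
 *	uint16_t sel = KDS_SEL;		(any valid selector will do)
 *	__asm__ __volatile__("verw %0" : : "m" (sel));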
1172 1172 *
1173 1173 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1174 1174 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1175 1175 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1176 1176 * a no-op.
1177 1177 *
1178 1178 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1179 1179 * particular, everything we've discussed above is only valid for a single
1180 1180 * thread executing on a core. In the case where you have hyper-threading
1181 1181 * present, this attack can be performed between threads. The theoretical fix
1182 1182 * for this is to ensure that both threads are always in the same security
1183 1183 * domain. This means that they are executing in the same ring and mutually
1184 1184 * trust each other. Practically speaking, this would mean that a system call
1185 1185 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1186 1186 * Rather than implement this, we recommend that one disables hyper-threading
1187 1187 * through the use of psradm -aS.
1188 1188 *
1189 + * TSX ASYNCHRONOUS ABORT
1190 + *
1191 + * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1192 + * behaves like MDS, but leverages Intel's transactional instructions as another
1193 + * vector. Effectively, when a transaction hits one of these cases (unmapped
1194 + * page, various cache snoop activity, etc.) then the same data can be exposed
1195 + * as in the case of MDS. This means that you can attack your twin.
1196 + *
1197 + * Intel has described that there are two different ways that we can mitigate
1198 + * this problem on affected processors:
1199 + *
1200 + * 1) We can use the same techniques used to deal with MDS. Flushing the
1201 + * microarchitectural buffers and disabling hyperthreading will mitigate
1202 + * this in the same way.
1203 + *
1204 + * 2) Using microcode to disable TSX.
1205 + *
1206 + * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1207 + * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1208 + * That's OK as we're already doing all such mitigations. On the other hand,
1209 + * processors with MDS_NO are all supposed to receive microcode updates that
1210 + * enumerate support for disabling TSX. In general, we'd rather use this method
1211 + * when available as it doesn't require disabling hyperthreading to be
1212 + * effective. Currently we are basically relying on microcode for processors
1213 + * that enumerate MDS_NO.
1214 + *
1215 + * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1216 + * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1217 + * different powers. The first allows us to cause all transactions to
1218 + * immediately abort. The second gives us a means of disabling TSX completely,
1219 + * which includes removing it from cpuid. If we have support for this in
1220 + * microcode during the first cpuid pass, then we'll disable TSX completely such
1221 + * that user land never has a chance to observe the bit. However, if we are late
1222 + * loading the microcode, then we must use the functionality to cause
1223 + * transactions to automatically abort. This is necessary for user land's sake.
1224 + * Once a program sees a cpuid bit, it must not be taken away.
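 *
 * For reference, both of those controls live in the IA32_TSX_CTRL MSR
 * (0x122), whose presence is what ARCH_CAPABILITIES bit 7 advertises: bit 0
 * of IA32_TSX_CTRL forces RTM transactions to abort and bit 1 hides the
 * RTM/HLE cpuid bits. A hedged sketch (the constant names used in the
 * implementation may differ):
 *
 *	uint64_t caps = rdmsr(0x10a);	(IA32_ARCH_CAPABILITIES)
 *
 *	if ((caps & (1ULL << 7)) != 0)
 *		wrmsr(0x122, 0x3);	(abort transactions and hide cpuid)
 *
 * with a value of 0x1 instead used for the force-abort-only case that
 * applies after a late microcode load.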
1225 + *
1226 + * We track whether or not we should do this based on what cpuid pass we're in.
1227 + * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1228 + * 1 of the cpuid logic, then we can completely turn off TSX. Notably, this
1229 + * should happen twice: once in the normal cpuid_pass1() code and then a second
1230 + * time after we do the initial microcode update.
1231 + *
1232 + * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1233 + * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1234 + * unfortunate feature in a number of ways, and taking the opportunity to
1235 + * finally be able to turn it off is likely to be of benefit in the future.
1236 + *
1189 1237 * SUMMARY
1190 1238 *
1191 1239 * The following table attempts to summarize the mitigations for various issues
1192 1240 * and what's done in various places:
1193 1241 *
1194 1242 * - Spectre v1: Not currently mitigated
1195 1243 * - swapgs: lfences after swapgs paths
1196 1244 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1197 1245 * - Meltdown: Kernel Page Table Isolation
1198 1246 * - Spectre v3a: Updated CPU microcode
1199 1247 * - Spectre v4: Not currently mitigated
1200 1248 * - SpectreRSB: SMEP and RSB Stuffing
1201 1249 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1202 - * - MDS: x86_md_clear, requires microcode, disabling hyper threading
1250 + * - MDS: x86_md_clear, requires microcode, disabling SMT
1251 + * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1203 1252 *
1204 1253 * The following table indicates the x86 feature set bits that indicate that a
1205 1254 * given problem has been solved or a notable feature is present:
1206 1255 *
1207 1256 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1208 1257 * - MDS_NO: All forms of MDS
1258 + * - TAA_NO: TAA
1209 1259 */
1210 1260
1211 1261 #include <sys/types.h>
1212 1262 #include <sys/archsystm.h>
1213 1263 #include <sys/x86_archext.h>
1214 1264 #include <sys/kmem.h>
1215 1265 #include <sys/systm.h>
1216 1266 #include <sys/cmn_err.h>
1217 1267 #include <sys/sunddi.h>
1218 1268 #include <sys/sunndi.h>
1219 1269 #include <sys/cpuvar.h>
1220 1270 #include <sys/processor.h>
1221 1271 #include <sys/sysmacros.h>
1222 1272 #include <sys/pg.h>
1223 1273 #include <sys/fp.h>
1224 1274 #include <sys/controlregs.h>
1225 1275 #include <sys/bitmap.h>
1226 1276 #include <sys/auxv_386.h>
1227 1277 #include <sys/memnode.h>
1228 1278 #include <sys/pci_cfgspace.h>
1229 1279 #include <sys/comm_page.h>
1230 1280 #include <sys/mach_mmu.h>
1231 1281 #include <sys/ucode.h>
1232 1282 #include <sys/tsc.h>
1233 1283 #include <sys/kobj.h>
1234 1284 #include <sys/asm_misc.h>
1235 1285
1236 1286 #ifdef __xpv
1237 1287 #include <sys/hypervisor.h>
1238 1288 #else
1239 1289 #include <sys/ontrap.h>
1240 1290 #endif
1241 1291
1242 1292 uint_t x86_vendor = X86_VENDOR_IntelClone;
1243 1293 uint_t x86_type = X86_TYPE_OTHER;
1244 1294 uint_t x86_clflush_size = 0;
1245 1295
1246 1296 #if defined(__xpv)
1247 1297 int x86_use_pcid = 0;
1248 1298 int x86_use_invpcid = 0;
1249 1299 #else
1250 1300 int x86_use_pcid = -1;
1251 1301 int x86_use_invpcid = -1;
1252 1302 #endif
1253 1303
1254 1304 typedef enum {
1255 1305 X86_SPECTREV2_RETPOLINE,
1256 1306 X86_SPECTREV2_RETPOLINE_AMD,
1257 1307 X86_SPECTREV2_ENHANCED_IBRS,
1258 1308 X86_SPECTREV2_DISABLED
1259 1309 } x86_spectrev2_mitigation_t;
1260 1310
1261 1311 uint_t x86_disable_spectrev2 = 0;
1262 1312 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1263 1313 X86_SPECTREV2_RETPOLINE;
1264 1314
1315 +/*
1316 + * The mitigation status for TAA:
1317 + * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1318 + * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1319 + * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1320 + * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1321 + * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1322 + * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1323 + */
1324 +typedef enum {
1325 + X86_TAA_NOTHING,
1326 + X86_TAA_DISABLED,
1327 + X86_TAA_MD_CLEAR,
1328 + X86_TAA_TSX_FORCE_ABORT,
1329 + X86_TAA_TSX_DISABLE,
1330 + X86_TAA_HW_MITIGATED
1331 +} x86_taa_mitigation_t;
1332 +
1333 +uint_t x86_disable_taa = 0;
1334 +static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
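
/*
 * A hedged usage note: as with the other integer tunables in this file,
 * x86_disable_taa is presumably intended to be set at boot via /etc/system
 * (for example, "set x86_disable_taa = 1") by an administrator who wishes to
 * opt out of the TAA mitigation.
 */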
1335 +
1265 1336 uint_t pentiumpro_bug4046376;
1266 1337
1267 1338 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1268 1339
1269 1340 static char *x86_feature_names[NUM_X86_FEATURES] = {
1270 1341 "lgpg",
1271 1342 "tsc",
1272 1343 "msr",
1273 1344 "mtrr",
1274 1345 "pge",
1275 1346 "de",
1276 1347 "cmov",
1277 1348 "mmx",
1278 1349 "mca",
1279 1350 "pae",
1280 1351 "cv8",
1281 1352 "pat",
1282 1353 "sep",
1283 1354 "sse",
1284 1355 "sse2",
1285 1356 "htt",
1286 1357 "asysc",
1287 1358 "nx",
1288 1359 "sse3",
1289 1360 "cx16",
1290 1361 "cmp",
1291 1362 "tscp",
1292 1363 "mwait",
1293 1364 "sse4a",
1294 1365 "cpuid",
1295 1366 "ssse3",
1296 1367 "sse4_1",
1297 1368 "sse4_2",
1298 1369 "1gpg",
1299 1370 "clfsh",
1300 1371 "64",
1301 1372 "aes",
1302 1373 "pclmulqdq",
1303 1374 "xsave",
1304 1375 "avx",
1305 1376 "vmx",
1306 1377 "svm",
1307 1378 "topoext",
1308 1379 "f16c",
1309 1380 "rdrand",
1310 1381 "x2apic",
1311 1382 "avx2",
1312 1383 "bmi1",
1313 1384 "bmi2",
1314 1385 "fma",
1315 1386 "smep",
1316 1387 "smap",
1317 1388 "adx",
1318 1389 "rdseed",
1319 1390 "mpx",
1320 1391 "avx512f",
1321 1392 "avx512dq",
1322 1393 "avx512pf",
1323 1394 "avx512er",
1324 1395 "avx512cd",
1325 1396 "avx512bw",
1326 1397 "avx512vl",
1327 1398 "avx512fma",
1328 1399 "avx512vbmi",
1329 1400 "avx512_vpopcntdq",
1330 1401 "avx512_4vnniw",
1331 1402 "avx512_4fmaps",
1332 1403 "xsaveopt",
1333 1404 "xsavec",
1334 1405 "xsaves",
1335 1406 "sha",
1336 1407 "umip",
1337 1408 "pku",
1338 1409 "ospke",
1339 1410 "pcid",
1340 1411 "invpcid",
1341 1412 "ibrs",
1342 1413 "ibpb",
1343 1414 "stibp",
1344 1415 "ssbd",
1345 1416 "ssbd_virt",
1346 1417 "rdcl_no",
1347 1418 "ibrs_all",
1348 1419 "rsba",
1349 1420 "ssb_no",
1350 1421 "stibp_all",
1351 1422 "flush_cmd",
1352 1423 "l1d_vmentry_no",
1353 1424 "fsgsbase",
1354 1425 "clflushopt",
1355 1426 "clwb",
1356 1427 "monitorx",
1357 1428 "clzero",
1358 1429 "xop",
1359 1430 "fma4",
1360 1431 "tbm",
1361 1432 "avx512_vnni",
1362 1433 "amd_pcec",
1363 1434 "md_clear",
1364 1435 "mds_no",
1365 1436 "core_thermal",
1366 - "pkg_thermal"
1437 + "pkg_thermal",
1438 + "tsx_ctrl",
1439 + "taa_no"
1367 1440 };
1368 1441
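/*
 * Accessors for the feature set bitmaps above. A hedged usage sketch,
 * mirroring callers later in this file:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT))
 *		...only on parts with extended topology support...
 */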
1369 1442 boolean_t
1370 1443 is_x86_feature(void *featureset, uint_t feature)
1371 1444 {
1372 1445 ASSERT(feature < NUM_X86_FEATURES);
1373 1446 return (BT_TEST((ulong_t *)featureset, feature));
1374 1447 }
1375 1448
1376 1449 void
1377 1450 add_x86_feature(void *featureset, uint_t feature)
1378 1451 {
1379 1452 ASSERT(feature < NUM_X86_FEATURES);
1380 1453 BT_SET((ulong_t *)featureset, feature);
1381 1454 }
1382 1455
1383 1456 void
1384 1457 remove_x86_feature(void *featureset, uint_t feature)
1385 1458 {
1386 1459 ASSERT(feature < NUM_X86_FEATURES);
1387 1460 BT_CLEAR((ulong_t *)featureset, feature);
1388 1461 }
1389 1462
1390 1463 boolean_t
1391 1464 compare_x86_featureset(void *setA, void *setB)
1392 1465 {
1393 1466 /*
1394 1467 * We assume that the unused bits of the bitmap are always zero.
1395 1468 */
1396 1469 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1397 1470 return (B_TRUE);
1398 1471 } else {
1399 1472 return (B_FALSE);
1400 1473 }
1401 1474 }
1402 1475
1403 1476 void
1404 1477 print_x86_featureset(void *featureset)
1405 1478 {
1406 1479 uint_t i;
1407 1480
1408 1481 for (i = 0; i < NUM_X86_FEATURES; i++) {
1409 1482 if (is_x86_feature(featureset, i)) {
1410 1483 cmn_err(CE_CONT, "?x86_feature: %s\n",
1411 1484 x86_feature_names[i]);
1412 1485 }
1413 1486 }
1414 1487 }
1415 1488
1416 1489 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1417 1490 static size_t xsave_state_size = 0;
1418 1491 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1419 1492 boolean_t xsave_force_disable = B_FALSE;
1420 1493 extern int disable_smap;
1421 1494
1422 1495 /*
1423 1496 * This is set to platform type we are running on.
1424 1497 */
1425 1498 static int platform_type = -1;
1426 1499
1427 1500 #if !defined(__xpv)
1428 1501 /*
1429 1502 * Variable to patch if hypervisor platform detection needs to be
1430 1503 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1431 1504 */
1432 1505 int enable_platform_detection = 1;
1433 1506 #endif
1434 1507
1435 1508 /*
1436 1509 * monitor/mwait info.
1437 1510 *
1438 1511 * size_actual and buf_actual are the real address and size allocated to get
1439 1512 * proper mwait_buf alignment. buf_actual and size_actual should be passed
1440 1513 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use
1441 1514 * processor cache-line alignment, but this is not guaranteed in the future.
1442 1515 */
1443 1516 struct mwait_info {
1444 1517 size_t mon_min; /* min size to avoid missed wakeups */
1445 1518 size_t mon_max; /* size to avoid false wakeups */
1446 1519 size_t size_actual; /* size actually allocated */
1447 1520 void *buf_actual; /* memory actually allocated */
1448 1521 uint32_t support; /* processor support of monitor/mwait */
1449 1522 };
1450 1523
1451 1524 /*
1452 1525 * xsave/xrestor info.
1453 1526 *
1454 1527 * This structure contains HW feature bits and the size of the xsave save area.
1455 1528 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1456 1529 * (xsave_state) to describe the xsave layout. However, at runtime the
1457 1530 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1458 1531 * xsave_state structure simply represents the legacy layout of the beginning
1459 1532 * of the xsave area.
1460 1533 */
1461 1534 struct xsave_info {
1462 1535 uint32_t xsav_hw_features_low; /* Supported HW features */
1463 1536 uint32_t xsav_hw_features_high; /* Supported HW features */
1464 1537 size_t xsav_max_size; /* max size save area for HW features */
1465 1538 size_t ymm_size; /* AVX: size of ymm save area */
1466 1539 size_t ymm_offset; /* AVX: offset for ymm save area */
1467 1540 size_t bndregs_size; /* MPX: size of bndregs save area */
1468 1541 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1469 1542 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1470 1543 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1471 1544 size_t opmask_size; /* AVX512: size of opmask save */
1472 1545 size_t opmask_offset; /* AVX512: offset for opmask save */
1473 1546 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1474 1547 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1475 1548 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1476 1549 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1477 1550 };
1478 1551
1479 1552
1480 1553 /*
1481 1554 * These constants determine how many of the elements of the
1482 1555 * cpuid we cache in the cpuid_info data structure; the
1483 1556 * remaining elements are accessible via the cpuid instruction.
1484 1557 */
1485 1558
1486 1559 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1487 1560 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */
1488 1561
1489 1562 /*
1490 1563 * See the big theory statement for a more detailed explanation of what some of
1491 1564 * these members mean.
1492 1565 */
1493 1566 struct cpuid_info {
1494 1567 uint_t cpi_pass; /* last pass completed */
1495 1568 /*
1496 1569 * standard function information
1497 1570 */
1498 1571 uint_t cpi_maxeax; /* fn 0: %eax */
1499 1572 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1500 1573 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1501 1574
1502 1575 uint_t cpi_family; /* fn 1: extended family */
1503 1576 uint_t cpi_model; /* fn 1: extended model */
1504 1577 uint_t cpi_step; /* fn 1: stepping */
1505 1578 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1506 1579 /* AMD: package/socket # */
1507 1580 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1508 1581 int cpi_clogid; /* fn 1: %ebx: thread # */
1509 1582 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1510 1583 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1511 1584 uint_t cpi_ncache; /* fn 2: number of elements */
1512 1585 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1513 1586 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1514 1587 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1515 1588 /* Intel fn: 4, AMD fn: 8000001d */
1516 1589 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
1517 1590 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1518 1591 /*
1519 1592 * extended function information
1520 1593 */
1521 1594 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1522 1595 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1523 1596 uint8_t cpi_pabits; /* fn 0x80000006: %eax */
1524 1597 uint8_t cpi_vabits; /* fn 0x80000006: %eax */
1525 1598 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1526 1599 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1527 1600
1528 1601 id_t cpi_coreid; /* same coreid => strands share core */
1529 1602 int cpi_pkgcoreid; /* core number within single package */
1530 1603 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1531 1604 /* Intel: fn 4: %eax[31-26] */
1532 1605
1533 1606 /*
1534 1607 * These values represent the number of bits that are required to store
1535 1608 * information about the number of cores and threads.
1536 1609 */
1537 1610 uint_t cpi_ncore_bits;
1538 1611 uint_t cpi_nthread_bits;
1539 1612 /*
1540 1613 * supported feature information
1541 1614 */
1542 1615 uint32_t cpi_support[6];
1543 1616 #define STD_EDX_FEATURES 0
1544 1617 #define AMD_EDX_FEATURES 1
1545 1618 #define TM_EDX_FEATURES 2
1546 1619 #define STD_ECX_FEATURES 3
1547 1620 #define AMD_ECX_FEATURES 4
1548 1621 #define STD_EBX_FEATURES 5
1549 1622 /*
1550 1623 * Synthesized information, where known.
1551 1624 */
1552 1625 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1553 1626 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1554 1627 uint32_t cpi_socket; /* Chip package/socket type */
1555 1628
1556 1629 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1557 1630 uint32_t cpi_apicid;
1558 1631 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1559 1632 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1560 1633 /* Intel: 1 */
1561 1634 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1562 1635 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1563 1636
1564 1637 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */
1565 1638 };
1566 1639
1567 1640
1568 1641 static struct cpuid_info cpuid_info0;
1569 1642
1570 1643 /*
1571 1644 * These bit fields are defined by the Intel Application Note AP-485
1572 1645 * "Intel Processor Identification and the CPUID Instruction"
1573 1646 */
1574 1647 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1575 1648 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1576 1649 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1577 1650 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1578 1651 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1579 1652 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1580 1653
1581 1654 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1582 1655 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1583 1656 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1584 1657 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1585 1658 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1586 1659 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1587 1660 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1588 1661
1589 1662 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1590 1663 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1591 1664 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1592 1665 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1593 1666
1594 1667 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1595 1668 #define CPI_XMAXEAX_MAX 0x80000100
1596 1669 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1597 1670 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1598 1671
1599 1672 /*
1600 1673 * Function 4 (Deterministic Cache Parameters) macros
1601 1674 * Defined by Intel Application Note AP-485
1602 1675 */
1603 1676 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1604 1677 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1605 1678 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1606 1679 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1607 1680 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1608 1681 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1609 1682 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1610 1683
1611 1684 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1612 1685 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1613 1686 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1614 1687
1615 1688 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1616 1689
1617 1690 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
1618 1691
1619 1692
1620 1693 /*
1621 1694 * A couple of shorthand macros to identify "later" P6-family chips
1622 1695 * like the Pentium M and Core. First, the "older" P6-based stuff
1623 1696 * (loosely defined as "pre-Pentium-4"):
1624 1697 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1625 1698 */
1626 1699 #define IS_LEGACY_P6(cpi) ( \
1627 1700 cpi->cpi_family == 6 && \
1628 1701 (cpi->cpi_model == 1 || \
1629 1702 cpi->cpi_model == 3 || \
1630 1703 cpi->cpi_model == 5 || \
1631 1704 cpi->cpi_model == 6 || \
1632 1705 cpi->cpi_model == 7 || \
1633 1706 cpi->cpi_model == 8 || \
1634 1707 cpi->cpi_model == 0xA || \
1635 1708 cpi->cpi_model == 0xB) \
1636 1709 )
1637 1710
1638 1711 /* A "new F6" is everything with family 6 that's not the above */
1639 1712 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1640 1713
1641 1714 /* Extended family/model support */
1642 1715 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1643 1716 cpi->cpi_family >= 0xf)
1644 1717
1645 1718 /*
1646 1719 * Info for monitor/mwait idle loop.
1647 1720 *
1648 1721 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1649 1722 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1650 1723 * 2006.
1651 1724 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1652 1725 * Documentation Updates" #33633, Rev 2.05, December 2006.
1653 1726 */
1654 1727 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
1655 1728 #define MWAIT_EXTENSIONS (0x00000002) /* extension supported */
1656 1729 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
1657 1730 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1658 1731 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
1659 1732 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
1660 1733 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1661 1734 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1662 1735 /*
1663 1736 * Number of sub-cstates for a given c-state.
1664 1737 */
1665 1738 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
1666 1739 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1667 1740
1668 1741 /*
1669 1742 * XSAVE leaf 0xD enumeration
1670 1743 */
1671 1744 #define CPUID_LEAFD_2_YMM_OFFSET 576
1672 1745 #define CPUID_LEAFD_2_YMM_SIZE 256
1673 1746
1674 1747 /*
1675 1748 * Common extended leaf names to cut down on typos.
1676 1749 */
1677 1750 #define CPUID_LEAF_EXT_0 0x80000000
1678 1751 #define CPUID_LEAF_EXT_8 0x80000008
1679 1752 #define CPUID_LEAF_EXT_1d 0x8000001d
1680 1753 #define CPUID_LEAF_EXT_1e 0x8000001e
1681 1754
1682 1755 /*
1683 1756 * Functions we consume from cpuid_subr.c; don't publish these in a header
1684 1757 * file to try and keep people using the expected cpuid_* interfaces.
1685 1758 */
1686 1759 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1687 1760 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1688 1761 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1689 1762 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1690 1763 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1691 1764
1692 1765 /*
1693 1766 * Apply various platform-dependent restrictions where the
1694 1767 * underlying platform restrictions mean the CPU can be marked
1695 1768 * as less capable than its cpuid instruction would imply.
1696 1769 */
1697 1770 #if defined(__xpv)
1698 1771 static void
1699 1772 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1700 1773 {
1701 1774 switch (eax) {
1702 1775 case 1: {
1703 1776 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1704 1777 0 : CPUID_INTC_EDX_MCA;
1705 1778 cp->cp_edx &=
1706 1779 ~(mcamask |
1707 1780 CPUID_INTC_EDX_PSE |
1708 1781 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1709 1782 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1710 1783 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1711 1784 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1712 1785 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1713 1786 break;
1714 1787 }
1715 1788
1716 1789 case 0x80000001:
1717 1790 cp->cp_edx &=
1718 1791 ~(CPUID_AMD_EDX_PSE |
1719 1792 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1720 1793 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1721 1794 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1722 1795 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1723 1796 CPUID_AMD_EDX_TSCP);
1724 1797 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1725 1798 break;
1726 1799 default:
1727 1800 break;
1728 1801 }
1729 1802
1730 1803 switch (vendor) {
1731 1804 case X86_VENDOR_Intel:
1732 1805 switch (eax) {
1733 1806 case 4:
1734 1807 /*
1735 1808 * Zero out the (ncores-per-chip - 1) field
1736 1809 */
1737 1810 cp->cp_eax &= 0x03fffffff;
1738 1811 break;
1739 1812 default:
1740 1813 break;
1741 1814 }
1742 1815 break;
1743 1816 case X86_VENDOR_AMD:
1744 1817 switch (eax) {
1745 1818
1746 1819 case 0x80000001:
1747 1820 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1748 1821 break;
1749 1822
1750 1823 case CPUID_LEAF_EXT_8:
1751 1824 /*
1752 1825 * Zero out the (ncores-per-chip - 1) field
1753 1826 */
1754 1827 cp->cp_ecx &= 0xffffff00;
1755 1828 break;
1756 1829 default:
1757 1830 break;
1758 1831 }
1759 1832 break;
1760 1833 default:
1761 1834 break;
1762 1835 }
1763 1836 }
1764 1837 #else
1765 1838 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
1766 1839 #endif
1767 1840
1768 1841 /*
1769 1842 * Some undocumented ways of patching the results of the cpuid
1770 1843 * instruction to permit running Solaris 10 on future cpus that
1771 1844 * we don't currently support. Could be set to non-zero values
1772 1845 * via settings in eeprom.
1773 1846 */
1774 1847
1775 1848 uint32_t cpuid_feature_ecx_include;
1776 1849 uint32_t cpuid_feature_ecx_exclude;
1777 1850 uint32_t cpuid_feature_edx_include;
1778 1851 uint32_t cpuid_feature_edx_exclude;
1779 1852
1780 1853 /*
1781 1854 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1782 1855 */
1783 1856 void
1784 1857 cpuid_alloc_space(cpu_t *cpu)
1785 1858 {
1786 1859 /*
1787 1860 * By convention, cpu0 is the boot cpu, which is set up
1788 1861 * before memory allocation is available. All other cpus get
1789 1862 * their cpuid_info struct allocated here.
1790 1863 */
1791 1864 ASSERT(cpu->cpu_id != 0);
1792 1865 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1793 1866 cpu->cpu_m.mcpu_cpi =
1794 1867 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1795 1868 }
1796 1869
1797 1870 void
1798 1871 cpuid_free_space(cpu_t *cpu)
1799 1872 {
1800 1873 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1801 1874 int i;
1802 1875
1803 1876 ASSERT(cpi != NULL);
1804 1877 ASSERT(cpi != &cpuid_info0);
1805 1878
1806 1879 /*
1807 1880 * Free up any cache leaf related dynamic storage. The first entry was
1808 1881 * cached from the standard cpuid storage, so we should not free it.
1809 1882 */
1810 1883 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1811 1884 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1812 1885 if (cpi->cpi_cache_leaf_size > 0)
1813 1886 kmem_free(cpi->cpi_cache_leaves,
1814 1887 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1815 1888
1816 1889 kmem_free(cpi, sizeof (*cpi));
1817 1890 cpu->cpu_m.mcpu_cpi = NULL;
1818 1891 }
1819 1892
1820 1893 #if !defined(__xpv)
1821 1894 /*
1822 1895 * Determine the type of the underlying platform. This is used to customize
1823 1896 * initialization of various subsystems (e.g. TSC). determine_platform() must
1824 1897 * only ever be called once to prevent two processors from seeing different
1825 1898 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1826 1899 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1827 1900 */
1828 1901 void
1829 1902 determine_platform(void)
1830 1903 {
1831 1904 struct cpuid_regs cp;
1832 1905 uint32_t base;
1833 1906 uint32_t regs[4];
1834 1907 char *hvstr = (char *)regs;
1835 1908
1836 1909 ASSERT(platform_type == -1);
1837 1910
1838 1911 platform_type = HW_NATIVE;
1839 1912
1840 1913 if (!enable_platform_detection)
1841 1914 return;
1842 1915
1843 1916 /*
1844 1917 * If Hypervisor CPUID bit is set, try to determine hypervisor
1845 1918 * vendor signature, and set platform type accordingly.
1846 1919 *
1847 1920 * References:
1848 1921 * http://lkml.org/lkml/2008/10/1/246
1849 1922 * http://kb.vmware.com/kb/1009458
1850 1923 */
1851 1924 cp.cp_eax = 0x1;
1852 1925 (void) __cpuid_insn(&cp);
1853 1926 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1854 1927 cp.cp_eax = 0x40000000;
1855 1928 (void) __cpuid_insn(&cp);
1856 1929 regs[0] = cp.cp_ebx;
1857 1930 regs[1] = cp.cp_ecx;
1858 1931 regs[2] = cp.cp_edx;
1859 1932 regs[3] = 0;
1860 1933 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1861 1934 platform_type = HW_XEN_HVM;
1862 1935 return;
1863 1936 }
1864 1937 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1865 1938 platform_type = HW_VMWARE;
1866 1939 return;
1867 1940 }
1868 1941 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1869 1942 platform_type = HW_KVM;
1870 1943 return;
1871 1944 }
1872 1945 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1873 1946 platform_type = HW_BHYVE;
1874 1947 return;
1875 1948 }
1876 1949 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1877 1950 platform_type = HW_MICROSOFT;
1878 1951 } else {
1879 1952 /*
1880 1953 * Check older VMware hardware versions. VMware hypervisor is
1881 1954 * detected by performing an IN operation to VMware hypervisor
1882 1955 * port and checking that the value returned in %ebx is the VMware
1883 1956 * hypervisor magic value.
1884 1957 *
1885 1958 * References: http://kb.vmware.com/kb/1009458
1886 1959 */
1887 1960 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1888 1961 if (regs[1] == VMWARE_HVMAGIC) {
1889 1962 platform_type = HW_VMWARE;
1890 1963 return;
1891 1964 }
1892 1965 }
1893 1966
1894 1967 /*
1895 1968 * Check Xen hypervisor. In a fully virtualized domain,
1896 1969 * Xen's pseudo-cpuid function returns a string representing the
1897 1970 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1898 1971 * supported cpuid function. We need at least a (base + 2) leaf value
1899 1972 * to do what we want to do. Try different base values, since the
1900 1973 * hypervisor might use a different one depending on whether Hyper-V
1901 1974 * emulation is switched on by default or not.
1902 1975 */
1903 1976 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1904 1977 cp.cp_eax = base;
1905 1978 (void) __cpuid_insn(&cp);
1906 1979 regs[0] = cp.cp_ebx;
1907 1980 regs[1] = cp.cp_ecx;
1908 1981 regs[2] = cp.cp_edx;
1909 1982 regs[3] = 0;
1910 1983 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1911 1984 cp.cp_eax >= (base + 2)) {
1912 1985 platform_type &= ~HW_NATIVE;
1913 1986 platform_type |= HW_XEN_HVM;
1914 1987 return;
1915 1988 }
1916 1989 }
1917 1990 }
1918 1991
1919 1992 int
1920 1993 get_hwenv(void)
1921 1994 {
1922 1995 ASSERT(platform_type != -1);
1923 1996 return (platform_type);
1924 1997 }
1925 1998
1926 1999 int
1927 2000 is_controldom(void)
1928 2001 {
1929 2002 return (0);
1930 2003 }
1931 2004
1932 2005 #else
1933 2006
1934 2007 int
1935 2008 get_hwenv(void)
1936 2009 {
1937 2010 return (HW_XEN_PV);
1938 2011 }
1939 2012
1940 2013 int
1941 2014 is_controldom(void)
1942 2015 {
1943 2016 return (DOMAIN_IS_INITDOMAIN(xen_info));
1944 2017 }
1945 2018
1946 2019 #endif /* __xpv */
1947 2020
1948 2021 /*
1949 2022 * Make sure that we have gathered all of the CPUID leaves that we might need to
1950 2023 * determine topology. We assume that the standard leaf 1 has already been done
1951 2024 * and that xmaxeax has already been calculated.
1952 2025 */
1953 2026 static void
1954 2027 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1955 2028 {
1956 2029 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1957 2030
1958 2031 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1959 2032 struct cpuid_regs *cp;
1960 2033
1961 2034 cp = &cpi->cpi_extd[8];
1962 2035 cp->cp_eax = CPUID_LEAF_EXT_8;
1963 2036 (void) __cpuid_insn(cp);
1964 2037 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1965 2038 }
1966 2039
1967 2040 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1968 2041 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1969 2042 struct cpuid_regs *cp;
1970 2043
1971 2044 cp = &cpi->cpi_extd[0x1e];
1972 2045 cp->cp_eax = CPUID_LEAF_EXT_1e;
1973 2046 (void) __cpuid_insn(cp);
1974 2047 }
1975 2048 }
1976 2049
1977 2050 /*
1978 2051 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1979 2052 * it to everything else. If not, and we're on an AMD system where 8000001e is
1980 2053 * valid, then we use that. Otherwise, we fall back to the default value for the
1981 2054 * APIC ID in leaf 1.
1982 2055 */
1983 2056 static uint32_t
1984 2057 cpuid_gather_apicid(struct cpuid_info *cpi)
1985 2058 {
1986 2059 /*
1987 2060 * Leaf B changes based on the arguments to it. Because we don't cache
1988 2061 * it, we need to gather it again.
1989 2062 */
1990 2063 if (cpi->cpi_maxeax >= 0xB) {
1991 2064 struct cpuid_regs regs;
1992 2065 struct cpuid_regs *cp;
1993 2066
1994 2067 cp = &regs;
1995 2068 cp->cp_eax = 0xB;
1996 2069 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1997 2070 (void) __cpuid_insn(cp);
1998 2071
1999 2072 if (cp->cp_ebx != 0) {
2000 2073 return (cp->cp_edx);
2001 2074 }
2002 2075 }
2003 2076
2004 2077 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2005 2078 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2006 2079 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2007 2080 return (cpi->cpi_extd[0x1e].cp_eax);
2008 2081 }
2009 2082
2010 2083 return (CPI_APIC_ID(cpi));
2011 2084 }
2012 2085
2013 2086 /*
2014 2087 * For AMD processors, attempt to calculate the number of chips and cores that
2015 2088 * exist. The way that we do this varies based on the generation, because the
2016 2089 * generations themselves have changed dramatically.
2017 2090 *
2018 2091 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2019 2092 * However, with the advent of family 17h (Zen) it actually tells us the number
2020 2093 * of threads, so we need to look at leaf 0x8000001e if available to determine
2021 2094 * its value. Otherwise, for all prior families, the number of enabled cores is
2022 2095 * the same as threads.
2023 2096 *
2024 2097 * If we do not have leaf 0x80000008, then we assume that this processor does
2025 2098 * not have anything. AMD's older CPUID specification says there's no reason to
2026 2099 * fall back to leaf 1.
2027 2100 *
2028 2101 * In some virtualization cases we will not have leaf 8000001e or it will be
2029 2102 * zero. When that happens we assume the number of threads is one.
2030 2103 */
2031 2104 static void
2032 2105 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2033 2106 {
2034 2107 uint_t nthreads, nthread_per_core;
2035 2108
2036 2109 nthreads = nthread_per_core = 1;
2037 2110
2038 2111 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2039 2112 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2040 2113 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2041 2114 nthreads = CPI_CPU_COUNT(cpi);
2042 2115 }
2043 2116
2044 2117 /*
2045 2118 * For us to have threads, and know about it, we have to be at least at
2046 2119 * family 17h and have the cpuid bit that says we have extended
2047 2120 * topology.
2048 2121 */
2049 2122 if (cpi->cpi_family >= 0x17 &&
2050 2123 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2051 2124 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2052 2125 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2053 2126 }
2054 2127
2055 2128 *ncpus = nthreads;
2056 2129 *ncores = nthreads / nthread_per_core;
2057 2130 }
2058 2131
2059 2132 /*
2060 2133 * Seed the initial values for the cores and threads for an Intel based
2061 2134 * processor. These values will be overwritten if we detect that the processor
2062 2135 * supports CPUID leaf 0xb.
2063 2136 */
2064 2137 static void
2065 2138 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2066 2139 {
2067 2140 /*
2068 2141 * Only seed the number of physical cores from the first level leaf 4
2069 2142 * information. The number of threads there indicate how many share the
2070 2143 * information. The number of threads there indicates how many share the
2071 2144 * logical CPUs per core.
2072 2145 */
2073 2146 if (cpi->cpi_maxeax >= 4) {
2074 2147 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2075 2148 } else {
2076 2149 *ncores = 1;
2077 2150 }
2078 2151
2079 2152 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2080 2153 *ncpus = CPI_CPU_COUNT(cpi);
2081 2154 } else {
2082 2155 *ncpus = *ncores;
2083 2156 }
2084 2157 }
2085 2158
2086 2159 static boolean_t
2087 2160 cpuid_leafB_getids(cpu_t *cpu)
2088 2161 {
2089 2162 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2090 2163 struct cpuid_regs regs;
2091 2164 struct cpuid_regs *cp;
2092 2165
2093 2166 if (cpi->cpi_maxeax < 0xB)
2094 2167 return (B_FALSE);
2095 2168
2096 2169 cp = &regs;
2097 2170 cp->cp_eax = 0xB;
2098 2171 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2099 2172
2100 2173 (void) __cpuid_insn(cp);
2101 2174
2102 2175 /*
2103 2176 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2104 2177 * indicates that the extended topology enumeration leaf is
2105 2178 * available.
2106 2179 */
2107 2180 if (cp->cp_ebx != 0) {
2108 2181 uint32_t x2apic_id = 0;
2109 2182 uint_t coreid_shift = 0;
2110 2183 uint_t ncpu_per_core = 1;
2111 2184 uint_t chipid_shift = 0;
2112 2185 uint_t ncpu_per_chip = 1;
2113 2186 uint_t i;
2114 2187 uint_t level;
2115 2188
2116 2189 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2117 2190 cp->cp_eax = 0xB;
2118 2191 cp->cp_ecx = i;
2119 2192
2120 2193 (void) __cpuid_insn(cp);
2121 2194 level = CPI_CPU_LEVEL_TYPE(cp);
2122 2195
2123 2196 if (level == 1) {
2124 2197 x2apic_id = cp->cp_edx;
2125 2198 coreid_shift = BITX(cp->cp_eax, 4, 0);
2126 2199 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2127 2200 } else if (level == 2) {
2128 2201 x2apic_id = cp->cp_edx;
2129 2202 chipid_shift = BITX(cp->cp_eax, 4, 0);
2130 2203 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2131 2204 }
2132 2205 }
2133 2206
2134 2207 /*
2135 2208 * cpi_apicid is taken care of in cpuid_gather_apicid.
2136 2209 */
2137 2210 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2138 2211 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2139 2212 ncpu_per_core;
2140 2213 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2141 2214 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2142 2215 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2143 2216 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2144 2217 cpi->cpi_procnodeid = cpi->cpi_chipid;
2145 2218 cpi->cpi_compunitid = cpi->cpi_coreid;
2146 2219
2147 2220 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2148 2221 cpi->cpi_nthread_bits = coreid_shift;
2149 2222 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2150 2223 }
2151 2224
2152 2225 return (B_TRUE);
2153 2226 } else {
2154 2227 return (B_FALSE);
2155 2228 }
2156 2229 }
2157 2230
2158 2231 static void
2159 2232 cpuid_intel_getids(cpu_t *cpu, void *feature)
2160 2233 {
2161 2234 uint_t i;
2162 2235 uint_t chipid_shift = 0;
2163 2236 uint_t coreid_shift = 0;
2164 2237 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2165 2238
2166 2239 /*
2167 2240 * There are no compute units or processor nodes currently on Intel.
2168 2241 * Always set these to one.
2169 2242 */
2170 2243 cpi->cpi_procnodes_per_pkg = 1;
2171 2244 cpi->cpi_cores_per_compunit = 1;
2172 2245
2173 2246 /*
2174 2247 * If cpuid Leaf B is present, use that to try and get this information.
2175 2248 * It will be the most accurate for Intel CPUs.
2176 2249 */
2177 2250 if (cpuid_leafB_getids(cpu))
2178 2251 return;
2179 2252
2180 2253 /*
2181 2254 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2182 2255 * and ncore_per_chip. These represent the largest power of two values
2183 2256 * that we need to cover all of the IDs in the system. Therefore, we use
2184 2257 * those values to seed the number of bits needed to cover information
2185 2258 * in the case when leaf B is not available. These values will probably
2186 2259 * be larger than required, but that's OK.
2187 2260 */
2188 2261 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2189 2262 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2190 2263
2191 2264 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2192 2265 chipid_shift++;
2193 2266
2194 2267 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2195 2268 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2196 2269
2197 2270 if (is_x86_feature(feature, X86FSET_CMP)) {
2198 2271 /*
2199 2272 * Multi-core (and possibly multi-threaded)
2200 2273 * processors.
2201 2274 */
2202 2275 uint_t ncpu_per_core;
2203 2276 if (cpi->cpi_ncore_per_chip == 1)
2204 2277 ncpu_per_core = cpi->cpi_ncpu_per_chip;
2205 2278 else if (cpi->cpi_ncore_per_chip > 1)
2206 2279 ncpu_per_core = cpi->cpi_ncpu_per_chip /
2207 2280 cpi->cpi_ncore_per_chip;
2208 2281 /*
2209 2282 * 8bit APIC IDs on dual core Pentiums
2210 2283 * look like this:
2211 2284 *
2212 2285 * +-----------------------+------+------+
2213 2286 * | Physical Package ID | MC | HT |
2214 2287 * +-----------------------+------+------+
2215 2288 * <------- chipid -------->
2216 2289 * <------- coreid --------------->
2217 2290 * <--- clogid -->
2218 2291 * <------>
2219 2292 * pkgcoreid
2220 2293 *
2221 2294 * Where the number of bits necessary to
2222 2295 * represent MC and HT fields together equals
2223 2296 * the minimum number of bits necessary to
2224 2297 * store the value of cpi->cpi_ncpu_per_chip.
2225 2298 * Of those bits, the MC part uses the number
2226 2299 * of bits necessary to store the value of
2227 2300 * cpi->cpi_ncore_per_chip.
2228 2301 */
2229 2302 for (i = 1; i < ncpu_per_core; i <<= 1)
2230 2303 coreid_shift++;
2231 2304 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2232 2305 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2233 2306 } else if (is_x86_feature(feature, X86FSET_HTT)) {
2234 2307 /*
2235 2308 * Single-core multi-threaded processors.
2236 2309 */
2237 2310 cpi->cpi_coreid = cpi->cpi_chipid;
2238 2311 cpi->cpi_pkgcoreid = 0;
2239 2312 } else {
2240 2313 /*
2241 2314 * Single-core single-thread processors.
2242 2315 */
2243 2316 cpi->cpi_coreid = cpu->cpu_id;
2244 2317 cpi->cpi_pkgcoreid = 0;
2245 2318 }
2246 2319 cpi->cpi_procnodeid = cpi->cpi_chipid;
2247 2320 cpi->cpi_compunitid = cpi->cpi_coreid;
2248 2321 }
2249 2322
2250 2323 /*
2251 2324 * Historically, AMD has had CMP chips with only a single thread per core.
2252 2325 * However, starting in family 17h (Zen), this has changed and they now have
2253 2326 * multiple threads. Our internal core id needs to be a unique value.
2254 2327 *
2255 2328 * To determine the core id of an AMD system, if we're from a family before 17h,
2256 2329 * then we just use the cpu id, as that gives us a good value that will be
2257 2330 * unique for each core. If instead, we're on family 17h or later, then we need
2258 2331 * to do something more complicated. CPUID leaf 0x8000001e can tell us
2259 2332 * how many threads share a core. Based on that, we'll shift the APIC ID.
2260 2333 * We can't use the normal core id in that leaf as it's only unique within the
2261 2334 * socket, which is perfect for cpi_pkgcoreid, but not us.
2262 2335 */
2263 2336 static id_t
2264 2337 cpuid_amd_get_coreid(cpu_t *cpu)
2265 2338 {
2266 2339 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2267 2340
2268 2341 if (cpi->cpi_family >= 0x17 &&
2269 2342 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2270 2343 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2271 2344 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2272 2345 if (nthreads > 1) {
2273 2346 VERIFY3U(nthreads, ==, 2);
2274 2347 return (cpi->cpi_apicid >> 1);
2275 2348 }
2276 2349 }
2277 2350
2278 2351 return (cpu->cpu_id);
2279 2352 }
2280 2353
2281 2354 /*
2282 2355 * Determining IDs on AMD is a more challenging task, notably because of the
2283 2356 * following two facts:
2284 2357 *
2285 2358 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
2286 2359 * also no way to get an actual unique core id from the system. As such, we
2287 2360 * synthesize this case by using cpu->cpu_id. This scheme does not,
2288 2361 * however, guarantee that sibling cores of a chip will have sequential
2289 2362 * coreids starting at a multiple of the number of cores per chip - that is
2290 2363 * usually the case, but if the ACPI MADT table is presented in a different
2291 2364 * order then we need to perform a few more gymnastics for the pkgcoreid.
2292 2365 *
2293 2366 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2294 2367 * called compute units. These compute units share the L1I cache, L2 cache,
2295 2368 * and the FPU. To deal with this, a new topology leaf was added in
2296 2369 * 0x8000001e. However, parts of this leaf have different meanings
2297 2370 * once we get to family 0x17.
2298 2371 */
2299 2372
2300 2373 static void
2301 2374 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2302 2375 {
2303 2376 int i, first_half, coreidsz;
2304 2377 uint32_t nb_caps_reg;
2305 2378 uint_t node2_1;
2306 2379 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2307 2380 struct cpuid_regs *cp;
2308 2381
2309 2382 /*
2310 2383 * Calculate the core id (this comes from hardware in family 0x17 if it
2311 2384 * hasn't been stripped by virtualization). We always set the compute
2312 2385 * unit id to the same value. Also, initialize the default number of
2313 2386 * cores per compute unit and nodes per package. This will be
2314 2387 * overwritten when we know information about a particular family.
2315 2388 */
2316 2389 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2317 2390 cpi->cpi_compunitid = cpi->cpi_coreid;
2318 2391 cpi->cpi_cores_per_compunit = 1;
2319 2392 cpi->cpi_procnodes_per_pkg = 1;
2320 2393
2321 2394 /*
2322 2395 	 * To construct the logical ID, we need to determine how many APIC ID bits
2323 2396 * are dedicated to the cores and threads. This is provided for us in
2324 2397 * 0x80000008. However, if it's not present (say due to virtualization),
2325 2398 * then we assume it's one. This should be present on all 64-bit AMD
2326 2399 	 * then we assume a single bit. This should be present on all 64-bit AMD
2327 2400 */
2328 2401 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2329 2402 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2330 2403
2331 2404 /*
2332 2405 * In AMD parlance chip is really a node while illumos
2333 2406 * uses chip as equivalent to socket/package.
2334 2407 */
2335 2408 if (coreidsz == 0) {
2336 2409 /* Use legacy method */
2337 2410 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2338 2411 coreidsz++;
2339 2412 if (coreidsz == 0)
2340 2413 coreidsz = 1;
2341 2414 }
2342 2415 } else {
2343 2416 /* Assume single-core part */
2344 2417 coreidsz = 1;
2345 2418 }
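	/*
	 * For example (hypothetical values), a coreidsz of 2 gives a mask of
	 * 0x3, so an APIC ID of 0x6 yields a clogid of 2.
	 */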
2346 2419 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2347 2420
2348 2421 /*
2349 2422 * The package core ID varies depending on the family. While it may be
2350 2423 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2351 2424 * this value is the core id in the given node. For non-virtualized
2352 2425 * family 17h, we need to take the logical core id and shift off the
2353 2426 * threads like we do when getting the core id. Otherwise, we can use
2354 2427 * the clogid as is. When family 17h is virtualized, the clogid should
2355 2428 	 * be sufficient: if we don't have valid data in the leaf, then we
2356 2429 	 * won't think we have SMT, in which case the cpi_clogid alone is all
2357 2430 	 * we need.
2358 2431 */
2359 2432 if (cpi->cpi_family >= 0x17 &&
2360 2433 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2361 2434 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2362 2435 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2363 2436 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2364 2437 if (nthreads > 1) {
2365 2438 VERIFY3U(nthreads, ==, 2);
2366 2439 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2367 2440 } else {
2368 2441 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2369 2442 }
2370 2443 } else {
2371 2444 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2372 2445 }
2373 2446
2374 2447 /*
2375 2448 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2376 2449 * (bulldozer) or newer, then we can derive all of this from leaf
2377 2450 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2378 2451 */
2379 2452 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2380 2453 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2381 2454 cp = &cpi->cpi_extd[0x1e];
2382 2455
2383 2456 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2384 2457 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2385 2458
2386 2459 /*
2387 2460 * For Bulldozer-era CPUs, recalculate the compute unit
2388 2461 * information.
2389 2462 */
2390 2463 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2391 2464 cpi->cpi_cores_per_compunit =
2392 2465 BITX(cp->cp_ebx, 15, 8) + 1;
2393 2466 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2394 2467 (cpi->cpi_ncore_per_chip /
2395 2468 cpi->cpi_cores_per_compunit) *
2396 2469 (cpi->cpi_procnodeid /
2397 2470 cpi->cpi_procnodes_per_pkg);
2398 2471 }
2399 2472 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2400 2473 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2401 2474 } else if (cpi->cpi_family == 0x10) {
2402 2475 /*
2403 2476 * See if we are a multi-node processor.
2404 2477 * All processors in the system have the same number of nodes
2405 2478 */
2406 2479 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2407 2480 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2408 2481 /* Single-node */
2409 2482 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2410 2483 coreidsz);
2411 2484 } else {
2412 2485
2413 2486 /*
2414 2487 * Multi-node revision D (2 nodes per package
2415 2488 * are supported)
2416 2489 */
2417 2490 cpi->cpi_procnodes_per_pkg = 2;
2418 2491
2419 2492 first_half = (cpi->cpi_pkgcoreid <=
2420 2493 (cpi->cpi_ncore_per_chip/2 - 1));
2421 2494
2422 2495 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2423 2496 /* We are BSP */
2424 2497 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2425 2498 } else {
2426 2499
2427 2500 /* We are AP */
2428 2501 /* NodeId[2:1] bits to use for reading F3xe8 */
2429 2502 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2430 2503
2431 2504 nb_caps_reg =
2432 2505 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2433 2506
2434 2507 /*
2435 2508 * Check IntNodeNum bit (31:30, but bit 31 is
2436 2509 * always 0 on dual-node processors)
2437 2510 */
2438 2511 if (BITX(nb_caps_reg, 30, 30) == 0)
2439 2512 cpi->cpi_procnodeid = node2_1 +
2440 2513 !first_half;
2441 2514 else
2442 2515 cpi->cpi_procnodeid = node2_1 +
2443 2516 first_half;
2444 2517 }
2445 2518 }
2446 2519 } else {
2447 2520 cpi->cpi_procnodeid = 0;
2448 2521 }
2449 2522
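	/*
	 * e.g. with two nodes per package (hypothetical), processor node ids
	 * 2 and 3 both map to chip 1.
	 */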
2450 2523 cpi->cpi_chipid =
2451 2524 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2452 2525
2453 2526 cpi->cpi_ncore_bits = coreidsz;
2454 2527 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2455 2528 cpi->cpi_ncore_per_chip);
2456 2529 }
2457 2530
2458 2531 static void
2459 2532 spec_uarch_flush_noop(void)
2460 2533 {
2461 2534 }
2462 2535
2463 2536 /*
2464 2537 * When microcode is present that mitigates MDS, this wrmsr will also flush the
2465 2538 * MDS-related micro-architectural state that would normally happen by calling
2466 2539 * x86_md_clear().
2467 2540 */
2468 2541 static void
2469 2542 spec_uarch_flush_msr(void)
2470 2543 {
2471 2544 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2472 2545 }
2473 2546
2474 2547 /*
2475 2548 * This function points to a function that will flush certain
2476 2549 * micro-architectural state on the processor. This flush is used to mitigate
2477 2550 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2478 2551 * function can point to one of three functions:
2479 2552 *
2480 2553  * - A noop, which we use either because we are vulnerable but do not have
2481 2554  *   the microcode available to help mitigate the issue, or because we aren't
2482 2555 * vulnerable.
2483 2556 *
2484 2557 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2485 2558 * mitigate MDS is present, also perform the equivalent of the MDS flush;
2486 2559 * however, it only flushes the MDS related micro-architectural state on the
2487 2560 * current hyperthread, it does not do anything for the twin.
2488 2561 *
2489 2562 * - x86_md_clear which will flush the MDS related state. This is done when we
2490 2563 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2491 2564 * (RDCL_NO is set).
2492 2565 */
2493 2566 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2494 2567
2495 2568 static void
2496 2569 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2497 2570 {
2498 2571 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2499 2572
2500 2573 /*
2501 2574 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2502 2575 * has been fixed in hardware, it doesn't cover everything related to
2503 2576 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2504 2577 * need to mitigate this.
2505 2578 */
2506 2579 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2507 2580 is_x86_feature(featureset, X86FSET_MDS_NO)) {
2508 2581 return;
2509 2582 }
2510 2583
2511 2584 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2512 2585 const uint8_t nop = NOP_INSTR;
2513 2586 uint8_t *md = (uint8_t *)x86_md_clear;
2514 2587
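		/*
		 * x86_md_clear is expected to begin with a ret; overwriting
		 * that leading byte with a nop lets execution fall through
		 * into the verw-based flush sequence, enabling the
		 * mitigation.
		 */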
2515 2588 *md = nop;
2516 2589 }
2517 2590
2518 2591 membar_producer();
2519 2592 }
2520 2593
2521 2594 static void
2522 2595 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2523 2596 {
2524 2597 boolean_t need_l1d, need_mds;
2525 2598 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2526 2599
2527 2600 /*
2528 2601 * If we're not on Intel or we've mitigated both RDCL and MDS in
2529 2602 * hardware, then there's nothing left for us to do for enabling the
2530 2603 * flush. We can also go ahead and say that SMT exclusion is
2531 2604 * unnecessary.
2532 2605 */
2533 2606 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2534 2607 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2535 2608 is_x86_feature(featureset, X86FSET_MDS_NO))) {
2536 2609 extern int smt_exclusion;
2537 2610 smt_exclusion = 0;
2538 2611 spec_uarch_flush = spec_uarch_flush_noop;
2539 2612 membar_producer();
2540 2613 return;
2541 2614 }
2542 2615
2543 2616 /*
2544 2617 	 * The locations where we need to perform an L1D flush are required for
2545 2618 	 * mitigating both L1TF and MDS. When verw support is present in
2546 2619 * microcode, then the L1D flush will take care of doing that as well.
2547 2620 * However, if we have a system where RDCL_NO is present, but we don't
2548 2621 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2549 2622 * L1D flush.
2550 2623 */
2551 2624 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2552 2625 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2553 2626 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2554 2627 need_l1d = B_TRUE;
2555 2628 } else {
2556 2629 need_l1d = B_FALSE;
2557 2630 }
2558 2631
2559 2632 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2560 2633 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2561 2634 need_mds = B_TRUE;
2562 2635 } else {
2563 2636 need_mds = B_FALSE;
2564 2637 }
2565 2638
2566 2639 if (need_l1d) {
2567 2640 spec_uarch_flush = spec_uarch_flush_msr;
2568 2641 } else if (need_mds) {
2569 2642 spec_uarch_flush = x86_md_clear;
2570 2643 } else {
2571 2644 /*
2572 2645 * We have no hardware mitigations available to us.
2573 2646 */
2574 2647 spec_uarch_flush = spec_uarch_flush_noop;
2575 2648 }
2576 2649 membar_producer();
2577 2650 }
2578 2651
2579 2652 /*
2580 2653 * We default to enabling RSB mitigations.
2581 2654 */
2582 2655 static void
2583 2656 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2584 2657 {
2585 2658 const uint8_t ret = RET_INSTR;
2586 2659 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2587 2660
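	/*
	 * x86_rsb_stuff is the RSB stuffing routine; patching a ret over its
	 * first instruction turns it into a no-op for the cases below where
	 * stuffing is unnecessary.
	 */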
2588 2661 switch (mit) {
2589 2662 case X86_SPECTREV2_ENHANCED_IBRS:
2590 2663 case X86_SPECTREV2_DISABLED:
2591 2664 *stuff = ret;
2592 2665 break;
2593 2666 default:
2594 2667 break;
2595 2668 }
2596 2669 }
2597 2670
2598 2671 static void
2599 2672 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2600 2673 {
2601 2674 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2602 2675 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2603 2676 "_r14", "_r15" };
2604 2677 const uint_t nthunks = ARRAY_SIZE(thunks);
2605 2678 const char *type;
2606 2679 uint_t i;
2607 2680
2608 2681 if (mit == x86_spectrev2_mitigation)
2609 2682 return;
2610 2683
2611 2684 switch (mit) {
2612 2685 case X86_SPECTREV2_RETPOLINE:
2613 2686 type = "gen";
2614 2687 break;
2615 2688 case X86_SPECTREV2_RETPOLINE_AMD:
2616 2689 type = "amd";
2617 2690 break;
2618 2691 case X86_SPECTREV2_ENHANCED_IBRS:
2619 2692 case X86_SPECTREV2_DISABLED:
2620 2693 type = "jmp";
2621 2694 break;
2622 2695 default:
2623 2696 		panic("asked to update retpoline state with unknown state!");
2624 2697 }
2625 2698
2626 2699 for (i = 0; i < nthunks; i++) {
2627 2700 uintptr_t source, dest;
2628 2701 int ssize, dsize;
2629 2702 char sourcebuf[64], destbuf[64];
2630 2703 size_t len;
2631 2704
2632 2705 (void) snprintf(destbuf, sizeof (destbuf),
2633 2706 "__x86_indirect_thunk%s", thunks[i]);
2634 2707 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2635 2708 "__x86_indirect_thunk_%s%s", type, thunks[i]);
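		/*
		 * e.g. for the generic retpoline case this copies
		 * __x86_indirect_thunk_gen_rax over __x86_indirect_thunk_rax,
		 * and similarly for each remaining register.
		 */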
2636 2709
2637 2710 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2638 2711 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2639 2712 VERIFY3U(source, !=, 0);
2640 2713 VERIFY3U(dest, !=, 0);
2641 2714 VERIFY3S(dsize, >=, ssize);
2642 2715 bcopy((void *)source, (void *)dest, ssize);
2643 2716 }
2644 2717 }
2645 2718
2646 2719 static void
2647 2720 cpuid_enable_enhanced_ibrs(void)
2648 2721 {
2649 2722 uint64_t val;
2650 2723
2651 2724 val = rdmsr(MSR_IA32_SPEC_CTRL);
2652 2725 val |= IA32_SPEC_CTRL_IBRS;
2653 2726 wrmsr(MSR_IA32_SPEC_CTRL, val);
2654 2727 }
2655 2728
2656 2729 #ifndef __xpv
2657 2730 /*
2658 2731 * Determine whether or not we can use the AMD optimized retpoline
2659 2732 * functionality. We use this when we know we're on an AMD system and we can
2660 2733 * successfully verify that lfence is dispatch serializing.
2661 2734 */
2662 2735 static boolean_t
2663 2736 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2664 2737 {
2665 2738 uint64_t val;
2666 2739 on_trap_data_t otd;
2667 2740
2668 2741 if (cpi->cpi_vendor != X86_VENDOR_AMD)
2669 2742 return (B_FALSE);
2670 2743
2671 2744 /*
2672 2745 * We need to determine whether or not lfence is serializing. It always
2673 2746 * is on families 0xf and 0x11. On others, it's controlled by
2674 2747 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2675 2748 * crazy old family, don't try and do anything.
2676 2749 */
2677 2750 if (cpi->cpi_family < 0xf)
2678 2751 return (B_FALSE);
2679 2752 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2680 2753 return (B_TRUE);
2681 2754
2682 2755 /*
2683 2756 * While it may be tempting to use get_hwenv(), there are no promises
2684 2757 	 * that a hypervisor will actually declare itself as such in a
2685 2758 * friendly way. As such, try to read and set the MSR. If we can then
2686 2759 * read back the value we set (it wasn't just set to zero), then we go
2687 2760 * for it.
2688 2761 */
2689 2762 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2690 2763 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2691 2764 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2692 2765 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2693 2766 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2694 2767 } else {
2695 2768 val = 0;
2696 2769 }
2697 2770 no_trap();
2698 2771
2699 2772 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2700 2773 return (B_TRUE);
2701 2774 return (B_FALSE);
2702 2775 }
2703 2776 #endif /* !__xpv */
2704 2777
2778 +/*
2779 + * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2780 + * we can disable TSX, we do so.
2781 + *
2782 + * This determination is done only on the boot CPU, potentially after loading
2783 + * updated microcode.
2784 + */
2705 2785 static void
2786 +cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2787 +{
2788 + struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2789 +
2790 + VERIFY(cpu->cpu_id == 0);
2791 +
2792 + if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2793 + x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2794 + return;
2795 + }
2796 +
2797 + if (x86_disable_taa) {
2798 + x86_taa_mitigation = X86_TAA_DISABLED;
2799 + return;
2800 + }
2801 +
2802 + /*
2803 + * If we do not have the ability to disable TSX, then our only
2804 + * mitigation options are in hardware (TAA_NO), or by using our existing
2805 + * MDS mitigation as described above. The latter relies upon us having
2806 + * configured MDS mitigations correctly! This includes disabling SMT if
2807 +	 * we want cross-CPU-thread protection.
2808 + */
2809 + if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2810 + /*
2811 + * It's not clear whether any parts will enumerate TAA_NO
2812 + * *without* TSX_CTRL, but let's mark it as such if we see this.
2813 + */
2814 + if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2815 + x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2816 + return;
2817 + }
2818 +
2819 + if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2820 + !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2821 + x86_taa_mitigation = X86_TAA_MD_CLEAR;
2822 + } else {
2823 + x86_taa_mitigation = X86_TAA_NOTHING;
2824 + }
2825 + return;
2826 + }
2827 +
2828 + /*
2829 + * We have TSX_CTRL, but we can only fully disable TSX if we're early
2830 + * enough in boot.
2831 + *
2832 + * Otherwise, we'll fall back to causing transactions to abort as our
2833 + * mitigation. TSX-using code will always take the fallback path.
2834 + */
2835 + if (cpi->cpi_pass < 4) {
2836 + x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2837 + } else {
2838 + x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2839 + }
2840 +}
2841 +
2842 +static void
2843 +cpuid_apply_tsx(x86_taa_mitigation_t taa)
2844 +{
2845 + uint64_t val;
2846 +
2847 + switch (taa) {
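	/*
	 * IA32_TSX_CTRL_RTM_DISABLE forces RTM transactions to abort
	 * immediately, while IA32_TSX_CTRL_CPUID_CLEAR additionally hides
	 * the RTM and HLE CPUID bits so software will not try to use TSX at
	 * all.
	 */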
2848 + case X86_TAA_TSX_DISABLE:
2849 + val = rdmsr(MSR_IA32_TSX_CTRL);
2850 + val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2851 + wrmsr(MSR_IA32_TSX_CTRL, val);
2852 + break;
2853 + case X86_TAA_TSX_FORCE_ABORT:
2854 + val = rdmsr(MSR_IA32_TSX_CTRL);
2855 + val |= IA32_TSX_CTRL_RTM_DISABLE;
2856 + wrmsr(MSR_IA32_TSX_CTRL, val);
2857 + break;
2858 + case X86_TAA_HW_MITIGATED:
2859 + case X86_TAA_MD_CLEAR:
2860 + case X86_TAA_DISABLED:
2861 + case X86_TAA_NOTHING:
2862 + break;
2863 + }
2864 +}
2865 +
2866 +static void
2706 2867 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2707 2868 {
2708 2869 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2709 2870 x86_spectrev2_mitigation_t v2mit;
2710 2871
2711 2872 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2712 2873 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2713 2874 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2714 2875 add_x86_feature(featureset, X86FSET_IBPB);
2715 2876 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2716 2877 add_x86_feature(featureset, X86FSET_IBRS);
2717 2878 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2718 2879 add_x86_feature(featureset, X86FSET_STIBP);
2719 2880 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2720 2881 add_x86_feature(featureset, X86FSET_STIBP_ALL);
2721 2882 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2722 2883 add_x86_feature(featureset, X86FSET_SSBD);
2723 2884 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2724 2885 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2725 2886 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2726 2887 add_x86_feature(featureset, X86FSET_SSB_NO);
2727 2888 /*
2728 2889 * Don't enable enhanced IBRS unless we're told that we should
2729 2890 * prefer it and it has the same semantics as Intel. This is
2730 2891 * split into two bits rather than a single one.
2731 2892 */
2732 2893 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2733 2894 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2734 2895 add_x86_feature(featureset, X86FSET_IBRS_ALL);
2735 2896 }
2736 2897
2737 2898 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2738 2899 cpi->cpi_maxeax >= 7) {
2739 2900 struct cpuid_regs *ecp;
2740 2901 ecp = &cpi->cpi_std[7];
2741 2902
2742 2903 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2743 2904 add_x86_feature(featureset, X86FSET_MD_CLEAR);
2744 2905 }
2745 2906
2746 2907 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2747 2908 add_x86_feature(featureset, X86FSET_IBRS);
2748 2909 add_x86_feature(featureset, X86FSET_IBPB);
2749 2910 }
2750 2911
2751 2912 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2752 2913 add_x86_feature(featureset, X86FSET_STIBP);
2753 2914 }
2754 2915
2755 2916 /*
2756 2917 * Don't read the arch caps MSR on xpv where we lack the
2757 2918 * on_trap().
2758 2919 */
2759 2920 #ifndef __xpv
2760 2921 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2761 2922 on_trap_data_t otd;
2762 2923
2763 2924 /*
2764 2925 * Be paranoid and assume we'll get a #GP.
2765 2926 */
2766 2927 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2767 2928 uint64_t reg;
2768 2929
2769 2930 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2770 2931 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2771 2932 add_x86_feature(featureset,
2772 2933 X86FSET_RDCL_NO);
2773 2934 }
2774 2935 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2775 2936 add_x86_feature(featureset,
2776 2937 X86FSET_IBRS_ALL);
2777 2938 }
2778 2939 if (reg & IA32_ARCH_CAP_RSBA) {
2779 2940 add_x86_feature(featureset,
2780 2941 X86FSET_RSBA);
2781 2942 }
2782 2943 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2783 2944 add_x86_feature(featureset,
2784 2945 X86FSET_L1D_VM_NO);
2785 2946 }
2786 2947 if (reg & IA32_ARCH_CAP_SSB_NO) {
2787 2948 add_x86_feature(featureset,
2788 2949 X86FSET_SSB_NO);
2789 2950 }
2790 2951 if (reg & IA32_ARCH_CAP_MDS_NO) {
2791 2952 add_x86_feature(featureset,
2792 2953 X86FSET_MDS_NO);
2793 2954 }
2955 + if (reg & IA32_ARCH_CAP_TSX_CTRL) {
2956 + add_x86_feature(featureset,
2957 + X86FSET_TSX_CTRL);
2958 + }
2959 + if (reg & IA32_ARCH_CAP_TAA_NO) {
2960 + add_x86_feature(featureset,
2961 + X86FSET_TAA_NO);
2962 + }
2794 2963 }
2795 2964 no_trap();
2796 2965 }
2797 2966 #endif /* !__xpv */
2798 2967
2799 2968 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2800 2969 add_x86_feature(featureset, X86FSET_SSBD);
2801 2970
2802 2971 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2803 2972 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2804 2973 }
2805 2974
2975 + /*
2976 + * Take care of certain mitigations on the non-boot CPU. The boot CPU
2977 + * will have already run this function and determined what we need to
2978 + * do. This gives us a hook for per-HW thread mitigations such as
2979 + * enhanced IBRS, or disabling TSX. For TSX disabling, we need to be
2980 + * careful that we've had a chance to load ucode that enables the new
2981 + * MSRs.
2982 + */
2806 2983 if (cpu->cpu_id != 0) {
2807 2984 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2808 2985 cpuid_enable_enhanced_ibrs();
2809 2986 }
2987 +
2988 + if (cpi->cpi_pass >= 1)
2989 + cpuid_apply_tsx(x86_taa_mitigation);
2810 2990 return;
2811 2991 }
2812 2992
2813 2993 /*
2814 2994 * Go through and initialize various security mechanisms that we should
2815 - * only do on a single CPU. This includes Spectre V2, L1TF, and MDS.
2995 + * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
2996 + * TAA.
2816 2997 */
2817 2998
2818 2999 /*
2819 3000 * By default we've come in with retpolines enabled. Check whether we
2820 3001 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2821 3002 * by default, but disabled if we are using enhanced IBRS.
2822 3003 */
2823 3004 if (x86_disable_spectrev2 != 0) {
2824 3005 v2mit = X86_SPECTREV2_DISABLED;
2825 3006 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2826 3007 cpuid_enable_enhanced_ibrs();
2827 3008 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2828 3009 #ifndef __xpv
2829 3010 } else if (cpuid_use_amd_retpoline(cpi)) {
2830 3011 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2831 3012 #endif /* !__xpv */
2832 3013 } else {
2833 3014 v2mit = X86_SPECTREV2_RETPOLINE;
2834 3015 }
2835 3016
2836 3017 cpuid_patch_retpolines(v2mit);
2837 3018 cpuid_patch_rsb(v2mit);
2838 3019 x86_spectrev2_mitigation = v2mit;
2839 3020 membar_producer();
2840 3021
2841 3022 /*
2842 3023 * We need to determine what changes are required for mitigating L1TF
2843 3024 * and MDS. If the CPU suffers from either of them, then SMT exclusion
2844 3025 * is required.
2845 3026 *
2846 3027 * If any of these are present, then we need to flush u-arch state at
2847 3028 * various points. For MDS, we need to do so whenever we change to a
2848 3029 * lesser privilege level or we are halting the CPU. For L1TF we need to
2849 3030 * flush the L1D cache at VM entry. When we have microcode that handles
2850 3031 * MDS, the L1D flush also clears the other u-arch state that the
2851 3032 * md_clear does.
2852 3033 */
2853 3034
2854 3035 /*
2855 3036 * Update whether or not we need to be taking explicit action against
2856 3037 * MDS.
2857 3038 */
2858 3039 cpuid_update_md_clear(cpu, featureset);
2859 3040
2860 3041 /*
2861 3042 * Determine whether SMT exclusion is required and whether or not we
2862 3043 * need to perform an l1d flush.
2863 3044 */
2864 3045 cpuid_update_l1d_flush(cpu, featureset);
3046 +
3047 + /*
3048 + * Determine what our mitigation strategy should be for TAA and then
3049 + * also apply TAA mitigations.
3050 + */
3051 + cpuid_update_tsx(cpu, featureset);
3052 + cpuid_apply_tsx(x86_taa_mitigation);
2865 3053 }
2866 3054
2867 3055 /*
2868 3056 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
2869 3057 */
2870 3058 void
2871 3059 setup_xfem(void)
2872 3060 {
2873 3061 uint64_t flags = XFEATURE_LEGACY_FP;
2874 3062
2875 3063 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2876 3064
2877 3065 if (is_x86_feature(x86_featureset, X86FSET_SSE))
2878 3066 flags |= XFEATURE_SSE;
2879 3067
2880 3068 if (is_x86_feature(x86_featureset, X86FSET_AVX))
2881 3069 flags |= XFEATURE_AVX;
2882 3070
2883 3071 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2884 3072 flags |= XFEATURE_AVX512;
2885 3073
2886 3074 set_xcr(XFEATURE_ENABLED_MASK, flags);
2887 3075
2888 3076 xsave_bv_all = flags;
2889 3077 }
2890 3078
2891 3079 static void
2892 3080 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2893 3081 {
2894 3082 struct cpuid_info *cpi;
2895 3083
2896 3084 cpi = cpu->cpu_m.mcpu_cpi;
2897 3085
2898 3086 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2899 3087 cpuid_gather_amd_topology_leaves(cpu);
2900 3088 }
2901 3089
2902 3090 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2903 3091
2904 3092 /*
2905 3093 * Before we can calculate the IDs that we should assign to this
2906 3094 * processor, we need to understand how many cores and threads it has.
2907 3095 */
2908 3096 switch (cpi->cpi_vendor) {
2909 3097 case X86_VENDOR_Intel:
2910 3098 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2911 3099 &cpi->cpi_ncore_per_chip);
2912 3100 break;
2913 3101 case X86_VENDOR_AMD:
2914 3102 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2915 3103 &cpi->cpi_ncore_per_chip);
2916 3104 break;
2917 3105 default:
2918 3106 /*
2919 3107 * If we have some other x86 compatible chip, it's not clear how
2920 3108 			 * it would behave. The most common case is virtualization
2921 3109 * today, though there are also 64-bit VIA chips. Assume that
2922 3110 * all we can get is the basic Leaf 1 HTT information.
2923 3111 */
2924 3112 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2925 3113 cpi->cpi_ncore_per_chip = 1;
2926 3114 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2927 3115 }
2928 3116 break;
2929 3117 }
2930 3118
2931 3119 /*
2932 3120 * Based on the calculated number of threads and cores, potentially
2933 3121 * assign the HTT and CMT features.
2934 3122 */
2935 3123 if (cpi->cpi_ncore_per_chip > 1) {
2936 3124 add_x86_feature(featureset, X86FSET_CMP);
2937 3125 }
2938 3126
2939 3127 if (cpi->cpi_ncpu_per_chip > 1 &&
2940 3128 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2941 3129 add_x86_feature(featureset, X86FSET_HTT);
2942 3130 }
2943 3131
2944 3132 /*
2945 3133 	 * Now that this has been set up, we need to go through and calculate all of
2946 3134 * the rest of the parameters that exist. If we think the CPU doesn't
2947 3135 * have either SMT (HTT) or CMP, then we basically go through and fake
2948 3136 * up information in some way. The most likely case for this is
2949 3137 * virtualization where we have a lot of partial topology information.
2950 3138 */
2951 3139 if (!is_x86_feature(featureset, X86FSET_HTT) &&
2952 3140 !is_x86_feature(featureset, X86FSET_CMP)) {
2953 3141 /*
2954 3142 * This is a single core, single-threaded processor.
2955 3143 */
2956 3144 cpi->cpi_procnodes_per_pkg = 1;
2957 3145 cpi->cpi_cores_per_compunit = 1;
2958 3146 cpi->cpi_compunitid = 0;
2959 3147 cpi->cpi_chipid = -1;
2960 3148 cpi->cpi_clogid = 0;
2961 3149 cpi->cpi_coreid = cpu->cpu_id;
2962 3150 cpi->cpi_pkgcoreid = 0;
2963 3151 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2964 3152 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2965 3153 } else {
2966 3154 cpi->cpi_procnodeid = cpi->cpi_chipid;
2967 3155 }
2968 3156 } else {
2969 3157 switch (cpi->cpi_vendor) {
2970 3158 case X86_VENDOR_Intel:
2971 3159 cpuid_intel_getids(cpu, featureset);
2972 3160 break;
2973 3161 case X86_VENDOR_AMD:
2974 3162 cpuid_amd_getids(cpu, featureset);
2975 3163 break;
2976 3164 default:
2977 3165 /*
2978 3166 * In this case, it's hard to say what we should do.
2979 3167 * We're going to model them to the OS as single core
2980 3168 * threads. We don't have a good identifier for them, so
2981 3169 * we're just going to use the cpu id all on a single
2982 3170 * chip.
2983 3171 *
2984 3172 * This case has historically been different from the
2985 3173 * case above where we don't have HTT or CMP. While they
2986 3174 * could be combined, we've opted to keep it separate to
2987 3175 * minimize the risk of topology changes in weird cases.
2988 3176 */
2989 3177 cpi->cpi_procnodes_per_pkg = 1;
2990 3178 cpi->cpi_cores_per_compunit = 1;
2991 3179 cpi->cpi_chipid = 0;
2992 3180 cpi->cpi_coreid = cpu->cpu_id;
2993 3181 cpi->cpi_clogid = cpu->cpu_id;
2994 3182 cpi->cpi_pkgcoreid = cpu->cpu_id;
2995 3183 cpi->cpi_procnodeid = cpi->cpi_chipid;
2996 3184 cpi->cpi_compunitid = cpi->cpi_coreid;
2997 3185 break;
2998 3186 }
2999 3187 }
3000 3188 }
3001 3189
3002 3190 /*
3003 3191 * Gather relevant CPU features from leaf 6 which covers thermal information. We
3004 3192 * always gather leaf 6 if it's supported; however, we only look for features on
3005 3193 * Intel systems as AMD does not currently define any of the features we look
3006 3194 * for below.
3007 3195 */
3008 3196 static void
3009 3197 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3010 3198 {
3011 3199 struct cpuid_regs *cp;
3012 3200 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3013 3201
3014 3202 if (cpi->cpi_maxeax < 6) {
3015 3203 return;
3016 3204 }
3017 3205
3018 3206 cp = &cpi->cpi_std[6];
3019 3207 cp->cp_eax = 6;
3020 3208 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3021 3209 (void) __cpuid_insn(cp);
3022 3210 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3023 3211
3024 3212 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3025 3213 return;
3026 3214 }
3027 3215
3028 3216 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3029 3217 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3030 3218 }
3031 3219
3032 3220 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3033 3221 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3034 3222 }
3035 3223 }
3036 3224
3037 3225 void
3038 3226 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3039 3227 {
3040 3228 uint32_t mask_ecx, mask_edx;
3041 3229 struct cpuid_info *cpi;
3042 3230 struct cpuid_regs *cp;
3043 3231 int xcpuid;
3044 3232 #if !defined(__xpv)
3045 3233 extern int idle_cpu_prefer_mwait;
3046 3234 #endif
3047 3235
3048 3236 /*
3049 3237 * Space statically allocated for BSP, ensure pointer is set
3050 3238 */
3051 3239 if (cpu->cpu_id == 0) {
3052 3240 if (cpu->cpu_m.mcpu_cpi == NULL)
3053 3241 cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3054 3242 }
3055 3243
3056 3244 add_x86_feature(featureset, X86FSET_CPUID);
3057 3245
3058 3246 cpi = cpu->cpu_m.mcpu_cpi;
3059 3247 ASSERT(cpi != NULL);
3060 3248 cp = &cpi->cpi_std[0];
3061 3249 cp->cp_eax = 0;
3062 3250 cpi->cpi_maxeax = __cpuid_insn(cp);
3063 3251 {
3064 3252 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3065 3253 *iptr++ = cp->cp_ebx;
3066 3254 *iptr++ = cp->cp_edx;
3067 3255 *iptr++ = cp->cp_ecx;
3068 3256 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3069 3257 }
3070 3258
3071 3259 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3072 3260 x86_vendor = cpi->cpi_vendor; /* for compatibility */
3073 3261
3074 3262 /*
3075 3263 * Limit the range in case of weird hardware
3076 3264 */
3077 3265 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3078 3266 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3079 3267 if (cpi->cpi_maxeax < 1)
3080 3268 goto pass1_done;
3081 3269
3082 3270 cp = &cpi->cpi_std[1];
3083 3271 cp->cp_eax = 1;
3084 3272 (void) __cpuid_insn(cp);
3085 3273
3086 3274 /*
3087 3275 * Extract identifying constants for easy access.
3088 3276 */
3089 3277 cpi->cpi_model = CPI_MODEL(cpi);
3090 3278 cpi->cpi_family = CPI_FAMILY(cpi);
3091 3279
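	/*
	 * e.g. a part reporting base family 0xf and extended family 0x8 ends
	 * up as family 0x17 here.
	 */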
3092 3280 if (cpi->cpi_family == 0xf)
3093 3281 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3094 3282
3095 3283 /*
3096 3284 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3097 3285 * Intel, and presumably everyone else, uses model == 0xf, as
3098 3286 * one would expect (max value means possible overflow). Sigh.
3099 3287 */
3100 3288
3101 3289 switch (cpi->cpi_vendor) {
3102 3290 case X86_VENDOR_Intel:
3103 3291 if (IS_EXTENDED_MODEL_INTEL(cpi))
3104 3292 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3105 3293 break;
3106 3294 case X86_VENDOR_AMD:
3107 3295 if (CPI_FAMILY(cpi) == 0xf)
3108 3296 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3109 3297 break;
3110 3298 default:
3111 3299 if (cpi->cpi_model == 0xf)
3112 3300 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3113 3301 break;
3114 3302 }
3115 3303
3116 3304 cpi->cpi_step = CPI_STEP(cpi);
3117 3305 cpi->cpi_brandid = CPI_BRANDID(cpi);
3118 3306
3119 3307 /*
3120 3308 * *default* assumptions:
3121 3309 * - believe %edx feature word
3122 3310 * - ignore %ecx feature word
3123 3311 * - 32-bit virtual and physical addressing
3124 3312 */
3125 3313 mask_edx = 0xffffffff;
3126 3314 mask_ecx = 0;
3127 3315
3128 3316 cpi->cpi_pabits = cpi->cpi_vabits = 32;
3129 3317
3130 3318 switch (cpi->cpi_vendor) {
3131 3319 case X86_VENDOR_Intel:
3132 3320 if (cpi->cpi_family == 5)
3133 3321 x86_type = X86_TYPE_P5;
3134 3322 else if (IS_LEGACY_P6(cpi)) {
3135 3323 x86_type = X86_TYPE_P6;
3136 3324 pentiumpro_bug4046376 = 1;
3137 3325 /*
3138 3326 * Clear the SEP bit when it was set erroneously
3139 3327 */
3140 3328 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3141 3329 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3142 3330 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3143 3331 x86_type = X86_TYPE_P4;
3144 3332 /*
3145 3333 * We don't currently depend on any of the %ecx
3146 3334 * features until Prescott, so we'll only check
3147 3335 * this from P4 onwards. We might want to revisit
3148 3336 * that idea later.
3149 3337 */
3150 3338 mask_ecx = 0xffffffff;
3151 3339 } else if (cpi->cpi_family > 0xf)
3152 3340 mask_ecx = 0xffffffff;
3153 3341 /*
3154 3342 * We don't support MONITOR/MWAIT if leaf 5 is not available
3155 3343 * to obtain the monitor linesize.
3156 3344 */
3157 3345 if (cpi->cpi_maxeax < 5)
3158 3346 mask_ecx &= ~CPUID_INTC_ECX_MON;
3159 3347 break;
3160 3348 case X86_VENDOR_IntelClone:
3161 3349 default:
3162 3350 break;
3163 3351 case X86_VENDOR_AMD:
3164 3352 #if defined(OPTERON_ERRATUM_108)
3165 3353 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3166 3354 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3167 3355 cpi->cpi_model = 0xc;
3168 3356 } else
3169 3357 #endif
3170 3358 if (cpi->cpi_family == 5) {
3171 3359 /*
3172 3360 * AMD K5 and K6
3173 3361 *
3174 3362 * These CPUs have an incomplete implementation
3175 3363 * of MCA/MCE which we mask away.
3176 3364 */
3177 3365 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3178 3366
3179 3367 /*
3180 3368 * Model 0 uses the wrong (APIC) bit
3181 3369 * to indicate PGE. Fix it here.
3182 3370 */
3183 3371 if (cpi->cpi_model == 0) {
3184 3372 if (cp->cp_edx & 0x200) {
3185 3373 cp->cp_edx &= ~0x200;
3186 3374 cp->cp_edx |= CPUID_INTC_EDX_PGE;
3187 3375 }
3188 3376 }
3189 3377
3190 3378 /*
3191 3379 * Early models had problems w/ MMX; disable.
3192 3380 */
3193 3381 if (cpi->cpi_model < 6)
3194 3382 mask_edx &= ~CPUID_INTC_EDX_MMX;
3195 3383 }
3196 3384
3197 3385 /*
3198 3386 * For newer families, SSE3 and CX16, at least, are valid;
3199 3387 * enable all
3200 3388 */
3201 3389 if (cpi->cpi_family >= 0xf)
3202 3390 mask_ecx = 0xffffffff;
3203 3391 /*
3204 3392 * We don't support MONITOR/MWAIT if leaf 5 is not available
3205 3393 * to obtain the monitor linesize.
3206 3394 */
3207 3395 if (cpi->cpi_maxeax < 5)
3208 3396 mask_ecx &= ~CPUID_INTC_ECX_MON;
3209 3397
3210 3398 #if !defined(__xpv)
3211 3399 /*
3212 3400 * AMD has not historically used MWAIT in the CPU's idle loop.
3213 3401 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3214 3402 * know for certain that in at least family 17h, per AMD, mwait
3215 3403 * is preferred. Families in-between are less certain.
3216 3404 */
3217 3405 if (cpi->cpi_family < 0x17) {
3218 3406 idle_cpu_prefer_mwait = 0;
3219 3407 }
3220 3408 #endif
3221 3409
3222 3410 break;
3223 3411 case X86_VENDOR_TM:
3224 3412 /*
3225 3413 * workaround the NT workaround in CMS 4.1
3226 3414 */
3227 3415 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3228 3416 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3229 3417 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3230 3418 break;
3231 3419 case X86_VENDOR_Centaur:
3232 3420 /*
3233 3421 * workaround the NT workarounds again
3234 3422 */
3235 3423 if (cpi->cpi_family == 6)
3236 3424 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3237 3425 break;
3238 3426 case X86_VENDOR_Cyrix:
3239 3427 /*
3240 3428 * We rely heavily on the probing in locore
3241 3429 * to actually figure out what parts, if any,
3242 3430 * of the Cyrix cpuid instruction to believe.
3243 3431 */
3244 3432 switch (x86_type) {
3245 3433 case X86_TYPE_CYRIX_486:
3246 3434 mask_edx = 0;
3247 3435 break;
3248 3436 case X86_TYPE_CYRIX_6x86:
3249 3437 mask_edx = 0;
3250 3438 break;
3251 3439 case X86_TYPE_CYRIX_6x86L:
3252 3440 mask_edx =
3253 3441 CPUID_INTC_EDX_DE |
3254 3442 CPUID_INTC_EDX_CX8;
3255 3443 break;
3256 3444 case X86_TYPE_CYRIX_6x86MX:
3257 3445 mask_edx =
3258 3446 CPUID_INTC_EDX_DE |
3259 3447 CPUID_INTC_EDX_MSR |
3260 3448 CPUID_INTC_EDX_CX8 |
3261 3449 CPUID_INTC_EDX_PGE |
3262 3450 CPUID_INTC_EDX_CMOV |
3263 3451 CPUID_INTC_EDX_MMX;
3264 3452 break;
3265 3453 case X86_TYPE_CYRIX_GXm:
3266 3454 mask_edx =
3267 3455 CPUID_INTC_EDX_MSR |
3268 3456 CPUID_INTC_EDX_CX8 |
3269 3457 CPUID_INTC_EDX_CMOV |
3270 3458 CPUID_INTC_EDX_MMX;
3271 3459 break;
3272 3460 case X86_TYPE_CYRIX_MediaGX:
3273 3461 break;
3274 3462 case X86_TYPE_CYRIX_MII:
3275 3463 case X86_TYPE_VIA_CYRIX_III:
3276 3464 mask_edx =
3277 3465 CPUID_INTC_EDX_DE |
3278 3466 CPUID_INTC_EDX_TSC |
3279 3467 CPUID_INTC_EDX_MSR |
3280 3468 CPUID_INTC_EDX_CX8 |
3281 3469 CPUID_INTC_EDX_PGE |
3282 3470 CPUID_INTC_EDX_CMOV |
3283 3471 CPUID_INTC_EDX_MMX;
3284 3472 break;
3285 3473 default:
3286 3474 break;
3287 3475 }
3288 3476 break;
3289 3477 }
3290 3478
3291 3479 #if defined(__xpv)
3292 3480 /*
3293 3481 * Do not support MONITOR/MWAIT under a hypervisor
3294 3482 */
3295 3483 mask_ecx &= ~CPUID_INTC_ECX_MON;
3296 3484 /*
3297 3485 * Do not support XSAVE under a hypervisor for now
3298 3486 */
3299 3487 xsave_force_disable = B_TRUE;
3300 3488
3301 3489 #endif /* __xpv */
3302 3490
3303 3491 if (xsave_force_disable) {
3304 3492 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3305 3493 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3306 3494 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3307 3495 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3308 3496 }
3309 3497
3310 3498 /*
3311 3499 * Now we've figured out the masks that determine
3312 3500 * which bits we choose to believe, apply the masks
3313 3501 * to the feature words, then map the kernel's view
3314 3502 * of these feature words into its feature word.
3315 3503 */
3316 3504 cp->cp_edx &= mask_edx;
3317 3505 cp->cp_ecx &= mask_ecx;
3318 3506
3319 3507 /*
3320 3508 * apply any platform restrictions (we don't call this
3321 3509 * immediately after __cpuid_insn here, because we need the
3322 3510 * workarounds applied above first)
3323 3511 */
3324 3512 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3325 3513
3326 3514 /*
3327 3515 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3328 3516 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3329 3517 */
3330 3518 if (cpi->cpi_maxeax >= 7) {
3331 3519 struct cpuid_regs *ecp;
3332 3520 ecp = &cpi->cpi_std[7];
3333 3521 ecp->cp_eax = 7;
3334 3522 ecp->cp_ecx = 0;
3335 3523 (void) __cpuid_insn(ecp);
3336 3524
3337 3525 /*
3338 3526 * If XSAVE has been disabled, just ignore all of the
3339 3527 * extended-save-area dependent flags here.
3340 3528 */
3341 3529 if (xsave_force_disable) {
3342 3530 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3343 3531 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3344 3532 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3345 3533 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3346 3534 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3347 3535 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3348 3536 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3349 3537 }
3350 3538
3351 3539 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3352 3540 add_x86_feature(featureset, X86FSET_SMEP);
3353 3541
3354 3542 /*
3355 3543 * We check disable_smap here in addition to in startup_smap()
3356 3544 * to ensure CPUs that aren't the boot CPU don't accidentally
3357 3545 * include it in the feature set and thus generate a mismatched
3358 3546 * x86 feature set across CPUs.
3359 3547 */
3360 3548 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3361 3549 disable_smap == 0)
3362 3550 add_x86_feature(featureset, X86FSET_SMAP);
3363 3551
3364 3552 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3365 3553 add_x86_feature(featureset, X86FSET_RDSEED);
3366 3554
3367 3555 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3368 3556 add_x86_feature(featureset, X86FSET_ADX);
3369 3557
3370 3558 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3371 3559 add_x86_feature(featureset, X86FSET_FSGSBASE);
3372 3560
3373 3561 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3374 3562 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3375 3563
3376 3564 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3377 3565 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3378 3566 add_x86_feature(featureset, X86FSET_INVPCID);
3379 3567
3380 3568 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3381 3569 add_x86_feature(featureset, X86FSET_MPX);
3382 3570
3383 3571 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3384 3572 add_x86_feature(featureset, X86FSET_CLWB);
3385 3573 }
3386 3574 }
3387 3575
3388 3576 /*
3389 3577 * fold in overrides from the "eeprom" mechanism
3390 3578 */
3391 3579 cp->cp_edx |= cpuid_feature_edx_include;
3392 3580 cp->cp_edx &= ~cpuid_feature_edx_exclude;
3393 3581
3394 3582 cp->cp_ecx |= cpuid_feature_ecx_include;
3395 3583 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3396 3584
3397 3585 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3398 3586 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3399 3587 }
3400 3588 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3401 3589 add_x86_feature(featureset, X86FSET_TSC);
3402 3590 }
3403 3591 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3404 3592 add_x86_feature(featureset, X86FSET_MSR);
3405 3593 }
3406 3594 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3407 3595 add_x86_feature(featureset, X86FSET_MTRR);
3408 3596 }
3409 3597 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3410 3598 add_x86_feature(featureset, X86FSET_PGE);
3411 3599 }
3412 3600 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3413 3601 add_x86_feature(featureset, X86FSET_CMOV);
3414 3602 }
3415 3603 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3416 3604 add_x86_feature(featureset, X86FSET_MMX);
3417 3605 }
3418 3606 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3419 3607 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3420 3608 add_x86_feature(featureset, X86FSET_MCA);
3421 3609 }
3422 3610 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3423 3611 add_x86_feature(featureset, X86FSET_PAE);
3424 3612 }
3425 3613 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3426 3614 add_x86_feature(featureset, X86FSET_CX8);
3427 3615 }
3428 3616 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3429 3617 add_x86_feature(featureset, X86FSET_CX16);
3430 3618 }
3431 3619 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3432 3620 add_x86_feature(featureset, X86FSET_PAT);
3433 3621 }
3434 3622 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3435 3623 add_x86_feature(featureset, X86FSET_SEP);
3436 3624 }
3437 3625 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3438 3626 /*
3439 3627 * In our implementation, fxsave/fxrstor
3440 3628 * are prerequisites before we'll even
3441 3629 * try and do SSE things.
3442 3630 */
3443 3631 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3444 3632 add_x86_feature(featureset, X86FSET_SSE);
3445 3633 }
3446 3634 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3447 3635 add_x86_feature(featureset, X86FSET_SSE2);
3448 3636 }
3449 3637 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3450 3638 add_x86_feature(featureset, X86FSET_SSE3);
3451 3639 }
3452 3640 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3453 3641 add_x86_feature(featureset, X86FSET_SSSE3);
3454 3642 }
3455 3643 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3456 3644 add_x86_feature(featureset, X86FSET_SSE4_1);
3457 3645 }
3458 3646 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3459 3647 add_x86_feature(featureset, X86FSET_SSE4_2);
3460 3648 }
3461 3649 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3462 3650 add_x86_feature(featureset, X86FSET_AES);
3463 3651 }
3464 3652 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3465 3653 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3466 3654 }
3467 3655
3468 3656 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3469 3657 add_x86_feature(featureset, X86FSET_SHA);
3470 3658
3471 3659 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3472 3660 add_x86_feature(featureset, X86FSET_UMIP);
3473 3661 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3474 3662 add_x86_feature(featureset, X86FSET_PKU);
3475 3663 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3476 3664 add_x86_feature(featureset, X86FSET_OSPKE);
3477 3665
3478 3666 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3479 3667 add_x86_feature(featureset, X86FSET_XSAVE);
3480 3668
3481 3669 /* We only test AVX & AVX512 when there is XSAVE */
3482 3670
3483 3671 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3484 3672 add_x86_feature(featureset,
3485 3673 X86FSET_AVX);
3486 3674
3487 3675 /*
3488 3676 * Intel says we can't check these without also
3489 3677 * checking AVX.
3490 3678 */
3491 3679 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3492 3680 add_x86_feature(featureset,
3493 3681 X86FSET_F16C);
3494 3682
3495 3683 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3496 3684 add_x86_feature(featureset,
3497 3685 X86FSET_FMA);
3498 3686
3499 3687 if (cpi->cpi_std[7].cp_ebx &
3500 3688 CPUID_INTC_EBX_7_0_BMI1)
3501 3689 add_x86_feature(featureset,
3502 3690 X86FSET_BMI1);
3503 3691
3504 3692 if (cpi->cpi_std[7].cp_ebx &
3505 3693 CPUID_INTC_EBX_7_0_BMI2)
3506 3694 add_x86_feature(featureset,
3507 3695 X86FSET_BMI2);
3508 3696
3509 3697 if (cpi->cpi_std[7].cp_ebx &
3510 3698 CPUID_INTC_EBX_7_0_AVX2)
3511 3699 add_x86_feature(featureset,
3512 3700 X86FSET_AVX2);
3513 3701 }
3514 3702
3515 3703 if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3516 3704 (cpi->cpi_std[7].cp_ebx &
3517 3705 CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3518 3706 add_x86_feature(featureset, X86FSET_AVX512F);
3519 3707
3520 3708 if (cpi->cpi_std[7].cp_ebx &
3521 3709 CPUID_INTC_EBX_7_0_AVX512DQ)
3522 3710 add_x86_feature(featureset,
3523 3711 X86FSET_AVX512DQ);
3524 3712 if (cpi->cpi_std[7].cp_ebx &
3525 3713 CPUID_INTC_EBX_7_0_AVX512IFMA)
3526 3714 add_x86_feature(featureset,
3527 3715 X86FSET_AVX512FMA);
3528 3716 if (cpi->cpi_std[7].cp_ebx &
3529 3717 CPUID_INTC_EBX_7_0_AVX512PF)
3530 3718 add_x86_feature(featureset,
3531 3719 X86FSET_AVX512PF);
3532 3720 if (cpi->cpi_std[7].cp_ebx &
3533 3721 CPUID_INTC_EBX_7_0_AVX512ER)
3534 3722 add_x86_feature(featureset,
3535 3723 X86FSET_AVX512ER);
3536 3724 if (cpi->cpi_std[7].cp_ebx &
3537 3725 CPUID_INTC_EBX_7_0_AVX512CD)
3538 3726 add_x86_feature(featureset,
3539 3727 X86FSET_AVX512CD);
3540 3728 if (cpi->cpi_std[7].cp_ebx &
3541 3729 CPUID_INTC_EBX_7_0_AVX512BW)
3542 3730 add_x86_feature(featureset,
3543 3731 X86FSET_AVX512BW);
3544 3732 if (cpi->cpi_std[7].cp_ebx &
3545 3733 CPUID_INTC_EBX_7_0_AVX512VL)
3546 3734 add_x86_feature(featureset,
3547 3735 X86FSET_AVX512VL);
3548 3736
3549 3737 if (cpi->cpi_std[7].cp_ecx &
3550 3738 CPUID_INTC_ECX_7_0_AVX512VBMI)
3551 3739 add_x86_feature(featureset,
3552 3740 X86FSET_AVX512VBMI);
3553 3741 if (cpi->cpi_std[7].cp_ecx &
3554 3742 CPUID_INTC_ECX_7_0_AVX512VNNI)
3555 3743 add_x86_feature(featureset,
3556 3744 X86FSET_AVX512VNNI);
3557 3745 if (cpi->cpi_std[7].cp_ecx &
3558 3746 CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3559 3747 add_x86_feature(featureset,
3560 3748 X86FSET_AVX512VPOPCDQ);
3561 3749
3562 3750 if (cpi->cpi_std[7].cp_edx &
3563 3751 CPUID_INTC_EDX_7_0_AVX5124NNIW)
3564 3752 add_x86_feature(featureset,
3565 3753 X86FSET_AVX512NNIW);
3566 3754 if (cpi->cpi_std[7].cp_edx &
3567 3755 CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3568 3756 add_x86_feature(featureset,
3569 3757 X86FSET_AVX512FMAPS);
3570 3758 }
3571 3759 }
3572 3760 }
3573 3761
3574 3762 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3575 3763 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3576 3764 add_x86_feature(featureset, X86FSET_PCID);
3577 3765 }
3578 3766 }
3579 3767
3580 3768 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3581 3769 add_x86_feature(featureset, X86FSET_X2APIC);
3582 3770 }
3583 3771 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3584 3772 add_x86_feature(featureset, X86FSET_DE);
3585 3773 }
3586 3774 #if !defined(__xpv)
3587 3775 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3588 3776
3589 3777 /*
3590 3778 * We require the CLFLUSH instruction for erratum workaround
3591 3779 * to use MONITOR/MWAIT.
3592 3780 */
3593 3781 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3594 3782 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3595 3783 add_x86_feature(featureset, X86FSET_MWAIT);
3596 3784 } else {
3597 3785 extern int idle_cpu_assert_cflush_monitor;
3598 3786
3599 3787 /*
3600 3788 * All processors we are aware of which have
3601 3789 * MONITOR/MWAIT also have CLFLUSH.
3602 3790 */
3603 3791 if (idle_cpu_assert_cflush_monitor) {
3604 3792 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3605 3793 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3606 3794 }
3607 3795 }
3608 3796 }
3609 3797 #endif /* __xpv */
3610 3798
3611 3799 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3612 3800 add_x86_feature(featureset, X86FSET_VMX);
3613 3801 }
3614 3802
3615 3803 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3616 3804 add_x86_feature(featureset, X86FSET_RDRAND);
3617 3805
3618 3806 /*
3619 3807 	 * Only needed the first time; the rest of the cpus follow suit.
3620 3808 	 * We only capture this for the boot cpu.
3621 3809 */
3622 3810 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3623 3811 add_x86_feature(featureset, X86FSET_CLFSH);
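		/*
		 * CPUID.1 %ebx bits 15:8 give the CLFLUSH line size in
		 * 8-byte units; the common value of 8 yields 64 bytes.
		 */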
3624 3812 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3625 3813 }
3626 3814 if (is_x86_feature(featureset, X86FSET_PAE))
3627 3815 cpi->cpi_pabits = 36;
3628 3816
3629 3817 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3630 3818 struct cpuid_regs r, *ecp;
3631 3819
3632 3820 ecp = &r;
3633 3821 ecp->cp_eax = 0xD;
3634 3822 ecp->cp_ecx = 1;
3635 3823 ecp->cp_edx = ecp->cp_ebx = 0;
3636 3824 (void) __cpuid_insn(ecp);
3637 3825
3638 3826 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3639 3827 add_x86_feature(featureset, X86FSET_XSAVEOPT);
3640 3828 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3641 3829 add_x86_feature(featureset, X86FSET_XSAVEC);
3642 3830 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3643 3831 add_x86_feature(featureset, X86FSET_XSAVES);
3644 3832 }
3645 3833
3646 3834 /*
3647 3835 * Work on the "extended" feature information, doing
3648 3836 * some basic initialization for cpuid_pass2()
3649 3837 */
3650 3838 xcpuid = 0;
3651 3839 switch (cpi->cpi_vendor) {
3652 3840 case X86_VENDOR_Intel:
3653 3841 /*
3654 3842 * On KVM we know we will have proper support for extended
3655 3843 * cpuid.
3656 3844 */
3657 3845 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3658 3846 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3659 3847 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3660 3848 xcpuid++;
3661 3849 break;
3662 3850 case X86_VENDOR_AMD:
3663 3851 if (cpi->cpi_family > 5 ||
3664 3852 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3665 3853 xcpuid++;
3666 3854 break;
3667 3855 case X86_VENDOR_Cyrix:
3668 3856 /*
3669 3857 * Only these Cyrix CPUs are -known- to support
3670 3858 * extended cpuid operations.
3671 3859 */
3672 3860 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3673 3861 x86_type == X86_TYPE_CYRIX_GXm)
3674 3862 xcpuid++;
3675 3863 break;
3676 3864 case X86_VENDOR_Centaur:
3677 3865 case X86_VENDOR_TM:
3678 3866 default:
3679 3867 xcpuid++;
3680 3868 break;
3681 3869 }
3682 3870
3683 3871 if (xcpuid) {
3684 3872 cp = &cpi->cpi_extd[0];
3685 3873 cp->cp_eax = CPUID_LEAF_EXT_0;
3686 3874 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3687 3875 }
3688 3876
3689 3877 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3690 3878
3691 3879 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3692 3880 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3693 3881
3694 3882 switch (cpi->cpi_vendor) {
3695 3883 case X86_VENDOR_Intel:
3696 3884 case X86_VENDOR_AMD:
3697 3885 if (cpi->cpi_xmaxeax < 0x80000001)
3698 3886 break;
3699 3887 cp = &cpi->cpi_extd[1];
3700 3888 cp->cp_eax = 0x80000001;
3701 3889 (void) __cpuid_insn(cp);
3702 3890
3703 3891 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3704 3892 cpi->cpi_family == 5 &&
3705 3893 cpi->cpi_model == 6 &&
3706 3894 cpi->cpi_step == 6) {
3707 3895 /*
3708 3896 * K6 model 6 uses bit 10 to indicate SYSC
3709 3897 * Later models use bit 11. Fix it here.
3710 3898 */
3711 3899 if (cp->cp_edx & 0x400) {
3712 3900 cp->cp_edx &= ~0x400;
3713 3901 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3714 3902 }
3715 3903 }
3716 3904
3717 3905 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3718 3906
3719 3907 /*
3720 3908 * Compute the additions to the kernel's feature word.
3721 3909 */
3722 3910 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3723 3911 add_x86_feature(featureset, X86FSET_NX);
3724 3912 }
3725 3913
3726 3914 /*
3727 3915 			 * Regardless of whether or not we boot 64-bit,
3728 3916 * we should have a way to identify whether
3729 3917 * the CPU is capable of running 64-bit.
3730 3918 */
3731 3919 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3732 3920 add_x86_feature(featureset, X86FSET_64);
3733 3921 }
3734 3922
3735 3923 /* 1 GB large page - enable only for 64 bit kernel */
3736 3924 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3737 3925 add_x86_feature(featureset, X86FSET_1GPG);
3738 3926 }
3739 3927
3740 3928 if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3741 3929 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3742 3930 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3743 3931 add_x86_feature(featureset, X86FSET_SSE4A);
3744 3932 }
3745 3933
3746 3934 /*
3747 3935 * It's really tricky to support syscall/sysret in
3748 3936 * the i386 kernel; we rely on sysenter/sysexit
3749 3937 * instead. In the amd64 kernel, things are -way-
3750 3938 * better.
3751 3939 */
3752 3940 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3753 3941 add_x86_feature(featureset, X86FSET_ASYSC);
3754 3942 }
3755 3943
3756 3944 /*
3757 3945 * While we're thinking about system calls, note
3758 3946 * that AMD processors don't support sysenter
3759 3947 * in long mode at all, so don't try to program them.
3760 3948 */
3761 3949 if (x86_vendor == X86_VENDOR_AMD) {
3762 3950 remove_x86_feature(featureset, X86FSET_SEP);
3763 3951 }
3764 3952
3765 3953 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3766 3954 add_x86_feature(featureset, X86FSET_TSCP);
3767 3955 }
3768 3956
3769 3957 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3770 3958 add_x86_feature(featureset, X86FSET_SVM);
3771 3959 }
3772 3960
3773 3961 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3774 3962 add_x86_feature(featureset, X86FSET_TOPOEXT);
3775 3963 }
3776 3964
3777 3965 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3778 3966 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3779 3967 }
3780 3968
3781 3969 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3782 3970 add_x86_feature(featureset, X86FSET_XOP);
3783 3971 }
3784 3972
3785 3973 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3786 3974 add_x86_feature(featureset, X86FSET_FMA4);
3787 3975 }
3788 3976
3789 3977 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3790 3978 add_x86_feature(featureset, X86FSET_TBM);
3791 3979 }
3792 3980
3793 3981 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3794 3982 add_x86_feature(featureset, X86FSET_MONITORX);
3795 3983 }
3796 3984 break;
3797 3985 default:
3798 3986 break;
3799 3987 }
3800 3988
3801 3989 /*
3802 3990 * Get CPUID data about processor cores and hyperthreads.
3803 3991 */
3804 3992 switch (cpi->cpi_vendor) {
3805 3993 case X86_VENDOR_Intel:
3806 3994 if (cpi->cpi_maxeax >= 4) {
3807 3995 cp = &cpi->cpi_std[4];
3808 3996 cp->cp_eax = 4;
3809 3997 cp->cp_ecx = 0;
3810 3998 (void) __cpuid_insn(cp);
3811 3999 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3812 4000 }
3813 4001 /*FALLTHROUGH*/
3814 4002 case X86_VENDOR_AMD:
3815 4003 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3816 4004 break;
3817 4005 cp = &cpi->cpi_extd[8];
3818 4006 cp->cp_eax = CPUID_LEAF_EXT_8;
3819 4007 (void) __cpuid_insn(cp);
3820 4008 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3821 4009 cp);
3822 4010
3823 4011 /*
3824 4012 * AMD uses ebx for some extended functions.
3825 4013 */
3826 4014 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3827 4015 /*
3828 4016 * While we're here, check for the AMD "Error
3829 4017 * Pointer Zero/Restore" feature. This can be
3830 4018 * used to set up the FP save handlers
3831 4019 * appropriately.
3832 4020 */
3833 4021 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3834 4022 cpi->cpi_fp_amd_save = 0;
3835 4023 } else {
3836 4024 cpi->cpi_fp_amd_save = 1;
3837 4025 }
3838 4026
3839 4027 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3840 4028 add_x86_feature(featureset,
3841 4029 X86FSET_CLZERO);
3842 4030 }
3843 4031 }
3844 4032
3845 4033 /*
3846 4034 * Virtual and physical address limits from
3847 4035 * cpuid override previously guessed values.
3848 4036 */
3849 4037 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3850 4038 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3851 4039 break;
3852 4040 default:
3853 4041 break;
3854 4042 }
3855 4043
3856 4044 /*
3857 4045 * Get CPUID data about TSC Invariance in Deep C-State.
3858 4046 */
3859 4047 switch (cpi->cpi_vendor) {
3860 4048 case X86_VENDOR_Intel:
3861 4049 case X86_VENDOR_AMD:
3862 4050 if (cpi->cpi_maxeax >= 7) {
3863 4051 cp = &cpi->cpi_extd[7];
3864 4052 cp->cp_eax = 0x80000007;
3865 4053 cp->cp_ecx = 0;
3866 4054 (void) __cpuid_insn(cp);
3867 4055 }
3868 4056 break;
3869 4057 default:
3870 4058 break;
3871 4059 }
3872 4060 }
3873 4061
3874 4062 cpuid_pass1_topology(cpu, featureset);
3875 4063 cpuid_pass1_thermal(cpu, featureset);
3876 4064
3877 4065 /*
3878 4066 * Synthesize chip "revision" and socket type
3879 4067 */
3880 4068 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3881 4069 cpi->cpi_model, cpi->cpi_step);
3882 4070 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3883 4071 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3884 4072 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3885 4073 cpi->cpi_model, cpi->cpi_step);
3886 4074
3887 4075 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3888 4076 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3889 4077 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3890 4078 /* Special handling for AMD FP not necessary. */
3891 4079 cpi->cpi_fp_amd_save = 0;
3892 4080 } else {
3893 4081 cpi->cpi_fp_amd_save = 1;
3894 4082 }
3895 4083 }
3896 4084
3897 4085 /*
3898 4086 * Check the processor leaves that are used for security features.
3899 4087 */
3900 4088 cpuid_scan_security(cpu, featureset);
3901 4089
3902 4090 pass1_done:
3903 4091 cpi->cpi_pass = 1;
3904 4092 }
3905 4093
3906 4094 /*
3907 4095 * Make copies of the cpuid table entries we depend on, in
3908 4096 * part for ease of parsing now, in part so that we have only
3909 4097 * one place to correct any of it, in part for ease of
3910 4098 * later export to userland, and in part so we can look at
3911 4099 * this stuff in a crash dump.
3912 4100 */
3913 4101
3914 4102 /*ARGSUSED*/
3915 4103 void
3916 4104 cpuid_pass2(cpu_t *cpu)
3917 4105 {
3918 4106 uint_t n, nmax;
3919 4107 int i;
3920 4108 struct cpuid_regs *cp;
3921 4109 uint8_t *dp;
3922 4110 uint32_t *iptr;
3923 4111 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3924 4112
3925 4113 ASSERT(cpi->cpi_pass == 1);
3926 4114
3927 4115 if (cpi->cpi_maxeax < 1)
3928 4116 goto pass2_done;
3929 4117
3930 4118 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3931 4119 nmax = NMAX_CPI_STD;
3932 4120 /*
3933 4121 * (We already handled n == 0 and n == 1 in pass 1)
3934 4122 */
3935 4123 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3936 4124 /*
3937 4125 * leaves 6 and 7 were handled in pass 1
3938 4126 */
3939 4127 if (n == 6 || n == 7)
3940 4128 continue;
3941 4129
3942 4130 cp->cp_eax = n;
3943 4131
3944 4132 /*
3945 4133 * CPUID function 4 expects %ecx to be initialized
3946 4134 * with an index which indicates which cache to return
3947 4135 * information about. The OS is expected to call function 4
3948 4136 * with %ecx set to 0, 1, 2, ... until it returns with
3949 4137 * EAX[4:0] set to 0, which indicates there are no more
3950 4138 * caches.
3951 4139 *
3952 4140 * Here, populate cpi_std[4] with the information returned by
3953 4141 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3954 4142 * when dynamic memory allocation becomes available.
3955 4143 *
3956 4144 * Note: we need to explicitly initialize %ecx here, since
3957 4145 * function 4 may have been previously invoked.
3958 4146 */
3959 4147 if (n == 4)
3960 4148 cp->cp_ecx = 0;
3961 4149
3962 4150 (void) __cpuid_insn(cp);
3963 4151 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3964 4152 switch (n) {
3965 4153 case 2:
3966 4154 /*
3967 4155 * "the lower 8 bits of the %eax register
3968 4156 * contain a value that identifies the number
3969 4157 * of times the cpuid [instruction] has to be
3970 4158 * executed to obtain a complete image of the
3971 4159 * processor's caching systems."
3972 4160 *
3973 4161 * How *do* they make this stuff up?
3974 4162 */
3975 4163 cpi->cpi_ncache = sizeof (*cp) *
3976 4164 BITX(cp->cp_eax, 7, 0);
3977 4165 if (cpi->cpi_ncache == 0)
3978 4166 break;
3979 4167 cpi->cpi_ncache--; /* skip count byte */
3980 4168
3981 4169 /*
3982 4170 * Well, for now, rather than attempt to implement
3983 4171 * this slightly dubious algorithm, we just look
3984 4172 * at the first 15 ..
3985 4173 */
3986 4174 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3987 4175 cpi->cpi_ncache = sizeof (*cp) - 1;
3988 4176
3989 4177 dp = cpi->cpi_cacheinfo;
3990 4178 if (BITX(cp->cp_eax, 31, 31) == 0) {
3991 4179 uint8_t *p = (void *)&cp->cp_eax;
3992 4180 for (i = 1; i < 4; i++)
3993 4181 if (p[i] != 0)
3994 4182 *dp++ = p[i];
3995 4183 }
3996 4184 if (BITX(cp->cp_ebx, 31, 31) == 0) {
3997 4185 uint8_t *p = (void *)&cp->cp_ebx;
3998 4186 for (i = 0; i < 4; i++)
3999 4187 if (p[i] != 0)
4000 4188 *dp++ = p[i];
4001 4189 }
4002 4190 if (BITX(cp->cp_ecx, 31, 31) == 0) {
4003 4191 uint8_t *p = (void *)&cp->cp_ecx;
4004 4192 for (i = 0; i < 4; i++)
4005 4193 if (p[i] != 0)
4006 4194 *dp++ = p[i];
4007 4195 }
4008 4196 if (BITX(cp->cp_edx, 31, 31) == 0) {
4009 4197 uint8_t *p = (void *)&cp->cp_edx;
4010 4198 for (i = 0; i < 4; i++)
4011 4199 if (p[i] != 0)
4012 4200 *dp++ = p[i];
4013 4201 }
4014 4202 break;
4015 4203
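For context on the leaf 2 parsing above: each of %eax, %ebx, %ecx and %edx carries four one-byte cache/TLB descriptors, a register's contents are only valid when its bit 31 is clear, and the low byte of %eax is an iteration count rather than a descriptor, which is why the %eax scan starts at byte 1. A hedged user-space sketch of the same harvest, using the GCC/Clang <cpuid.h> helper __get_cpuid() in place of the kernel's __cpuid_insn() and, like the code above, only looking at the first invocation:

    #include <stdio.h>
    #include <stdint.h>
    #include <cpuid.h>      /* __get_cpuid(), GCC/Clang only */

    int
    main(void)
    {
            unsigned int r[4];      /* eax, ebx, ecx, edx */
            uint8_t desc[15];
            int n = 0;

            if (!__get_cpuid(2, &r[0], &r[1], &r[2], &r[3]))
                    return (1);     /* leaf 2 not supported */

            for (int i = 0; i < 4; i++) {
                    const uint8_t *p = (const uint8_t *)&r[i];

                    if (r[i] & 0x80000000u) /* bit 31 set: register invalid */
                            continue;
                    /* byte 0 of %eax is the repeat count, not a descriptor */
                    for (int j = (i == 0) ? 1 : 0; j < 4; j++)
                            if (p[j] != 0 && n < 15)
                                    desc[n++] = p[j];
            }
            for (int i = 0; i < n; i++)
                    (void) printf("descriptor 0x%02x\n", desc[i]);
            return (0);
    }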
4016 4204 case 3: /* Processor serial number, if PSN supported */
4017 4205 break;
4018 4206
4019 4207 case 4: /* Deterministic cache parameters */
4020 4208 break;
4021 4209
4022 4210 case 5: /* Monitor/Mwait parameters */
4023 4211 {
4024 4212 size_t mwait_size;
4025 4213
4026 4214 /*
4027 4215 * check cpi_mwait.support which was set in cpuid_pass1
4028 4216 */
4029 4217 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4030 4218 break;
4031 4219
4032 4220 /*
4033 4221 * Protect ourselves from an insane mwait line size.
4034 4222 * Workaround for incomplete hardware emulator(s).
4035 4223 */
4036 4224 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4037 4225 if (mwait_size < sizeof (uint32_t) ||
4038 4226 !ISP2(mwait_size)) {
4039 4227 #if DEBUG
4040 4228 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4041 4229 "size %ld", cpu->cpu_id, (long)mwait_size);
4042 4230 #endif
4043 4231 break;
4044 4232 }
4045 4233
4046 4234 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4047 4235 cpi->cpi_mwait.mon_max = mwait_size;
4048 4236 if (MWAIT_EXTENSION(cpi)) {
4049 4237 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4050 4238 if (MWAIT_INT_ENABLE(cpi))
4051 4239 cpi->cpi_mwait.support |=
4052 4240 MWAIT_ECX_INT_ENABLE;
4053 4241 }
4054 4242 break;
4055 4243 }
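Leaf 5 reports the smallest and largest monitor-line sizes in the low 16 bits of %eax and %ebx, and %ecx bits 0 and 1 advertise the extension and interrupt-break-event capabilities that the MWAIT_EXTENSION()/MWAIT_INT_ENABLE() macros test (field layout per the Intel SDM). A minimal user-space sketch, assuming the GCC/Clang <cpuid.h> helper:

    #include <stdio.h>
    #include <cpuid.h>

    int
    main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx))
                    return (1);     /* leaf 5 not available */

            (void) printf("smallest monitor line: %u bytes\n", eax & 0xffff);
            (void) printf("largest monitor line:  %u bytes\n", ebx & 0xffff);
            (void) printf("extensions supported:  %s\n",
                (ecx & 0x1) ? "yes" : "no");
            (void) printf("interrupt break-event: %s\n",
                (ecx & 0x2) ? "yes" : "no");
            return (0);
    }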
4056 4244 default:
4057 4245 break;
4058 4246 }
4059 4247 }
4060 4248
4061 4249 /*
4062 4250 * XSAVE enumeration
4063 4251 */
4064 4252 if (cpi->cpi_maxeax >= 0xD) {
4065 4253 struct cpuid_regs regs;
4066 4254 boolean_t cpuid_d_valid = B_TRUE;
4067 4255
4068 4256 cp = &regs;
4069 4257 cp->cp_eax = 0xD;
4070 4258 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4071 4259
4072 4260 (void) __cpuid_insn(cp);
4073 4261
4074 4262 /*
4075 4263 * Sanity checks for debug
4076 4264 */
4077 4265 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4078 4266 (cp->cp_eax & XFEATURE_SSE) == 0) {
4079 4267 cpuid_d_valid = B_FALSE;
4080 4268 }
4081 4269
4082 4270 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4083 4271 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4084 4272 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4085 4273
4086 4274 /*
4087 4275 * If the hw supports AVX, get the size and offset in the save
4088 4276 * area for the ymm state.
4089 4277 */
4090 4278 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4091 4279 cp->cp_eax = 0xD;
4092 4280 cp->cp_ecx = 2;
4093 4281 cp->cp_edx = cp->cp_ebx = 0;
4094 4282
4095 4283 (void) __cpuid_insn(cp);
4096 4284
4097 4285 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4098 4286 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4099 4287 cpuid_d_valid = B_FALSE;
4100 4288 }
4101 4289
4102 4290 cpi->cpi_xsave.ymm_size = cp->cp_eax;
4103 4291 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4104 4292 }
4105 4293
4106 4294 /*
4107 4295 * If the hw supports MPX, get the size and offset in the
4108 4296 * save area for BNDREGS and BNDCSR.
4109 4297 */
4110 4298 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4111 4299 cp->cp_eax = 0xD;
4112 4300 cp->cp_ecx = 3;
4113 4301 cp->cp_edx = cp->cp_ebx = 0;
4114 4302
4115 4303 (void) __cpuid_insn(cp);
4116 4304
4117 4305 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4118 4306 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4119 4307
4120 4308 cp->cp_eax = 0xD;
4121 4309 cp->cp_ecx = 4;
4122 4310 cp->cp_edx = cp->cp_ebx = 0;
4123 4311
4124 4312 (void) __cpuid_insn(cp);
4125 4313
4126 4314 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4127 4315 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4128 4316 }
4129 4317
4130 4318 /*
4131 4319 * If the hw supports AVX512, get the size and offset in the
4132 4320 * save area for the opmask registers and zmm state.
4133 4321 */
4134 4322 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4135 4323 cp->cp_eax = 0xD;
4136 4324 cp->cp_ecx = 5;
4137 4325 cp->cp_edx = cp->cp_ebx = 0;
4138 4326
4139 4327 (void) __cpuid_insn(cp);
4140 4328
4141 4329 cpi->cpi_xsave.opmask_size = cp->cp_eax;
4142 4330 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4143 4331
4144 4332 cp->cp_eax = 0xD;
4145 4333 cp->cp_ecx = 6;
4146 4334 cp->cp_edx = cp->cp_ebx = 0;
4147 4335
4148 4336 (void) __cpuid_insn(cp);
4149 4337
4150 4338 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4151 4339 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4152 4340
4153 4341 cp->cp_eax = 0xD;
4154 4342 cp->cp_ecx = 7;
4155 4343 cp->cp_edx = cp->cp_ebx = 0;
4156 4344
4157 4345 (void) __cpuid_insn(cp);
4158 4346
4159 4347 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4160 4348 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4161 4349 }
4162 4350
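The pattern above repeats for each XSAVE-managed state component: sub-leaf 0 of leaf 0xD returns the supported state-component bitmap in %eax/%edx and the maximum save-area size in %ecx, and each numbered sub-leaf returns that component's size in %eax and its offset within the non-compacted save area in %ebx. A hedged user-space sketch that walks the same sub-leaves, assuming the GCC/Clang helper __get_cpuid_count() from <cpuid.h>:

    #include <stdio.h>
    #include <cpuid.h>

    int
    main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            unsigned long long mask;

            /* Sub-leaf 0: supported state-component bitmap and max size. */
            if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx))
                    return (1);
            mask = ((unsigned long long)edx << 32) | eax;
            (void) printf("XCR0-supported mask: %#llx, max size %u\n",
                mask, ecx);

            /* Numbered sub-leaves: per-component size and offset. */
            for (unsigned int i = 2; i < 63; i++) {
                    if (!(mask & (1ull << i)))
                            continue;
                    (void) __get_cpuid_count(0xd, i, &eax, &ebx, &ecx, &edx);
                    (void) printf("component %2u: size %u, offset %u\n",
                        i, eax, ebx);
            }
            return (0);
    }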
4163 4351 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4164 4352 xsave_state_size = 0;
4165 4353 } else if (cpuid_d_valid) {
4166 4354 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4167 4355 } else {
4168 4356 /* Broken CPUID 0xD, probably in HVM */
4169 4357 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4170 4358 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4171 4359 ", ymm_size = %d, ymm_offset = %d\n",
4172 4360 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4173 4361 cpi->cpi_xsave.xsav_hw_features_high,
4174 4362 (int)cpi->cpi_xsave.xsav_max_size,
4175 4363 (int)cpi->cpi_xsave.ymm_size,
4176 4364 (int)cpi->cpi_xsave.ymm_offset);
4177 4365
4178 4366 if (xsave_state_size != 0) {
4179 4367 /*
4180 4368 * This must be a non-boot CPU. We cannot
4181 4369 * continue, because boot cpu has already
4182 4370 * enabled XSAVE.
4183 4371 */
4184 4372 ASSERT(cpu->cpu_id != 0);
4185 4373 cmn_err(CE_PANIC, "cpu%d: we have already "
4186 4374 "enabled XSAVE on boot cpu, cannot "
4187 4375 "continue.", cpu->cpu_id);
4188 4376 } else {
4189 4377 /*
4190 4378 * If we reached here on the boot CPU, it's also
4191 4379 * almost certain that we'll reach here on the
4192 4380 * non-boot CPUs. When we're here on a boot CPU
4193 4381 * we should disable the feature; on a non-boot
4194 4382 * CPU we need to confirm that we have.
4195 4383 */
4196 4384 if (cpu->cpu_id == 0) {
4197 4385 remove_x86_feature(x86_featureset,
4198 4386 X86FSET_XSAVE);
4199 4387 remove_x86_feature(x86_featureset,
4200 4388 X86FSET_AVX);
4201 4389 remove_x86_feature(x86_featureset,
4202 4390 X86FSET_F16C);
4203 4391 remove_x86_feature(x86_featureset,
4204 4392 X86FSET_BMI1);
4205 4393 remove_x86_feature(x86_featureset,
4206 4394 X86FSET_BMI2);
4207 4395 remove_x86_feature(x86_featureset,
4208 4396 X86FSET_FMA);
4209 4397 remove_x86_feature(x86_featureset,
4210 4398 X86FSET_AVX2);
4211 4399 remove_x86_feature(x86_featureset,
4212 4400 X86FSET_MPX);
4213 4401 remove_x86_feature(x86_featureset,
4214 4402 X86FSET_AVX512F);
4215 4403 remove_x86_feature(x86_featureset,
4216 4404 X86FSET_AVX512DQ);
4217 4405 remove_x86_feature(x86_featureset,
4218 4406 X86FSET_AVX512PF);
4219 4407 remove_x86_feature(x86_featureset,
4220 4408 X86FSET_AVX512ER);
4221 4409 remove_x86_feature(x86_featureset,
4222 4410 X86FSET_AVX512CD);
4223 4411 remove_x86_feature(x86_featureset,
4224 4412 X86FSET_AVX512BW);
4225 4413 remove_x86_feature(x86_featureset,
4226 4414 X86FSET_AVX512VL);
4227 4415 remove_x86_feature(x86_featureset,
4228 4416 X86FSET_AVX512FMA);
4229 4417 remove_x86_feature(x86_featureset,
4230 4418 X86FSET_AVX512VBMI);
4231 4419 remove_x86_feature(x86_featureset,
4232 4420 X86FSET_AVX512VNNI);
4233 4421 remove_x86_feature(x86_featureset,
4234 4422 X86FSET_AVX512VPOPCDQ);
4235 4423 remove_x86_feature(x86_featureset,
4236 4424 X86FSET_AVX512NNIW);
4237 4425 remove_x86_feature(x86_featureset,
4238 4426 X86FSET_AVX512FMAPS);
4239 4427
4240 4428 CPI_FEATURES_ECX(cpi) &=
4241 4429 ~CPUID_INTC_ECX_XSAVE;
4242 4430 CPI_FEATURES_ECX(cpi) &=
4243 4431 ~CPUID_INTC_ECX_AVX;
4244 4432 CPI_FEATURES_ECX(cpi) &=
4245 4433 ~CPUID_INTC_ECX_F16C;
4246 4434 CPI_FEATURES_ECX(cpi) &=
4247 4435 ~CPUID_INTC_ECX_FMA;
4248 4436 CPI_FEATURES_7_0_EBX(cpi) &=
4249 4437 ~CPUID_INTC_EBX_7_0_BMI1;
4250 4438 CPI_FEATURES_7_0_EBX(cpi) &=
4251 4439 ~CPUID_INTC_EBX_7_0_BMI2;
4252 4440 CPI_FEATURES_7_0_EBX(cpi) &=
4253 4441 ~CPUID_INTC_EBX_7_0_AVX2;
4254 4442 CPI_FEATURES_7_0_EBX(cpi) &=
4255 4443 ~CPUID_INTC_EBX_7_0_MPX;
4256 4444 CPI_FEATURES_7_0_EBX(cpi) &=
4257 4445 ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4258 4446
4259 4447 CPI_FEATURES_7_0_ECX(cpi) &=
4260 4448 ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4261 4449
4262 4450 CPI_FEATURES_7_0_EDX(cpi) &=
4263 4451 ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4264 4452
4265 4453 xsave_force_disable = B_TRUE;
4266 4454 } else {
4267 4455 VERIFY(is_x86_feature(x86_featureset,
4268 4456 X86FSET_XSAVE) == B_FALSE);
4269 4457 }
4270 4458 }
4271 4459 }
4272 4460 }
4273 4461
4274 4462
4275 4463 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4276 4464 goto pass2_done;
4277 4465
4278 4466 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4279 4467 nmax = NMAX_CPI_EXTD;
4280 4468 /*
4281 4469 * Copy the extended properties, fixing them as we go.
4282 4470 * (We already handled n == 0 and n == 1 in pass 1)
4283 4471 */
4284 4472 iptr = (void *)cpi->cpi_brandstr;
4285 4473 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4286 4474 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4287 4475 (void) __cpuid_insn(cp);
4288 4476 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4289 4477 cp);
4290 4478 switch (n) {
4291 4479 case 2:
4292 4480 case 3:
4293 4481 case 4:
4294 4482 /*
4295 4483 * Extract the brand string
4296 4484 */
4297 4485 *iptr++ = cp->cp_eax;
4298 4486 *iptr++ = cp->cp_ebx;
4299 4487 *iptr++ = cp->cp_ecx;
4300 4488 *iptr++ = cp->cp_edx;
4301 4489 break;
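Leaves 0x80000002 through 0x80000004 return twelve 32-bit values that, laid out in %eax/%ebx/%ecx/%edx order, form the 48-byte, NUL-padded brand string the loop copies through iptr. A small user-space sketch of the same assembly, assuming the extended leaves exist and using the GCC/Clang <cpuid.h> helper:

    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h>

    int
    main(void)
    {
            unsigned int r[12];
            char brand[49];

            for (unsigned int i = 0; i < 3; i++) {
                    if (!__get_cpuid(0x80000002 + i, &r[i * 4 + 0],
                        &r[i * 4 + 1], &r[i * 4 + 2], &r[i * 4 + 3]))
                            return (1);     /* leaf not implemented */
            }
            (void) memcpy(brand, r, sizeof (r));    /* 48 bytes */
            brand[48] = '\0';
            (void) printf("brand string: \"%s\"\n", brand);
            return (0);
    }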
4302 4490 case 5:
4303 4491 switch (cpi->cpi_vendor) {
4304 4492 case X86_VENDOR_AMD:
4305 4493 /*
4306 4494 * The Athlon and Duron were the first
4307 4495 * parts to report the sizes of the
4308 4496 * TLB for large pages. Before then,
4309 4497 * we don't trust the data.
4310 4498 */
4311 4499 if (cpi->cpi_family < 6 ||
4312 4500 (cpi->cpi_family == 6 &&
4313 4501 cpi->cpi_model < 1))
4314 4502 cp->cp_eax = 0;
4315 4503 break;
4316 4504 default:
4317 4505 break;
4318 4506 }
4319 4507 break;
4320 4508 case 6:
4321 4509 switch (cpi->cpi_vendor) {
4322 4510 case X86_VENDOR_AMD:
4323 4511 /*
4324 4512 * The Athlon and Duron were the first
4325 4513 * AMD parts with L2 TLB's.
4326 4514 * Before then, don't trust the data.
4327 4515 */
4328 4516 if (cpi->cpi_family < 6 ||
4329 4517 cpi->cpi_family == 6 &&
4330 4518 cpi->cpi_model < 1)
4331 4519 cp->cp_eax = cp->cp_ebx = 0;
4332 4520 /*
4333 4521 * AMD Duron rev A0 reports L2
4334 4522 * cache size incorrectly as 1K
4335 4523 * when it is really 64K
4336 4524 */
4337 4525 if (cpi->cpi_family == 6 &&
4338 4526 cpi->cpi_model == 3 &&
4339 4527 cpi->cpi_step == 0) {
4340 4528 cp->cp_ecx &= 0xffff;
4341 4529 cp->cp_ecx |= 0x400000;
4342 4530 }
4343 4531 break;
4344 4532 case X86_VENDOR_Cyrix: /* VIA C3 */
4345 4533 /*
4346 4534 * VIA C3 processors are a bit messed
4347 4535 * up w.r.t. encoding cache sizes in %ecx
4348 4536 */
4349 4537 if (cpi->cpi_family != 6)
4350 4538 break;
4351 4539 /*
4352 4540 * models 7 and 8 were incorrectly encoded
4353 4541 *
4354 4542 * xxx is model 8 really broken?
4355 4543 */
4356 4544 if (cpi->cpi_model == 7 ||
4357 4545 cpi->cpi_model == 8)
4358 4546 cp->cp_ecx =
4359 4547 BITX(cp->cp_ecx, 31, 24) << 16 |
4360 4548 BITX(cp->cp_ecx, 23, 16) << 12 |
4361 4549 BITX(cp->cp_ecx, 15, 8) << 8 |
4362 4550 BITX(cp->cp_ecx, 7, 0);
4363 4551 /*
4364 4552 * model 9 stepping 1 has wrong associativity
4365 4553 */
4366 4554 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4367 4555 cp->cp_ecx |= 8 << 12;
4368 4556 break;
4369 4557 case X86_VENDOR_Intel:
4370 4558 /*
4371 4559 * Extended L2 Cache features function.
4372 4560 * First appeared on Prescott.
4373 4561 */
4374 4562 default:
4375 4563 break;
4376 4564 }
4377 4565 break;
4378 4566 default:
4379 4567 break;
4380 4568 }
4381 4569 }
4382 4570
4383 4571 pass2_done:
4384 4572 cpi->cpi_pass = 2;
4385 4573 }
4386 4574
4387 4575 static const char *
4388 4576 intel_cpubrand(const struct cpuid_info *cpi)
4389 4577 {
4390 4578 int i;
4391 4579
4392 4580 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4393 4581 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4394 4582 return ("i486");
4395 4583
4396 4584 switch (cpi->cpi_family) {
4397 4585 case 5:
4398 4586 return ("Intel Pentium(r)");
4399 4587 case 6:
4400 4588 switch (cpi->cpi_model) {
4401 4589 uint_t celeron, xeon;
4402 4590 const struct cpuid_regs *cp;
4403 4591 case 0:
4404 4592 case 1:
4405 4593 case 2:
4406 4594 return ("Intel Pentium(r) Pro");
4407 4595 case 3:
4408 4596 case 4:
4409 4597 return ("Intel Pentium(r) II");
4410 4598 case 6:
4411 4599 return ("Intel Celeron(r)");
4412 4600 case 5:
4413 4601 case 7:
4414 4602 celeron = xeon = 0;
4415 4603 cp = &cpi->cpi_std[2]; /* cache info */
4416 4604
4417 4605 for (i = 1; i < 4; i++) {
4418 4606 uint_t tmp;
4419 4607
4420 4608 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4421 4609 if (tmp == 0x40)
4422 4610 celeron++;
4423 4611 if (tmp >= 0x44 && tmp <= 0x45)
4424 4612 xeon++;
4425 4613 }
4426 4614
4427 4615 for (i = 0; i < 2; i++) {
4428 4616 uint_t tmp;
4429 4617
4430 4618 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4431 4619 if (tmp == 0x40)
4432 4620 celeron++;
4433 4621 else if (tmp >= 0x44 && tmp <= 0x45)
4434 4622 xeon++;
4435 4623 }
4436 4624
4437 4625 for (i = 0; i < 4; i++) {
4438 4626 uint_t tmp;
4439 4627
4440 4628 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4441 4629 if (tmp == 0x40)
4442 4630 celeron++;
4443 4631 else if (tmp >= 0x44 && tmp <= 0x45)
4444 4632 xeon++;
4445 4633 }
4446 4634
4447 4635 for (i = 0; i < 4; i++) {
4448 4636 uint_t tmp;
4449 4637
4450 4638 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4451 4639 if (tmp == 0x40)
4452 4640 celeron++;
4453 4641 else if (tmp >= 0x44 && tmp <= 0x45)
4454 4642 xeon++;
4455 4643 }
4456 4644
4457 4645 if (celeron)
4458 4646 return ("Intel Celeron(r)");
4459 4647 if (xeon)
4460 4648 return (cpi->cpi_model == 5 ?
4461 4649 "Intel Pentium(r) II Xeon(tm)" :
4462 4650 "Intel Pentium(r) III Xeon(tm)");
4463 4651 return (cpi->cpi_model == 5 ?
4464 4652 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4465 4653 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4466 4654 default:
4467 4655 break;
4468 4656 }
4469 4657 default:
4470 4658 break;
4471 4659 }
4472 4660
4473 4661 /* BrandID is present if the field is nonzero */
4474 4662 if (cpi->cpi_brandid != 0) {
4475 4663 static const struct {
4476 4664 uint_t bt_bid;
4477 4665 const char *bt_str;
4478 4666 } brand_tbl[] = {
4479 4667 { 0x1, "Intel(r) Celeron(r)" },
4480 4668 { 0x2, "Intel(r) Pentium(r) III" },
4481 4669 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
4482 4670 { 0x4, "Intel(r) Pentium(r) III" },
4483 4671 { 0x6, "Mobile Intel(r) Pentium(r) III" },
4484 4672 { 0x7, "Mobile Intel(r) Celeron(r)" },
4485 4673 { 0x8, "Intel(r) Pentium(r) 4" },
4486 4674 { 0x9, "Intel(r) Pentium(r) 4" },
4487 4675 { 0xa, "Intel(r) Celeron(r)" },
4488 4676 { 0xb, "Intel(r) Xeon(tm)" },
4489 4677 { 0xc, "Intel(r) Xeon(tm) MP" },
4490 4678 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
4491 4679 { 0xf, "Mobile Intel(r) Celeron(r)" },
4492 4680 { 0x11, "Mobile Genuine Intel(r)" },
4493 4681 { 0x12, "Intel(r) Celeron(r) M" },
4494 4682 { 0x13, "Mobile Intel(r) Celeron(r)" },
4495 4683 { 0x14, "Intel(r) Celeron(r)" },
4496 4684 { 0x15, "Mobile Genuine Intel(r)" },
4497 4685 { 0x16, "Intel(r) Pentium(r) M" },
4498 4686 { 0x17, "Mobile Intel(r) Celeron(r)" }
4499 4687 };
4500 4688 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4501 4689 uint_t sgn;
4502 4690
4503 4691 sgn = (cpi->cpi_family << 8) |
4504 4692 (cpi->cpi_model << 4) | cpi->cpi_step;
4505 4693
4506 4694 for (i = 0; i < btblmax; i++)
4507 4695 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4508 4696 break;
4509 4697 if (i < btblmax) {
4510 4698 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4511 4699 return ("Intel(r) Celeron(r)");
4512 4700 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4513 4701 return ("Intel(r) Xeon(tm) MP");
4514 4702 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4515 4703 return ("Intel(r) Xeon(tm)");
4516 4704 return (brand_tbl[i].bt_str);
4517 4705 }
4518 4706 }
4519 4707
4520 4708 return (NULL);
4521 4709 }
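The fallback table above is keyed on the brand id alone; the packed family/model/stepping signature is only consulted to catch a few parts that share a brand id with something newer. A short worked example of the signature encoding (the specific values are just the ones the special cases above test for):

    #include <stdio.h>

    /* Assemble the family/model/stepping signature used by the table above. */
    static unsigned int
    brand_signature(unsigned int family, unsigned int model, unsigned int step)
    {
            return ((family << 8) | (model << 4) | step);
    }

    int
    main(void)
    {
            /*
             * Family 6, model 0xb, stepping 1 yields 0x6b1: with brand id 3,
             * the code above overrides the table entry and reports a Celeron.
             */
            (void) printf("%#x\n", brand_signature(6, 0xb, 1));
            return (0);
    }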
4522 4710
4523 4711 static const char *
4524 4712 amd_cpubrand(const struct cpuid_info *cpi)
4525 4713 {
4526 4714 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4527 4715 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4528 4716 return ("i486 compatible");
4529 4717
4530 4718 switch (cpi->cpi_family) {
4531 4719 case 5:
4532 4720 switch (cpi->cpi_model) {
4533 4721 case 0:
4534 4722 case 1:
4535 4723 case 2:
4536 4724 case 3:
4537 4725 case 4:
4538 4726 case 5:
4539 4727 return ("AMD-K5(r)");
4540 4728 case 6:
4541 4729 case 7:
4542 4730 return ("AMD-K6(r)");
4543 4731 case 8:
4544 4732 return ("AMD-K6(r)-2");
4545 4733 case 9:
4546 4734 return ("AMD-K6(r)-III");
4547 4735 default:
4548 4736 return ("AMD (family 5)");
4549 4737 }
4550 4738 case 6:
4551 4739 switch (cpi->cpi_model) {
4552 4740 case 1:
4553 4741 return ("AMD-K7(tm)");
4554 4742 case 0:
4555 4743 case 2:
4556 4744 case 4:
4557 4745 return ("AMD Athlon(tm)");
4558 4746 case 3:
4559 4747 case 7:
4560 4748 return ("AMD Duron(tm)");
4561 4749 case 6:
4562 4750 case 8:
4563 4751 case 10:
4564 4752 /*
4565 4753 * Use the L2 cache size to distinguish
4566 4754 */
4567 4755 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4568 4756 "AMD Athlon(tm)" : "AMD Duron(tm)");
4569 4757 default:
4570 4758 return ("AMD (family 6)");
4571 4759 }
4572 4760 default:
4573 4761 break;
4574 4762 }
4575 4763
4576 4764 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4577 4765 cpi->cpi_brandid != 0) {
4578 4766 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4579 4767 case 3:
4580 4768 return ("AMD Opteron(tm) UP 1xx");
4581 4769 case 4:
4582 4770 return ("AMD Opteron(tm) DP 2xx");
4583 4771 case 5:
4584 4772 return ("AMD Opteron(tm) MP 8xx");
4585 4773 default:
4586 4774 return ("AMD Opteron(tm)");
4587 4775 }
4588 4776 }
4589 4777
4590 4778 return (NULL);
4591 4779 }
4592 4780
4593 4781 static const char *
4594 4782 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4595 4783 {
4596 4784 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4597 4785 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4598 4786 type == X86_TYPE_CYRIX_486)
4599 4787 return ("i486 compatible");
4600 4788
4601 4789 switch (type) {
4602 4790 case X86_TYPE_CYRIX_6x86:
4603 4791 return ("Cyrix 6x86");
4604 4792 case X86_TYPE_CYRIX_6x86L:
4605 4793 return ("Cyrix 6x86L");
4606 4794 case X86_TYPE_CYRIX_6x86MX:
4607 4795 return ("Cyrix 6x86MX");
4608 4796 case X86_TYPE_CYRIX_GXm:
4609 4797 return ("Cyrix GXm");
4610 4798 case X86_TYPE_CYRIX_MediaGX:
4611 4799 return ("Cyrix MediaGX");
4612 4800 case X86_TYPE_CYRIX_MII:
4613 4801 return ("Cyrix M2");
4614 4802 case X86_TYPE_VIA_CYRIX_III:
4615 4803 return ("VIA Cyrix M3");
4616 4804 default:
4617 4805 /*
4618 4806 * Have another wild guess ..
4619 4807 */
4620 4808 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4621 4809 return ("Cyrix 5x86");
4622 4810 else if (cpi->cpi_family == 5) {
4623 4811 switch (cpi->cpi_model) {
4624 4812 case 2:
4625 4813 return ("Cyrix 6x86"); /* Cyrix M1 */
4626 4814 case 4:
4627 4815 return ("Cyrix MediaGX");
4628 4816 default:
4629 4817 break;
4630 4818 }
4631 4819 } else if (cpi->cpi_family == 6) {
4632 4820 switch (cpi->cpi_model) {
4633 4821 case 0:
4634 4822 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4635 4823 case 5:
4636 4824 case 6:
4637 4825 case 7:
4638 4826 case 8:
4639 4827 case 9:
4640 4828 return ("VIA C3");
4641 4829 default:
4642 4830 break;
4643 4831 }
4644 4832 }
4645 4833 break;
4646 4834 }
4647 4835 return (NULL);
4648 4836 }
4649 4837
4650 4838 /*
4651 4839 * This only gets called in the case that the CPU extended
4652 4840 * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
4653 4841 * aren't available, or contain null bytes for some reason.
4654 4842 */
4655 4843 static void
4656 4844 fabricate_brandstr(struct cpuid_info *cpi)
4657 4845 {
4658 4846 const char *brand = NULL;
4659 4847
4660 4848 switch (cpi->cpi_vendor) {
4661 4849 case X86_VENDOR_Intel:
4662 4850 brand = intel_cpubrand(cpi);
4663 4851 break;
4664 4852 case X86_VENDOR_AMD:
4665 4853 brand = amd_cpubrand(cpi);
4666 4854 break;
4667 4855 case X86_VENDOR_Cyrix:
4668 4856 brand = cyrix_cpubrand(cpi, x86_type);
4669 4857 break;
4670 4858 case X86_VENDOR_NexGen:
4671 4859 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4672 4860 brand = "NexGen Nx586";
4673 4861 break;
4674 4862 case X86_VENDOR_Centaur:
4675 4863 if (cpi->cpi_family == 5)
4676 4864 switch (cpi->cpi_model) {
4677 4865 case 4:
4678 4866 brand = "Centaur C6";
4679 4867 break;
4680 4868 case 8:
4681 4869 brand = "Centaur C2";
4682 4870 break;
4683 4871 case 9:
4684 4872 brand = "Centaur C3";
4685 4873 break;
4686 4874 default:
4687 4875 break;
4688 4876 }
4689 4877 break;
4690 4878 case X86_VENDOR_Rise:
4691 4879 if (cpi->cpi_family == 5 &&
4692 4880 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4693 4881 brand = "Rise mP6";
4694 4882 break;
4695 4883 case X86_VENDOR_SiS:
4696 4884 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4697 4885 brand = "SiS 55x";
4698 4886 break;
4699 4887 case X86_VENDOR_TM:
4700 4888 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4701 4889 brand = "Transmeta Crusoe TM3x00 or TM5x00";
4702 4890 break;
4703 4891 case X86_VENDOR_NSC:
4704 4892 case X86_VENDOR_UMC:
4705 4893 default:
4706 4894 break;
4707 4895 }
4708 4896 if (brand) {
4709 4897 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4710 4898 return;
4711 4899 }
4712 4900
4713 4901 /*
4714 4902 * If all else fails ...
4715 4903 */
4716 4904 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4717 4905 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4718 4906 cpi->cpi_model, cpi->cpi_step);
4719 4907 }
4720 4908
4721 4909 /*
4722 4910 * This routine is called just after kernel memory allocation
4723 4911 * becomes available on cpu0, and as part of mp_startup() on
4724 4912 * the other cpus.
4725 4913 *
4726 4914 * Fixup the brand string, and collect any information from cpuid
4727 4915 * that requires dynamically allocated storage to represent.
4728 4916 */
4729 4917 /*ARGSUSED*/
4730 4918 void
4731 4919 cpuid_pass3(cpu_t *cpu)
4732 4920 {
4733 4921 int i, max, shft, level, size;
4734 4922 struct cpuid_regs regs;
4735 4923 struct cpuid_regs *cp;
4736 4924 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4737 4925
4738 4926 ASSERT(cpi->cpi_pass == 2);
4739 4927
4740 4928 /*
4741 4929 * Deterministic cache parameters
4742 4930 *
4743 4931 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4744 4932 * values that are present are currently defined to be the same. This
4745 4933 * means we can use the same logic to parse it as long as we use the
4746 4934 * appropriate leaf to get the data. If you're updating this, make sure
4747 4935 * you're careful about which vendor supports which aspect.
4748 4936 *
4749 4937 * Take this opportunity to detect the number of threads sharing the
4750 4938 * last level cache, and construct a corresponding cache id. The
4751 4939 * respective cpuid_info members are initialized to the default case of
4752 4940 * "no last level cache sharing".
4753 4941 */
4754 4942 cpi->cpi_ncpu_shr_last_cache = 1;
4755 4943 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4756 4944
4757 4945 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4758 4946 (cpi->cpi_vendor == X86_VENDOR_AMD &&
4759 4947 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4760 4948 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4761 4949 uint32_t leaf;
4762 4950
4763 4951 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4764 4952 leaf = 4;
4765 4953 } else {
4766 4954 leaf = CPUID_LEAF_EXT_1d;
4767 4955 }
4768 4956
4769 4957 /*
4770 4958 * Find the # of elements (size) returned by the leaf and along
4771 4959 * the way detect last level cache sharing details.
4772 4960 */
4773 4961 bzero(&regs, sizeof (regs));
4774 4962 cp = &regs;
4775 4963 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4776 4964 cp->cp_eax = leaf;
4777 4965 cp->cp_ecx = i;
4778 4966
4779 4967 (void) __cpuid_insn(cp);
4780 4968
4781 4969 if (CPI_CACHE_TYPE(cp) == 0)
4782 4970 break;
4783 4971 level = CPI_CACHE_LVL(cp);
4784 4972 if (level > max) {
4785 4973 max = level;
4786 4974 cpi->cpi_ncpu_shr_last_cache =
4787 4975 CPI_NTHR_SHR_CACHE(cp) + 1;
4788 4976 }
4789 4977 }
4790 4978 cpi->cpi_cache_leaf_size = size = i;
4791 4979
4792 4980 /*
4793 4981 * Allocate the cpi_cache_leaves array. The first element
4794 4982 * references the regs for the corresponding leaf with %ecx set
4795 4983 * to 0. This was gathered in cpuid_pass2().
4796 4984 */
4797 4985 if (size > 0) {
4798 4986 cpi->cpi_cache_leaves =
4799 4987 kmem_alloc(size * sizeof (cp), KM_SLEEP);
4800 4988 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4801 4989 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4802 4990 } else {
4803 4991 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4804 4992 }
4805 4993
4806 4994 /*
4807 4995 * Allocate storage to hold the additional regs
4808 4996 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4809 4997 *
4810 4998 * The regs for the leaf, %ecx == 0 has already
4811 4999 * been allocated as indicated above.
4812 5000 */
4813 5001 for (i = 1; i < size; i++) {
4814 5002 cp = cpi->cpi_cache_leaves[i] =
4815 5003 kmem_zalloc(sizeof (regs), KM_SLEEP);
4816 5004 cp->cp_eax = leaf;
4817 5005 cp->cp_ecx = i;
4818 5006
4819 5007 (void) __cpuid_insn(cp);
4820 5008 }
4821 5009 }
4822 5010 /*
4823 5011 * Determine the number of bits needed to represent
4824 5012 * the number of CPUs sharing the last level cache.
4825 5013 *
4826 5014 * Shift off that number of bits from the APIC id to
4827 5015 * derive the cache id.
4828 5016 */
4829 5017 shft = 0;
4830 5018 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4831 5019 shft++;
4832 5020 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4833 5021 }
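The walk above issues leaf 4 (Intel) or leaf 0x8000001d (AMD with TOPOEXT) with %ecx = 0, 1, 2, ... until the cache-type field in EAX[4:0] reads zero, remembering the thread-share count of the deepest cache level seen; that count is then rounded up to a number of APIC-id bits which are shifted off to form the last-level cache id. A hedged user-space sketch of the Intel (leaf 4) case, assuming the GCC/Clang __get_cpuid_count() helper and the EAX field layout from the Intel SDM:

    #include <stdio.h>
    #include <cpuid.h>

    int
    main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            unsigned int max_level = 0, nshr = 1, shft = 0;

            for (unsigned int i = 0; i < 32; i++) {
                    if (!__get_cpuid_count(4, i, &eax, &ebx, &ecx, &edx))
                            return (1);             /* leaf 4 not supported */
                    if ((eax & 0x1f) == 0)          /* cache type 0: no more */
                            break;
                    unsigned int level = (eax >> 5) & 0x7;
                    if (level > max_level) {
                            max_level = level;
                            /* EAX[25:14]: max threads sharing, minus one */
                            nshr = ((eax >> 14) & 0xfff) + 1;
                    }
            }

            /* Number of APIC-id bits consumed by the sharing threads. */
            for (unsigned int n = 1; n < nshr; n <<= 1)
                    shft++;

            (void) printf("deepest cache level %u, shared by up to %u "
                "threads (%u APIC-id bits)\n", max_level, nshr, shft);
            return (0);
    }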
4834 5022
4835 5023 /*
4836 5024 * Now fixup the brand string
4837 5025 */
4838 5026 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4839 5027 fabricate_brandstr(cpi);
4840 5028 } else {
4841 5029
4842 5030 /*
4843 5031 * If we successfully extracted a brand string from the cpuid
4844 5032 * instruction, clean it up by removing leading spaces and
4845 5033 * similar junk.
4846 5034 */
4847 5035 if (cpi->cpi_brandstr[0]) {
4848 5036 size_t maxlen = sizeof (cpi->cpi_brandstr);
4849 5037 char *src, *dst;
4850 5038
4851 5039 dst = src = (char *)cpi->cpi_brandstr;
4852 5040 src[maxlen - 1] = '\0';
4853 5041 /*
4854 5042 * strip leading spaces
4855 5043 */
4856 5044 while (*src == ' ')
4857 5045 src++;
4858 5046 /*
4859 5047 * Remove any 'Genuine' or "Authentic" prefixes
4860 5048 */
4861 5049 if (strncmp(src, "Genuine ", 8) == 0)
4862 5050 src += 8;
4863 5051 if (strncmp(src, "Authentic ", 10) == 0)
4864 5052 src += 10;
4865 5053
4866 5054 /*
4867 5055 * Now do an in-place copy.
4868 5056 * Map (R) to (r) and (TM) to (tm).
4869 5057 * The era of teletypes is long gone, and there's
4870 5058 * -really- no need to shout.
4871 5059 */
4872 5060 while (*src != '\0') {
4873 5061 if (src[0] == '(') {
4874 5062 if (strncmp(src + 1, "R)", 2) == 0) {
4875 5063 (void) strncpy(dst, "(r)", 3);
4876 5064 src += 3;
4877 5065 dst += 3;
4878 5066 continue;
4879 5067 }
4880 5068 if (strncmp(src + 1, "TM)", 3) == 0) {
4881 5069 (void) strncpy(dst, "(tm)", 4);
4882 5070 src += 4;
4883 5071 dst += 4;
4884 5072 continue;
4885 5073 }
4886 5074 }
4887 5075 *dst++ = *src++;
4888 5076 }
4889 5077 *dst = '\0';
4890 5078
4891 5079 /*
4892 5080 * Finally, remove any trailing spaces
4893 5081 */
4894 5082 while (--dst > cpi->cpi_brandstr)
4895 5083 if (*dst == ' ')
4896 5084 *dst = '\0';
4897 5085 else
4898 5086 break;
4899 5087 } else
4900 5088 fabricate_brandstr(cpi);
4901 5089 }
4902 5090 cpi->cpi_pass = 3;
4903 5091 }
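The cleanup at the end of cpuid_pass3() is easiest to see on an example: leading blanks and a "Genuine " or "Authentic " prefix are dropped, "(R)" and "(TM)" are lowered, and trailing blanks are trimmed. A stand-alone sketch of the same normalization (illustrative only, not the kernel routine, and the sample input is hypothetical):

    #include <stdio.h>
    #include <string.h>

    /* Normalize a CPU brand string the same way cpuid_pass3() does. */
    static void
    clean_brandstr(char *s)
    {
            char *src = s, *dst = s;

            while (*src == ' ')                     /* leading spaces */
                    src++;
            if (strncmp(src, "Genuine ", 8) == 0)
                    src += 8;
            if (strncmp(src, "Authentic ", 10) == 0)
                    src += 10;
            while (*src != '\0') {
                    if (strncmp(src, "(R)", 3) == 0) {
                            (void) memcpy(dst, "(r)", 3);
                            src += 3; dst += 3;
                            continue;
                    }
                    if (strncmp(src, "(TM)", 4) == 0) {
                            (void) memcpy(dst, "(tm)", 4);
                            src += 4; dst += 4;
                            continue;
                    }
                    *dst++ = *src++;
            }
            *dst = '\0';
            while (dst > s && *--dst == ' ')        /* trailing spaces */
                    *dst = '\0';
    }

    int
    main(void)
    {
            char b[] = "  Genuine Intel(R) Xeon(R) CPU   ";

            clean_brandstr(b);
            (void) printf("\"%s\"\n", b);   /* "Intel(r) Xeon(r) CPU" */
            return (0);
    }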
4904 5092
4905 5093 /*
4906 5094 * This routine is called out of bind_hwcap() much later in the life
4907 5095 * of the kernel (post_startup()). The job of this routine is to resolve
4908 5096 * the hardware feature support and kernel support for those features into
4909 5097 * what we're actually going to tell applications via the aux vector.
4910 5098 */
4911 5099 void
4912 5100 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4913 5101 {
4914 5102 struct cpuid_info *cpi;
4915 5103 uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4916 5104
4917 5105 if (cpu == NULL)
4918 5106 cpu = CPU;
4919 5107 cpi = cpu->cpu_m.mcpu_cpi;
4920 5108
4921 5109 ASSERT(cpi->cpi_pass == 3);
4922 5110
4923 5111 if (cpi->cpi_maxeax >= 1) {
4924 5112 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4925 5113 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4926 5114 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4927 5115
4928 5116 *edx = CPI_FEATURES_EDX(cpi);
4929 5117 *ecx = CPI_FEATURES_ECX(cpi);
4930 5118 *ebx = CPI_FEATURES_7_0_EBX(cpi);
4931 5119
4932 5120 /*
4933 5121 * [these require explicit kernel support]
4934 5122 */
4935 5123 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4936 5124 *edx &= ~CPUID_INTC_EDX_SEP;
4937 5125
4938 5126 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4939 5127 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4940 5128 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4941 5129 *edx &= ~CPUID_INTC_EDX_SSE2;
4942 5130
4943 5131 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4944 5132 *edx &= ~CPUID_INTC_EDX_HTT;
4945 5133
4946 5134 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4947 5135 *ecx &= ~CPUID_INTC_ECX_SSE3;
4948 5136
4949 5137 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4950 5138 *ecx &= ~CPUID_INTC_ECX_SSSE3;
4951 5139 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4952 5140 *ecx &= ~CPUID_INTC_ECX_SSE4_1;
4953 5141 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4954 5142 *ecx &= ~CPUID_INTC_ECX_SSE4_2;
4955 5143 if (!is_x86_feature(x86_featureset, X86FSET_AES))
4956 5144 *ecx &= ~CPUID_INTC_ECX_AES;
4957 5145 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4958 5146 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4959 5147 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4960 5148 *ecx &= ~(CPUID_INTC_ECX_XSAVE |
4961 5149 CPUID_INTC_ECX_OSXSAVE);
4962 5150 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4963 5151 *ecx &= ~CPUID_INTC_ECX_AVX;
4964 5152 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4965 5153 *ecx &= ~CPUID_INTC_ECX_F16C;
4966 5154 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4967 5155 *ecx &= ~CPUID_INTC_ECX_FMA;
4968 5156 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4969 5157 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4970 5158 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4971 5159 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4972 5160 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4973 5161 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4974 5162 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4975 5163 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4976 5164 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4977 5165 *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4978 5166
4979 5167 /*
4980 5168 * [no explicit support required beyond x87 fp context]
4981 5169 */
4982 5170 if (!fpu_exists)
4983 5171 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4984 5172
4985 5173 /*
4986 5174 * Now map the supported feature vector to things that we
4987 5175 * think userland will care about.
4988 5176 */
4989 5177 if (*edx & CPUID_INTC_EDX_SEP)
4990 5178 hwcap_flags |= AV_386_SEP;
4991 5179 if (*edx & CPUID_INTC_EDX_SSE)
4992 5180 hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4993 5181 if (*edx & CPUID_INTC_EDX_SSE2)
4994 5182 hwcap_flags |= AV_386_SSE2;
4995 5183 if (*ecx & CPUID_INTC_ECX_SSE3)
4996 5184 hwcap_flags |= AV_386_SSE3;
4997 5185 if (*ecx & CPUID_INTC_ECX_SSSE3)
4998 5186 hwcap_flags |= AV_386_SSSE3;
4999 5187 if (*ecx & CPUID_INTC_ECX_SSE4_1)
5000 5188 hwcap_flags |= AV_386_SSE4_1;
5001 5189 if (*ecx & CPUID_INTC_ECX_SSE4_2)
5002 5190 hwcap_flags |= AV_386_SSE4_2;
5003 5191 if (*ecx & CPUID_INTC_ECX_MOVBE)
5004 5192 hwcap_flags |= AV_386_MOVBE;
5005 5193 if (*ecx & CPUID_INTC_ECX_AES)
5006 5194 hwcap_flags |= AV_386_AES;
5007 5195 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5008 5196 hwcap_flags |= AV_386_PCLMULQDQ;
5009 5197 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5010 5198 (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5011 5199 hwcap_flags |= AV_386_XSAVE;
5012 5200
5013 5201 if (*ecx & CPUID_INTC_ECX_AVX) {
5014 5202 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5015 5203 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5016 5204
5017 5205 hwcap_flags |= AV_386_AVX;
5018 5206 if (*ecx & CPUID_INTC_ECX_F16C)
5019 5207 hwcap_flags_2 |= AV_386_2_F16C;
5020 5208 if (*ecx & CPUID_INTC_ECX_FMA)
5021 5209 hwcap_flags_2 |= AV_386_2_FMA;
5022 5210
5023 5211 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5024 5212 hwcap_flags_2 |= AV_386_2_BMI1;
5025 5213 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5026 5214 hwcap_flags_2 |= AV_386_2_BMI2;
5027 5215 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5028 5216 hwcap_flags_2 |= AV_386_2_AVX2;
5029 5217 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5030 5218 hwcap_flags_2 |= AV_386_2_AVX512F;
5031 5219 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5032 5220 hwcap_flags_2 |= AV_386_2_AVX512DQ;
5033 5221 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5034 5222 hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5035 5223 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5036 5224 hwcap_flags_2 |= AV_386_2_AVX512PF;
5037 5225 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5038 5226 hwcap_flags_2 |= AV_386_2_AVX512ER;
5039 5227 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5040 5228 hwcap_flags_2 |= AV_386_2_AVX512CD;
5041 5229 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5042 5230 hwcap_flags_2 |= AV_386_2_AVX512BW;
5043 5231 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5044 5232 hwcap_flags_2 |= AV_386_2_AVX512VL;
5045 5233
5046 5234 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5047 5235 hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5048 5236 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5049 5237 hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5050 5238 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5051 5239 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5052 5240
5053 5241 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5054 5242 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5055 5243 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5056 5244 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5057 5245 }
5058 5246 }
5059 5247 if (*ecx & CPUID_INTC_ECX_VMX)
5060 5248 hwcap_flags |= AV_386_VMX;
5061 5249 if (*ecx & CPUID_INTC_ECX_POPCNT)
5062 5250 hwcap_flags |= AV_386_POPCNT;
5063 5251 if (*edx & CPUID_INTC_EDX_FPU)
5064 5252 hwcap_flags |= AV_386_FPU;
5065 5253 if (*edx & CPUID_INTC_EDX_MMX)
5066 5254 hwcap_flags |= AV_386_MMX;
5067 5255
5068 5256 if (*edx & CPUID_INTC_EDX_TSC)
5069 5257 hwcap_flags |= AV_386_TSC;
5070 5258 if (*edx & CPUID_INTC_EDX_CX8)
5071 5259 hwcap_flags |= AV_386_CX8;
5072 5260 if (*edx & CPUID_INTC_EDX_CMOV)
5073 5261 hwcap_flags |= AV_386_CMOV;
5074 5262 if (*ecx & CPUID_INTC_ECX_CX16)
5075 5263 hwcap_flags |= AV_386_CX16;
5076 5264
5077 5265 if (*ecx & CPUID_INTC_ECX_RDRAND)
5078 5266 hwcap_flags_2 |= AV_386_2_RDRAND;
5079 5267 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5080 5268 hwcap_flags_2 |= AV_386_2_ADX;
5081 5269 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5082 5270 hwcap_flags_2 |= AV_386_2_RDSEED;
5083 5271 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5084 5272 hwcap_flags_2 |= AV_386_2_SHA;
5085 5273 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5086 5274 hwcap_flags_2 |= AV_386_2_FSGSBASE;
5087 5275 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5088 5276 hwcap_flags_2 |= AV_386_2_CLWB;
5089 5277 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5090 5278 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5091 5279
5092 5280 }
5093 5281 /*
5094 5282 * Check a few miscellaneous features.
5095 5283 */
5096 5284 if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5097 5285 hwcap_flags_2 |= AV_386_2_CLZERO;
5098 5286
5099 5287 if (cpi->cpi_xmaxeax < 0x80000001)
5100 5288 goto pass4_done;
5101 5289
5102 5290 switch (cpi->cpi_vendor) {
5103 5291 struct cpuid_regs cp;
5104 5292 uint32_t *edx, *ecx;
5105 5293
5106 5294 case X86_VENDOR_Intel:
5107 5295 /*
5108 5296 * Seems like Intel duplicated what was necessary
5109 5297 * here to make the initial crop of 64-bit OS's work.
5110 5298 * Hopefully, those are the only "extended" bits
5111 5299 * they'll add.
5112 5300 */
5113 5301 /*FALLTHROUGH*/
5114 5302
5115 5303 case X86_VENDOR_AMD:
5116 5304 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5117 5305 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5118 5306
5119 5307 *edx = CPI_FEATURES_XTD_EDX(cpi);
5120 5308 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5121 5309
5122 5310 /*
5123 5311 * [these features require explicit kernel support]
5124 5312 */
5125 5313 switch (cpi->cpi_vendor) {
5126 5314 case X86_VENDOR_Intel:
5127 5315 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5128 5316 *edx &= ~CPUID_AMD_EDX_TSCP;
5129 5317 break;
5130 5318
5131 5319 case X86_VENDOR_AMD:
5132 5320 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5133 5321 *edx &= ~CPUID_AMD_EDX_TSCP;
5134 5322 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5135 5323 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5136 5324 break;
5137 5325
5138 5326 default:
5139 5327 break;
5140 5328 }
5141 5329
5142 5330 /*
5143 5331 * [no explicit support required beyond
5144 5332 * x87 fp context and exception handlers]
5145 5333 */
5146 5334 if (!fpu_exists)
5147 5335 *edx &= ~(CPUID_AMD_EDX_MMXamd |
5148 5336 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5149 5337
5150 5338 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5151 5339 *edx &= ~CPUID_AMD_EDX_NX;
5152 5340 #if !defined(__amd64)
5153 5341 *edx &= ~CPUID_AMD_EDX_LM;
5154 5342 #endif
5155 5343 /*
5156 5344 * Now map the supported feature vector to
5157 5345 * things that we think userland will care about.
5158 5346 */
5159 5347 #if defined(__amd64)
5160 5348 if (*edx & CPUID_AMD_EDX_SYSC)
5161 5349 hwcap_flags |= AV_386_AMD_SYSC;
5162 5350 #endif
5163 5351 if (*edx & CPUID_AMD_EDX_MMXamd)
5164 5352 hwcap_flags |= AV_386_AMD_MMX;
5165 5353 if (*edx & CPUID_AMD_EDX_3DNow)
5166 5354 hwcap_flags |= AV_386_AMD_3DNow;
5167 5355 if (*edx & CPUID_AMD_EDX_3DNowx)
5168 5356 hwcap_flags |= AV_386_AMD_3DNowx;
5169 5357 if (*ecx & CPUID_AMD_ECX_SVM)
5170 5358 hwcap_flags |= AV_386_AMD_SVM;
5171 5359
5172 5360 switch (cpi->cpi_vendor) {
5173 5361 case X86_VENDOR_AMD:
5174 5362 if (*edx & CPUID_AMD_EDX_TSCP)
5175 5363 hwcap_flags |= AV_386_TSCP;
5176 5364 if (*ecx & CPUID_AMD_ECX_AHF64)
5177 5365 hwcap_flags |= AV_386_AHF;
5178 5366 if (*ecx & CPUID_AMD_ECX_SSE4A)
5179 5367 hwcap_flags |= AV_386_AMD_SSE4A;
5180 5368 if (*ecx & CPUID_AMD_ECX_LZCNT)
5181 5369 hwcap_flags |= AV_386_AMD_LZCNT;
5182 5370 if (*ecx & CPUID_AMD_ECX_MONITORX)
5183 5371 hwcap_flags_2 |= AV_386_2_MONITORX;
5184 5372 break;
5185 5373
5186 5374 case X86_VENDOR_Intel:
5187 5375 if (*edx & CPUID_AMD_EDX_TSCP)
5188 5376 hwcap_flags |= AV_386_TSCP;
5189 5377 if (*ecx & CPUID_AMD_ECX_LZCNT)
5190 5378 hwcap_flags |= AV_386_AMD_LZCNT;
5191 5379 /*
5192 5380 * Aarrgh.
5193 5381 * Intel uses a different bit in the same word.
5194 5382 */
5195 5383 if (*ecx & CPUID_INTC_ECX_AHF64)
5196 5384 hwcap_flags |= AV_386_AHF;
5197 5385 break;
5198 5386
5199 5387 default:
5200 5388 break;
5201 5389 }
5202 5390 break;
5203 5391
5204 5392 case X86_VENDOR_TM:
5205 5393 cp.cp_eax = 0x80860001;
5206 5394 (void) __cpuid_insn(&cp);
5207 5395 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5208 5396 break;
5209 5397
5210 5398 default:
5211 5399 break;
5212 5400 }
5213 5401
5214 5402 pass4_done:
5215 5403 cpi->cpi_pass = 4;
5216 5404 if (hwcap_out != NULL) {
5217 5405 hwcap_out[0] = hwcap_flags;
5218 5406 hwcap_out[1] = hwcap_flags_2;
5219 5407 }
5220 5408 }
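Once pass 4 has folded the feature words into hwcap_flags and hwcap_flags_2, userland sees them as the AT_SUN_HWCAP and AT_SUN_HWCAP2 aux vector entries; on illumos the conventional way to query them is getisax(2). A small consumer sketch, assuming <sys/auxv.h> pulls in the x86 AV_386_* definitions on this platform:

    #include <stdio.h>
    #include <stdint.h>
    #include <sys/auxv.h>   /* getisax(); x86 header supplies AV_386_* */

    int
    main(void)
    {
            uint32_t hw[2] = { 0, 0 };

            (void) getisax(hw, 2);  /* hw[0] = HWCAP, hw[1] = HWCAP2 */

            (void) printf("SSE2:   %s\n",
                (hw[0] & AV_386_SSE2) ? "yes" : "no");
            (void) printf("AES:    %s\n",
                (hw[0] & AV_386_AES) ? "yes" : "no");
            (void) printf("RDRAND: %s\n",
                (hw[1] & AV_386_2_RDRAND) ? "yes" : "no");
            return (0);
    }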
5221 5409
5222 5410
5223 5411 /*
5224 5412 * Simulate the cpuid instruction using the data we previously
5225 5413 * captured about this CPU. We try our best to return the truth
5226 5414 * about the hardware, independently of kernel support.
5227 5415 */
5228 5416 uint32_t
5229 5417 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5230 5418 {
5231 5419 struct cpuid_info *cpi;
5232 5420 struct cpuid_regs *xcp;
5233 5421
5234 5422 if (cpu == NULL)
5235 5423 cpu = CPU;
5236 5424 cpi = cpu->cpu_m.mcpu_cpi;
5237 5425
5238 5426 ASSERT(cpuid_checkpass(cpu, 3));
5239 5427
5240 5428 /*
5241 5429 * CPUID data is cached in two separate places: cpi_std for standard
5242 5430 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5243 5431 */
5244 5432 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5245 5433 xcp = &cpi->cpi_std[cp->cp_eax];
5246 5434 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5247 5435 cp->cp_eax <= cpi->cpi_xmaxeax &&
5248 5436 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5249 5437 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5250 5438 } else {
5251 5439 /*
5252 5440 * The caller is asking for data from an input parameter which
5253 5441 * the kernel has not cached. In this case we go fetch from
5254 5442 * the hardware and return the data directly to the user.
5255 5443 */
5256 5444 return (__cpuid_insn(cp));
5257 5445 }
5258 5446
5259 5447 cp->cp_eax = xcp->cp_eax;
5260 5448 cp->cp_ebx = xcp->cp_ebx;
5261 5449 cp->cp_ecx = xcp->cp_ecx;
5262 5450 cp->cp_edx = xcp->cp_edx;
5263 5451 return (cp->cp_eax);
5264 5452 }
5265 5453
5266 5454 int
5267 5455 cpuid_checkpass(cpu_t *cpu, int pass)
5268 5456 {
5269 5457 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5270 5458 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5271 5459 }
5272 5460
5273 5461 int
5274 5462 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5275 5463 {
5276 5464 ASSERT(cpuid_checkpass(cpu, 3));
5277 5465
5278 5466 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5279 5467 }
5280 5468
5281 5469 int
5282 5470 cpuid_is_cmt(cpu_t *cpu)
5283 5471 {
5284 5472 if (cpu == NULL)
5285 5473 cpu = CPU;
5286 5474
5287 5475 ASSERT(cpuid_checkpass(cpu, 1));
5288 5476
5289 5477 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5290 5478 }
5291 5479
5292 5480 /*
5293 5481 * AMD and Intel both implement the 64-bit variant of the syscall
5294 5482 * instruction (syscallq), so if there's -any- support for syscall,
5295 5483 * cpuid currently says "yes, we support this".
5296 5484 *
5297 5485 * However, Intel decided to -not- implement the 32-bit variant of the
5298 5486 * syscall instruction, so we provide a predicate to allow our caller
5299 5487 * to test that subtlety here.
5300 5488 *
5301 5489 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5302 5490 * even in the case where the hardware would in fact support it.
5303 5491 */
5304 5492 /*ARGSUSED*/
5305 5493 int
5306 5494 cpuid_syscall32_insn(cpu_t *cpu)
5307 5495 {
5308 5496 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5309 5497
5310 5498 #if !defined(__xpv)
5311 5499 if (cpu == NULL)
5312 5500 cpu = CPU;
5313 5501
5314 5502 /*CSTYLED*/
5315 5503 {
5316 5504 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5317 5505
5318 5506 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5319 5507 cpi->cpi_xmaxeax >= 0x80000001 &&
5320 5508 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5321 5509 return (1);
5322 5510 }
5323 5511 #endif
5324 5512 return (0);
5325 5513 }
5326 5514
5327 5515 int
5328 5516 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5329 5517 {
5330 5518 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5331 5519
5332 5520 static const char fmt[] =
5333 5521 "x86 (%s %X family %d model %d step %d clock %d MHz)";
5334 5522 static const char fmt_ht[] =
5335 5523 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5336 5524
5337 5525 ASSERT(cpuid_checkpass(cpu, 1));
5338 5526
5339 5527 if (cpuid_is_cmt(cpu))
5340 5528 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5341 5529 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5342 5530 cpi->cpi_family, cpi->cpi_model,
5343 5531 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5344 5532 return (snprintf(s, n, fmt,
5345 5533 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5346 5534 cpi->cpi_family, cpi->cpi_model,
5347 5535 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5348 5536 }
5349 5537
5350 5538 const char *
5351 5539 cpuid_getvendorstr(cpu_t *cpu)
5352 5540 {
5353 5541 ASSERT(cpuid_checkpass(cpu, 1));
5354 5542 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5355 5543 }
5356 5544
5357 5545 uint_t
5358 5546 cpuid_getvendor(cpu_t *cpu)
5359 5547 {
5360 5548 ASSERT(cpuid_checkpass(cpu, 1));
5361 5549 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5362 5550 }
5363 5551
5364 5552 uint_t
5365 5553 cpuid_getfamily(cpu_t *cpu)
5366 5554 {
5367 5555 ASSERT(cpuid_checkpass(cpu, 1));
5368 5556 return (cpu->cpu_m.mcpu_cpi->cpi_family);
5369 5557 }
5370 5558
5371 5559 uint_t
5372 5560 cpuid_getmodel(cpu_t *cpu)
5373 5561 {
5374 5562 ASSERT(cpuid_checkpass(cpu, 1));
5375 5563 return (cpu->cpu_m.mcpu_cpi->cpi_model);
5376 5564 }
5377 5565
5378 5566 uint_t
5379 5567 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5380 5568 {
5381 5569 ASSERT(cpuid_checkpass(cpu, 1));
5382 5570 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5383 5571 }
5384 5572
5385 5573 uint_t
5386 5574 cpuid_get_ncore_per_chip(cpu_t *cpu)
5387 5575 {
5388 5576 ASSERT(cpuid_checkpass(cpu, 1));
5389 5577 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5390 5578 }
5391 5579
5392 5580 uint_t
5393 5581 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5394 5582 {
5395 5583 ASSERT(cpuid_checkpass(cpu, 2));
5396 5584 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5397 5585 }
5398 5586
5399 5587 id_t
5400 5588 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5401 5589 {
5402 5590 ASSERT(cpuid_checkpass(cpu, 2));
5403 5591 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5404 5592 }
5405 5593
5406 5594 uint_t
5407 5595 cpuid_getstep(cpu_t *cpu)
5408 5596 {
5409 5597 ASSERT(cpuid_checkpass(cpu, 1));
5410 5598 return (cpu->cpu_m.mcpu_cpi->cpi_step);
5411 5599 }
5412 5600
5413 5601 uint_t
5414 5602 cpuid_getsig(struct cpu *cpu)
5415 5603 {
5416 5604 ASSERT(cpuid_checkpass(cpu, 1));
5417 5605 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5418 5606 }
5419 5607
5420 5608 uint32_t
5421 5609 cpuid_getchiprev(struct cpu *cpu)
5422 5610 {
5423 5611 ASSERT(cpuid_checkpass(cpu, 1));
5424 5612 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5425 5613 }
5426 5614
5427 5615 const char *
5428 5616 cpuid_getchiprevstr(struct cpu *cpu)
5429 5617 {
5430 5618 ASSERT(cpuid_checkpass(cpu, 1));
5431 5619 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5432 5620 }
5433 5621
5434 5622 uint32_t
5435 5623 cpuid_getsockettype(struct cpu *cpu)
5436 5624 {
5437 5625 ASSERT(cpuid_checkpass(cpu, 1));
5438 5626 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5439 5627 }
5440 5628
5441 5629 const char *
5442 5630 cpuid_getsocketstr(cpu_t *cpu)
5443 5631 {
5444 5632 static const char *socketstr = NULL;
5445 5633 struct cpuid_info *cpi;
5446 5634
5447 5635 ASSERT(cpuid_checkpass(cpu, 1));
5448 5636 cpi = cpu->cpu_m.mcpu_cpi;
5449 5637
5450 5638 /* Assume that socket types are the same across the system */
5451 5639 if (socketstr == NULL)
5452 5640 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5453 5641 cpi->cpi_model, cpi->cpi_step);
5454 5642
5455 5643
5456 5644 return (socketstr);
5457 5645 }
5458 5646
5459 5647 int
5460 5648 cpuid_get_chipid(cpu_t *cpu)
5461 5649 {
5462 5650 ASSERT(cpuid_checkpass(cpu, 1));
5463 5651
5464 5652 if (cpuid_is_cmt(cpu))
5465 5653 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5466 5654 return (cpu->cpu_id);
5467 5655 }
5468 5656
5469 5657 id_t
5470 5658 cpuid_get_coreid(cpu_t *cpu)
5471 5659 {
5472 5660 ASSERT(cpuid_checkpass(cpu, 1));
5473 5661 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5474 5662 }
5475 5663
5476 5664 int
5477 5665 cpuid_get_pkgcoreid(cpu_t *cpu)
5478 5666 {
5479 5667 ASSERT(cpuid_checkpass(cpu, 1));
5480 5668 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5481 5669 }
5482 5670
5483 5671 int
5484 5672 cpuid_get_clogid(cpu_t *cpu)
5485 5673 {
5486 5674 ASSERT(cpuid_checkpass(cpu, 1));
5487 5675 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5488 5676 }
5489 5677
5490 5678 int
5491 5679 cpuid_get_cacheid(cpu_t *cpu)
5492 5680 {
5493 5681 ASSERT(cpuid_checkpass(cpu, 1));
5494 5682 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5495 5683 }
5496 5684
5497 5685 uint_t
5498 5686 cpuid_get_procnodeid(cpu_t *cpu)
5499 5687 {
5500 5688 ASSERT(cpuid_checkpass(cpu, 1));
5501 5689 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5502 5690 }
5503 5691
5504 5692 uint_t
5505 5693 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5506 5694 {
5507 5695 ASSERT(cpuid_checkpass(cpu, 1));
5508 5696 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5509 5697 }
5510 5698
5511 5699 uint_t
5512 5700 cpuid_get_compunitid(cpu_t *cpu)
5513 5701 {
5514 5702 ASSERT(cpuid_checkpass(cpu, 1));
5515 5703 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5516 5704 }
5517 5705
5518 5706 uint_t
5519 5707 cpuid_get_cores_per_compunit(cpu_t *cpu)
5520 5708 {
5521 5709 ASSERT(cpuid_checkpass(cpu, 1));
5522 5710 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5523 5711 }
5524 5712
5525 5713 /*ARGSUSED*/
5526 5714 int
5527 5715 cpuid_have_cr8access(cpu_t *cpu)
5528 5716 {
5529 5717 #if defined(__amd64)
5530 5718 return (1);
5531 5719 #else
5532 5720 struct cpuid_info *cpi;
5533 5721
5534 5722 ASSERT(cpu != NULL);
5535 5723 cpi = cpu->cpu_m.mcpu_cpi;
5536 5724 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5537 5725 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5538 5726 return (1);
5539 5727 return (0);
5540 5728 #endif
5541 5729 }
5542 5730
5543 5731 uint32_t
5544 5732 cpuid_get_apicid(cpu_t *cpu)
5545 5733 {
5546 5734 ASSERT(cpuid_checkpass(cpu, 1));
5547 5735 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5548 5736 return (UINT32_MAX);
5549 5737 } else {
5550 5738 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5551 5739 }
5552 5740 }
5553 5741
5554 5742 void
5555 5743 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5556 5744 {
5557 5745 struct cpuid_info *cpi;
5558 5746
5559 5747 if (cpu == NULL)
5560 5748 cpu = CPU;
5561 5749 cpi = cpu->cpu_m.mcpu_cpi;
5562 5750
5563 5751 ASSERT(cpuid_checkpass(cpu, 1));
5564 5752
5565 5753 if (pabits)
5566 5754 *pabits = cpi->cpi_pabits;
5567 5755 if (vabits)
5568 5756 *vabits = cpi->cpi_vabits;
5569 5757 }
5570 5758
5571 5759 size_t
5572 5760 cpuid_get_xsave_size()
5573 5761 {
5574 5762 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5575 5763 sizeof (struct xsave_state)));
5576 5764 }
5577 5765
5578 5766 /*
5579 5767 * Return true if the CPUs on this system require 'pointer clearing' for the
5580 5768 * floating point error pointer exception handling. In the past, this has been
5581 5769 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5582 5770 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5583 5771 * feature bit and is reflected in the cpi_fp_amd_save member.
5584 5772 */
5585 5773 boolean_t
5586 5774 cpuid_need_fp_excp_handling()
5587 5775 {
5588 5776 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5589 5777 cpuid_info0.cpi_fp_amd_save != 0);
5590 5778 }
5591 5779
5592 5780 /*
5593 5781 * Returns the number of data TLB entries for a corresponding
5594 5782 * pagesize. If it can't be computed, or isn't known, the
5595 5783 * routine returns zero. If you ask about an architecturally
5596 5784 * impossible pagesize, the routine will panic (so that the
5597 5785 * hat implementor knows that things are inconsistent.)
5598 5786 */
5599 5787 uint_t
5600 5788 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5601 5789 {
5602 5790 struct cpuid_info *cpi;
5603 5791 uint_t dtlb_nent = 0;
5604 5792
5605 5793 if (cpu == NULL)
5606 5794 cpu = CPU;
5607 5795 cpi = cpu->cpu_m.mcpu_cpi;
5608 5796
5609 5797 ASSERT(cpuid_checkpass(cpu, 1));
5610 5798
5611 5799 /*
5612 5800 * Check the L2 TLB info
5613 5801 */
5614 5802 if (cpi->cpi_xmaxeax >= 0x80000006) {
5615 5803 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5616 5804
5617 5805 switch (pagesize) {
5618 5806
5619 5807 case 4 * 1024:
5620 5808 /*
5621 5809 * All zero in the top 16 bits of the register
5622 5810 * indicates a unified TLB. Size is in low 16 bits.
5623 5811 */
5624 5812 if ((cp->cp_ebx & 0xffff0000) == 0)
5625 5813 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5626 5814 else
5627 5815 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5628 5816 break;
5629 5817
5630 5818 case 2 * 1024 * 1024:
5631 5819 if ((cp->cp_eax & 0xffff0000) == 0)
5632 5820 dtlb_nent = cp->cp_eax & 0x0000ffff;
5633 5821 else
5634 5822 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5635 5823 break;
5636 5824
5637 5825 default:
5638 5826 panic("unknown L2 pagesize");
5639 5827 /*NOTREACHED*/
5640 5828 }
5641 5829 }
5642 5830
5643 5831 if (dtlb_nent != 0)
5644 5832 return (dtlb_nent);
5645 5833
5646 5834 /*
5647 5835 * No L2 TLB support for this size, try L1.
5648 5836 */
5649 5837 if (cpi->cpi_xmaxeax >= 0x80000005) {
5650 5838 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5651 5839
5652 5840 switch (pagesize) {
5653 5841 case 4 * 1024:
5654 5842 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5655 5843 break;
5656 5844 case 2 * 1024 * 1024:
5657 5845 dtlb_nent = BITX(cp->cp_eax, 23, 16);
5658 5846 break;
5659 5847 default:
5660 5848 panic("unknown L1 d-TLB pagesize");
5661 5849 /*NOTREACHED*/
5662 5850 }
5663 5851 }
5664 5852
5665 5853 return (dtlb_nent);
5666 5854 }
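
The routine above pulls d-TLB entry counts out of AMD extended leaves 0x80000005/0x80000006 with BITX(), treating an all-zero upper half of the L2 register as a unified TLB. A minimal userland sketch of that field extraction, assuming a made-up register value and a locally re-expressed BITX (not the kernel macro itself):

    #include <stdio.h>
    #include <stdint.h>

    /* Bit-field extraction in the spirit of the kernel's BITX(u, h, l). */
    #define BITX(u, h, l)	(((u) >> (l)) & ((1U << ((h) - (l) + 1)) - 1))

    int
    main(void)
    {
    	/* Hypothetical %ebx from extended leaf 0x80000006 (L2 TLB, 4K pages). */
    	uint32_t ebx = 0x10401040;

    	if ((ebx & 0xffff0000) == 0) {
    		/* Upper half zero: unified L2 TLB; count is in the low 16 bits. */
    		printf("unified 4K L2 TLB: %u entries\n", ebx & 0xffff);
    	} else {
    		/* Split TLB: the data-TLB entry count lives in bits 27:16. */
    		printf("4K L2 dTLB: %u entries\n", BITX(ebx, 27, 16));
    	}
    	return (0);
    }
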
5667 5855
5668 5856 /*
5669 5857 * Return 0 if the erratum is not present or not applicable, positive
5670 5858 * if it is, and negative if the status of the erratum is unknown.
5671 5859 *
5672 5860 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5673 5861 * Processors" #25759, Rev 3.57, August 2005
5674 5862 */
5675 5863 int
5676 5864 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5677 5865 {
5678 5866 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5679 5867 uint_t eax;
5680 5868
5681 5869 /*
5682 5870 * Bail out if this CPU isn't an AMD CPU, or if it's
5683 5871 * a legacy (32-bit) AMD CPU.
5684 5872 */
5685 5873 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5686 5874 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5687 5875 cpi->cpi_family == 6) {
5688 5876 return (0);
5689 5877 }
5690 5878
5691 5879 eax = cpi->cpi_std[1].cp_eax;
5692 5880
5693 5881 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
5694 5882 #define SH_B3(eax) (eax == 0xf51)
5695 5883 #define B(eax) (SH_B0(eax) || SH_B3(eax))
5696 5884
5697 5885 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
5698 5886
5699 5887 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5700 5888 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5701 5889 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
5702 5890 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5703 5891
5704 5892 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5705 5893 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
5706 5894 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
5707 5895 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5708 5896
5709 5897 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5710 5898 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
5711 5899 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
5712 5900 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
5713 5901 #define BH_E4(eax) (eax == 0x20fb1)
5714 5902 #define SH_E5(eax) (eax == 0x20f42)
5715 5903 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
5716 5904 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
5717 5905 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5718 5906 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5719 5907 DH_E6(eax) || JH_E6(eax))
5720 5908
5721 5909 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5722 5910 #define DR_B0(eax) (eax == 0x100f20)
5723 5911 #define DR_B1(eax) (eax == 0x100f21)
5724 5912 #define DR_BA(eax) (eax == 0x100f2a)
5725 5913 #define DR_B2(eax) (eax == 0x100f22)
5726 5914 #define DR_B3(eax) (eax == 0x100f23)
5727 5915 #define RB_C0(eax) (eax == 0x100f40)
5728 5916
5729 5917 switch (erratum) {
5730 5918 case 1:
5731 5919 return (cpi->cpi_family < 0x10);
5732 5920 case 51: /* what does the asterisk mean? */
5733 5921 return (B(eax) || SH_C0(eax) || CG(eax));
5734 5922 case 52:
5735 5923 return (B(eax));
5736 5924 case 57:
5737 5925 return (cpi->cpi_family <= 0x11);
5738 5926 case 58:
5739 5927 return (B(eax));
5740 5928 case 60:
5741 5929 return (cpi->cpi_family <= 0x11);
5742 5930 case 61:
5743 5931 case 62:
5744 5932 case 63:
5745 5933 case 64:
5746 5934 case 65:
5747 5935 case 66:
5748 5936 case 68:
5749 5937 case 69:
5750 5938 case 70:
5751 5939 case 71:
5752 5940 return (B(eax));
5753 5941 case 72:
5754 5942 return (SH_B0(eax));
5755 5943 case 74:
5756 5944 return (B(eax));
5757 5945 case 75:
5758 5946 return (cpi->cpi_family < 0x10);
5759 5947 case 76:
5760 5948 return (B(eax));
5761 5949 case 77:
5762 5950 return (cpi->cpi_family <= 0x11);
5763 5951 case 78:
5764 5952 return (B(eax) || SH_C0(eax));
5765 5953 case 79:
5766 5954 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5767 5955 case 80:
5768 5956 case 81:
5769 5957 case 82:
5770 5958 return (B(eax));
5771 5959 case 83:
5772 5960 return (B(eax) || SH_C0(eax) || CG(eax));
5773 5961 case 85:
5774 5962 return (cpi->cpi_family < 0x10);
5775 5963 case 86:
5776 5964 return (SH_C0(eax) || CG(eax));
5777 5965 case 88:
5778 5966 #if !defined(__amd64)
5779 5967 return (0);
5780 5968 #else
5781 5969 return (B(eax) || SH_C0(eax));
5782 5970 #endif
5783 5971 case 89:
5784 5972 return (cpi->cpi_family < 0x10);
5785 5973 case 90:
5786 5974 return (B(eax) || SH_C0(eax) || CG(eax));
5787 5975 case 91:
5788 5976 case 92:
5789 5977 return (B(eax) || SH_C0(eax));
5790 5978 case 93:
5791 5979 return (SH_C0(eax));
5792 5980 case 94:
5793 5981 return (B(eax) || SH_C0(eax) || CG(eax));
5794 5982 case 95:
5795 5983 #if !defined(__amd64)
5796 5984 return (0);
5797 5985 #else
5798 5986 return (B(eax) || SH_C0(eax));
5799 5987 #endif
5800 5988 case 96:
5801 5989 return (B(eax) || SH_C0(eax) || CG(eax));
5802 5990 case 97:
5803 5991 case 98:
5804 5992 return (SH_C0(eax) || CG(eax));
5805 5993 case 99:
5806 5994 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5807 5995 case 100:
5808 5996 return (B(eax) || SH_C0(eax));
5809 5997 case 101:
5810 5998 case 103:
5811 5999 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5812 6000 case 104:
5813 6001 return (SH_C0(eax) || CG(eax) || D0(eax));
5814 6002 case 105:
5815 6003 case 106:
5816 6004 case 107:
5817 6005 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5818 6006 case 108:
5819 6007 return (DH_CG(eax));
5820 6008 case 109:
5821 6009 return (SH_C0(eax) || CG(eax) || D0(eax));
5822 6010 case 110:
5823 6011 return (D0(eax) || EX(eax));
5824 6012 case 111:
5825 6013 return (CG(eax));
5826 6014 case 112:
5827 6015 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5828 6016 case 113:
5829 6017 return (eax == 0x20fc0);
5830 6018 case 114:
5831 6019 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5832 6020 case 115:
5833 6021 return (SH_E0(eax) || JH_E1(eax));
5834 6022 case 116:
5835 6023 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5836 6024 case 117:
5837 6025 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5838 6026 case 118:
5839 6027 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5840 6028 JH_E6(eax));
5841 6029 case 121:
5842 6030 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5843 6031 case 122:
5844 6032 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5845 6033 case 123:
5846 6034 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5847 6035 case 131:
5848 6036 return (cpi->cpi_family < 0x10);
5849 6037 case 6336786:
5850 6038
5851 6039 /*
5852 6040 * Test for AdvPowerMgmtInfo.TscPStateInvariant
5853 6041 * if this is a K8 family or newer processor. We're testing for
5854 6042 * this 'erratum' to determine whether or not we have a constant
5855 6043 * TSC.
5856 6044 *
5857 6045 * Our current fix for this is to disable the C1-Clock ramping.
5858 6046 * However, this doesn't work on newer processor families nor
5859 6047 * does it work when virtualized as those devices don't exist.
5860 6048 */
5861 6049 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5862 6050 return (0);
5863 6051 }
5864 6052
5865 6053 if (CPI_FAMILY(cpi) == 0xf) {
5866 6054 struct cpuid_regs regs;
5867 6055 regs.cp_eax = 0x80000007;
5868 6056 (void) __cpuid_insn(&regs);
5869 6057 return (!(regs.cp_edx & 0x100));
5870 6058 }
5871 6059 return (0);
5872 6060 case 6323525:
5873 6061 /*
5874 6062 * This erratum (K8 #147) is not present on family 10 and newer.
5875 6063 */
5876 6064 if (cpi->cpi_family >= 0x10) {
5877 6065 return (0);
5878 6066 }
5879 6067 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5880 6068 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5881 6069
5882 6070 case 6671130:
5883 6071 /*
5884 6072 * check for processors (pre-Shanghai) that do not provide
5885 6073 * optimal management of 1gb ptes in their tlb.
5886 6074 */
5887 6075 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5888 6076
5889 6077 case 298:
5890 6078 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5891 6079 DR_B2(eax) || RB_C0(eax));
5892 6080
5893 6081 case 721:
5894 6082 #if defined(__amd64)
5895 6083 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5896 6084 #else
5897 6085 return (0);
5898 6086 #endif
5899 6087
5900 6088 default:
5901 6089 return (-1);
5902 6090
5903 6091 }
5904 6092 }
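
The revision macros above (SH_E4, DR_B2, and so on) compare the raw leaf-1 %eax value, which packs stepping, model, family, and their extended fields, against whole encoded constants such as 0x20f51. A small decode sketch for one of those constants, following the standard leaf-1 layout (illustrative only, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	uint32_t eax = 0x20f51;		/* the SH_E4 value from the table above */
    	uint32_t step = eax & 0xf;
    	uint32_t model = (eax >> 4) & 0xf;
    	uint32_t family = (eax >> 8) & 0xf;
    	uint32_t xmodel = (eax >> 16) & 0xf;
    	uint32_t xfamily = (eax >> 20) & 0xff;

    	/* Family 0xf parts fold the extended fields into the displayed values. */
    	uint32_t disp_model = (family == 0xf) ? (xmodel << 4) | model : model;
    	uint32_t disp_family = (family == 0xf) ? family + xfamily : family;

    	printf("family 0x%x, model 0x%x, stepping %u\n",
    	    disp_family, disp_model, step);
    	return (0);
    }
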
5905 6093
5906 6094 /*
5907 6095 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5908 6096 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5909 6097 */
5910 6098 int
5911 6099 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5912 6100 {
5913 6101 struct cpuid_info *cpi;
5914 6102 uint_t osvwid;
5915 6103 static int osvwfeature = -1;
5916 6104 uint64_t osvwlength;
5917 6105
5918 6106
5919 6107 cpi = cpu->cpu_m.mcpu_cpi;
5920 6108
5921 6109 /* confirm OSVW supported */
5922 6110 if (osvwfeature == -1) {
5923 6111 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5924 6112 } else {
5925 6113 /* assert that osvw feature setting is consistent on all cpus */
5926 6114 ASSERT(osvwfeature ==
5927 6115 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5928 6116 }
5929 6117 if (!osvwfeature)
5930 6118 return (-1);
5931 6119
5932 6120 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5933 6121
5934 6122 switch (erratum) {
5935 6123 case 298: /* osvwid is 0 */
5936 6124 osvwid = 0;
5937 6125 if (osvwlength <= (uint64_t)osvwid) {
5938 6126 /* osvwid 0 is unknown */
5939 6127 return (-1);
5940 6128 }
5941 6129
5942 6130 /*
5943 6131 * Check the OSVW STATUS MSR to determine the state
5944 6132 * of the erratum where:
5945 6133 * 0 - fixed by HW
5946 6134 * 1 - BIOS has applied the workaround when BIOS
5947 6135 * workaround is available. (Or for other errata,
5948 6136 * OS workaround is required.)
5949 6137 * For a value of 1, caller will confirm that the
5950 6138 * erratum 298 workaround has indeed been applied by BIOS.
5951 6139 *
5952 6140 * A 1 may be set in cpus that have a HW fix
5953 6141 * in a mixed cpu system. Regarding erratum 298:
5954 6142 * In a multiprocessor platform, the workaround above
5955 6143 * should be applied to all processors regardless of
5956 6144 * silicon revision when an affected processor is
5957 6145 * present.
5958 6146 */
5959 6147
5960 6148 return (rdmsr(MSR_AMD_OSVW_STATUS +
5961 6149 (osvwid / OSVW_ID_CNT_PER_MSR)) &
5962 6150 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5963 6151
5964 6152 default:
5965 6153 return (-1);
5966 6154 }
5967 6155 }
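
OSVW status bits are packed OSVW_ID_CNT_PER_MSR (64) to an MSR starting at MSR_AMD_OSVW_STATUS, so the MSR offset and bit position fall out of a division and a modulo on the OSVW id, exactly as in the rdmsr() expression above. A standalone sketch of just that arithmetic (no MSR access; the id value is illustrative):

    #include <stdio.h>
    #include <stdint.h>

    #define OSVW_ID_CNT_PER_MSR	64	/* status bits packed 64 per MSR */

    int
    main(void)
    {
    	unsigned int osvwid = 3;	/* illustrative OSVW id */
    	unsigned int msr_offset = osvwid / OSVW_ID_CNT_PER_MSR;
    	uint64_t mask = 1ULL << (osvwid % OSVW_ID_CNT_PER_MSR);

    	printf("read MSR_AMD_OSVW_STATUS + %u and test mask 0x%llx\n",
    	    msr_offset, (unsigned long long)mask);
    	return (0);
    }
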
5968 6156
5969 6157 static const char assoc_str[] = "associativity";
5970 6158 static const char line_str[] = "line-size";
5971 6159 static const char size_str[] = "size";
5972 6160
5973 6161 static void
5974 6162 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5975 6163 uint32_t val)
5976 6164 {
5977 6165 char buf[128];
5978 6166
5979 6167 /*
5980 6168 * ndi_prop_update_int() is used because it is desirable for
5981 6169 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
5982 6170 */
5983 6171 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5984 6172 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5985 6173 }
5986 6174
5987 6175 /*
5988 6176 * Intel-style cache/tlb description
5989 6177 *
5990 6178 * Standard cpuid level 2 gives a randomly ordered
5991 6179 * selection of tags that index into a table that describes
5992 6180 * cache and tlb properties.
5993 6181 */
5994 6182
5995 6183 static const char l1_icache_str[] = "l1-icache";
5996 6184 static const char l1_dcache_str[] = "l1-dcache";
5997 6185 static const char l2_cache_str[] = "l2-cache";
5998 6186 static const char l3_cache_str[] = "l3-cache";
5999 6187 static const char itlb4k_str[] = "itlb-4K";
6000 6188 static const char dtlb4k_str[] = "dtlb-4K";
6001 6189 static const char itlb2M_str[] = "itlb-2M";
6002 6190 static const char itlb4M_str[] = "itlb-4M";
6003 6191 static const char dtlb4M_str[] = "dtlb-4M";
6004 6192 static const char dtlb24_str[] = "dtlb0-2M-4M";
6005 6193 static const char itlb424_str[] = "itlb-4K-2M-4M";
6006 6194 static const char itlb24_str[] = "itlb-2M-4M";
6007 6195 static const char dtlb44_str[] = "dtlb-4K-4M";
6008 6196 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6009 6197 static const char sl2_cache_str[] = "sectored-l2-cache";
6010 6198 static const char itrace_str[] = "itrace-cache";
6011 6199 static const char sl3_cache_str[] = "sectored-l3-cache";
6012 6200 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6013 6201
6014 6202 static const struct cachetab {
6015 6203 uint8_t ct_code;
6016 6204 uint8_t ct_assoc;
6017 6205 uint16_t ct_line_size;
6018 6206 size_t ct_size;
6019 6207 const char *ct_label;
6020 6208 } intel_ctab[] = {
6021 6209 /*
6022 6210 * maintain descending order!
6023 6211 *
6024 6212 * Codes ignored - Reason
6025 6213 * ----------------------
6026 6214 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6027 6215 * f0H/f1H - Currently we do not interpret prefetch size by design
6028 6216 */
6029 6217 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6030 6218 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6031 6219 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6032 6220 { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6033 6221 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6034 6222 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6035 6223 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6036 6224 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6037 6225 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6038 6226 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6039 6227 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6040 6228 { 0xd0, 4, 64, 512*1024, l3_cache_str},
6041 6229 { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6042 6230 { 0xc0, 4, 0, 8, dtlb44_str },
6043 6231 { 0xba, 4, 0, 64, dtlb4k_str },
6044 6232 { 0xb4, 4, 0, 256, dtlb4k_str },
6045 6233 { 0xb3, 4, 0, 128, dtlb4k_str },
6046 6234 { 0xb2, 4, 0, 64, itlb4k_str },
6047 6235 { 0xb0, 4, 0, 128, itlb4k_str },
6048 6236 { 0x87, 8, 64, 1024*1024, l2_cache_str},
6049 6237 { 0x86, 4, 64, 512*1024, l2_cache_str},
6050 6238 { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6051 6239 { 0x84, 8, 32, 1024*1024, l2_cache_str},
6052 6240 { 0x83, 8, 32, 512*1024, l2_cache_str},
6053 6241 { 0x82, 8, 32, 256*1024, l2_cache_str},
6054 6242 { 0x80, 8, 64, 512*1024, l2_cache_str},
6055 6243 { 0x7f, 2, 64, 512*1024, l2_cache_str},
6056 6244 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6057 6245 { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6058 6246 { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6059 6247 { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6060 6248 { 0x79, 8, 64, 128*1024, sl2_cache_str},
6061 6249 { 0x78, 8, 64, 1024*1024, l2_cache_str},
6062 6250 { 0x73, 8, 0, 64*1024, itrace_str},
6063 6251 { 0x72, 8, 0, 32*1024, itrace_str},
6064 6252 { 0x71, 8, 0, 16*1024, itrace_str},
6065 6253 { 0x70, 8, 0, 12*1024, itrace_str},
6066 6254 { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6067 6255 { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6068 6256 { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6069 6257 { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6070 6258 { 0x5d, 0, 0, 256, dtlb44_str},
6071 6259 { 0x5c, 0, 0, 128, dtlb44_str},
6072 6260 { 0x5b, 0, 0, 64, dtlb44_str},
6073 6261 { 0x5a, 4, 0, 32, dtlb24_str},
6074 6262 { 0x59, 0, 0, 16, dtlb4k_str},
6075 6263 { 0x57, 4, 0, 16, dtlb4k_str},
6076 6264 { 0x56, 4, 0, 16, dtlb4M_str},
6077 6265 { 0x55, 0, 0, 7, itlb24_str},
6078 6266 { 0x52, 0, 0, 256, itlb424_str},
6079 6267 { 0x51, 0, 0, 128, itlb424_str},
6080 6268 { 0x50, 0, 0, 64, itlb424_str},
6081 6269 { 0x4f, 0, 0, 32, itlb4k_str},
6082 6270 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6083 6271 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6084 6272 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6085 6273 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6086 6274 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6087 6275 { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6088 6276 { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6089 6277 { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6090 6278 { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6091 6279 { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6092 6280 { 0x44, 4, 32, 1024*1024, l2_cache_str},
6093 6281 { 0x43, 4, 32, 512*1024, l2_cache_str},
6094 6282 { 0x42, 4, 32, 256*1024, l2_cache_str},
6095 6283 { 0x41, 4, 32, 128*1024, l2_cache_str},
6096 6284 { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6097 6285 { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6098 6286 { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6099 6287 { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6100 6288 { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6101 6289 { 0x39, 4, 64, 128*1024, sl2_cache_str},
6102 6290 { 0x30, 8, 64, 32*1024, l1_icache_str},
6103 6291 { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6104 6292 { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6105 6293 { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6106 6294 { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6107 6295 { 0x22, 4, 64, 512*1024, sl3_cache_str},
6108 6296 { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6109 6297 { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6110 6298 { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6111 6299 { 0x0b, 4, 0, 4, itlb4M_str},
6112 6300 { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6113 6301 { 0x08, 4, 32, 16*1024, l1_icache_str},
6114 6302 { 0x06, 4, 32, 8*1024, l1_icache_str},
6115 6303 { 0x05, 4, 0, 32, dtlb4M_str},
6116 6304 { 0x04, 4, 0, 8, dtlb4M_str},
6117 6305 { 0x03, 4, 0, 64, dtlb4k_str},
6118 6306 { 0x02, 4, 0, 2, itlb4M_str},
6119 6307 { 0x01, 4, 0, 32, itlb4k_str},
6120 6308 { 0 }
6121 6309 };
6122 6310
6123 6311 static const struct cachetab cyrix_ctab[] = {
6124 6312 { 0x70, 4, 0, 32, "tlb-4K" },
6125 6313 { 0x80, 4, 16, 16*1024, "l1-cache" },
6126 6314 { 0 }
6127 6315 };
6128 6316
6129 6317 /*
6130 6318 * Search a cache table for a matching entry
6131 6319 */
6132 6320 static const struct cachetab *
6133 6321 find_cacheent(const struct cachetab *ct, uint_t code)
6134 6322 {
6135 6323 if (code != 0) {
6136 6324 for (; ct->ct_code != 0; ct++)
6137 6325 if (ct->ct_code <= code)
6138 6326 break;
6139 6327 if (ct->ct_code == code)
6140 6328 return (ct);
6141 6329 }
6142 6330 return (NULL);
6143 6331 }
6144 6332
6145 6333 /*
6146 6334 * Populate cachetab entry with L2 or L3 cache-information using
6147 6335 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6148 6336 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6149 6337 * information is found.
6150 6338 */
6151 6339 static int
6152 6340 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6153 6341 {
6154 6342 uint32_t level, i;
6155 6343 int ret = 0;
6156 6344
6157 6345 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6158 6346 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6159 6347
6160 6348 if (level == 2 || level == 3) {
6161 6349 ct->ct_assoc =
6162 6350 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6163 6351 ct->ct_line_size =
6164 6352 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6165 6353 ct->ct_size = ct->ct_assoc *
6166 6354 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6167 6355 ct->ct_line_size *
6168 6356 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6169 6357
6170 6358 if (level == 2) {
6171 6359 ct->ct_label = l2_cache_str;
6172 6360 } else if (level == 3) {
6173 6361 ct->ct_label = l3_cache_str;
6174 6362 }
6175 6363 ret = 1;
6176 6364 }
6177 6365 }
6178 6366
6179 6367 return (ret);
6180 6368 }
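
The size computed above is the product (ways + 1) * (partitions + 1) * (line size + 1) * (sets + 1), with the first three fields taken from %ebx of leaf 4 and the set count from %ecx. A worked userland example with illustrative field values (a 256 KB, 8-way cache with 64-byte lines):

    #include <stdio.h>

    int
    main(void)
    {
    	/* Illustrative deterministic-cache-parameter (leaf 4) fields. */
    	unsigned int ways = 7;		/* %ebx[31:22]: ways of associativity - 1 */
    	unsigned int parts = 0;		/* %ebx[21:12]: physical line partitions - 1 */
    	unsigned int line = 63;		/* %ebx[11:0]: coherency line size - 1 */
    	unsigned int sets = 511;	/* %ecx: number of sets - 1 */

    	unsigned int size = (ways + 1) * (parts + 1) * (line + 1) * (sets + 1);
    	printf("cache size = %u bytes (%u KB)\n", size, size / 1024);
    	return (0);
    }

which prints 262144 bytes (256 KB).
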
6181 6369
6182 6370 /*
6183 6371 * Walk the cacheinfo descriptor, applying 'func' to every valid element.
6184 6372 * The walk is terminated if the walker returns non-zero.
6185 6373 */
6186 6374 static void
6187 6375 intel_walk_cacheinfo(struct cpuid_info *cpi,
6188 6376 void *arg, int (*func)(void *, const struct cachetab *))
6189 6377 {
6190 6378 const struct cachetab *ct;
6191 6379 struct cachetab des_49_ct, des_b1_ct;
6192 6380 uint8_t *dp;
6193 6381 int i;
6194 6382
6195 6383 if ((dp = cpi->cpi_cacheinfo) == NULL)
6196 6384 return;
6197 6385 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6198 6386 /*
6199 6387 * For overloaded descriptor 0x49 we use cpuid function 4
6200 6388 * if supported by the current processor, to create
6201 6389 * cache information.
6202 6390 * For overloaded descriptor 0xb1 we use X86_PAE flag
6203 6391 * to disambiguate the cache information.
6204 6392 */
6205 6393 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6206 6394 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6207 6395 ct = &des_49_ct;
6208 6396 } else if (*dp == 0xb1) {
6209 6397 des_b1_ct.ct_code = 0xb1;
6210 6398 des_b1_ct.ct_assoc = 4;
6211 6399 des_b1_ct.ct_line_size = 0;
6212 6400 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6213 6401 des_b1_ct.ct_size = 8;
6214 6402 des_b1_ct.ct_label = itlb2M_str;
6215 6403 } else {
6216 6404 des_b1_ct.ct_size = 4;
6217 6405 des_b1_ct.ct_label = itlb4M_str;
6218 6406 }
6219 6407 ct = &des_b1_ct;
6220 6408 } else {
6221 6409 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6222 6410 continue;
6223 6411 }
6224 6412 }
6225 6413
6226 6414 if (func(arg, ct) != 0) {
6227 6415 break;
6228 6416 }
6229 6417 }
6230 6418 }
6231 6419
6232 6420 /*
6233 6421 * (Like the Intel one, except for Cyrix CPUs)
6234 6422 */
6235 6423 static void
6236 6424 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6237 6425 void *arg, int (*func)(void *, const struct cachetab *))
6238 6426 {
6239 6427 const struct cachetab *ct;
6240 6428 uint8_t *dp;
6241 6429 int i;
6242 6430
6243 6431 if ((dp = cpi->cpi_cacheinfo) == NULL)
6244 6432 return;
6245 6433 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6246 6434 /*
6247 6435 * Search Cyrix-specific descriptor table first ..
6248 6436 */
6249 6437 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6250 6438 if (func(arg, ct) != 0)
6251 6439 break;
6252 6440 continue;
6253 6441 }
6254 6442 /*
6255 6443 * .. else fall back to the Intel one
6256 6444 */
6257 6445 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6258 6446 if (func(arg, ct) != 0)
6259 6447 break;
6260 6448 continue;
6261 6449 }
6262 6450 }
6263 6451 }
6264 6452
6265 6453 /*
6266 6454 * A cacheinfo walker that adds associativity, line-size, and size properties
6267 6455 * to the devinfo node it is passed as an argument.
6268 6456 */
6269 6457 static int
6270 6458 add_cacheent_props(void *arg, const struct cachetab *ct)
6271 6459 {
6272 6460 dev_info_t *devi = arg;
6273 6461
6274 6462 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6275 6463 if (ct->ct_line_size != 0)
6276 6464 add_cache_prop(devi, ct->ct_label, line_str,
6277 6465 ct->ct_line_size);
6278 6466 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6279 6467 return (0);
6280 6468 }
6281 6469
6282 6470
6283 6471 static const char fully_assoc[] = "fully-associative?";
6284 6472
6285 6473 /*
6286 6474 * AMD style cache/tlb description
6287 6475 *
6288 6476 * Extended functions 5 and 6 directly describe properties of
6289 6477 * tlbs and various cache levels.
6290 6478 */
6291 6479 static void
6292 6480 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6293 6481 {
6294 6482 switch (assoc) {
6295 6483 case 0: /* reserved; ignore */
6296 6484 break;
6297 6485 default:
6298 6486 add_cache_prop(devi, label, assoc_str, assoc);
6299 6487 break;
6300 6488 case 0xff:
6301 6489 add_cache_prop(devi, label, fully_assoc, 1);
6302 6490 break;
6303 6491 }
6304 6492 }
6305 6493
6306 6494 static void
6307 6495 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6308 6496 {
6309 6497 if (size == 0)
6310 6498 return;
6311 6499 add_cache_prop(devi, label, size_str, size);
6312 6500 add_amd_assoc(devi, label, assoc);
6313 6501 }
6314 6502
6315 6503 static void
6316 6504 add_amd_cache(dev_info_t *devi, const char *label,
6317 6505 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6318 6506 {
6319 6507 if (size == 0 || line_size == 0)
6320 6508 return;
6321 6509 add_amd_assoc(devi, label, assoc);
6322 6510 /*
6323 6511 * Most AMD parts have a sectored cache. Multiple cache lines are
6324 6512 * associated with each tag. A sector consists of all cache lines
6325 6513 * associated with a tag. For example, the AMD K6-III has a sector
6326 6514 * size of 2 cache lines per tag.
6327 6515 */
6328 6516 if (lines_per_tag != 0)
6329 6517 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6330 6518 add_cache_prop(devi, label, line_str, line_size);
6331 6519 add_cache_prop(devi, label, size_str, size * 1024);
6332 6520 }
6333 6521
6334 6522 static void
6335 6523 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6336 6524 {
6337 6525 switch (assoc) {
6338 6526 case 0: /* off */
6339 6527 break;
6340 6528 case 1:
6341 6529 case 2:
6342 6530 case 4:
6343 6531 add_cache_prop(devi, label, assoc_str, assoc);
6344 6532 break;
6345 6533 case 6:
6346 6534 add_cache_prop(devi, label, assoc_str, 8);
6347 6535 break;
6348 6536 case 8:
6349 6537 add_cache_prop(devi, label, assoc_str, 16);
6350 6538 break;
6351 6539 case 0xf:
6352 6540 add_cache_prop(devi, label, fully_assoc, 1);
6353 6541 break;
6354 6542 default: /* reserved; ignore */
6355 6543 break;
6356 6544 }
6357 6545 }
6358 6546
6359 6547 static void
6360 6548 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6361 6549 {
6362 6550 if (size == 0 || assoc == 0)
6363 6551 return;
6364 6552 add_amd_l2_assoc(devi, label, assoc);
6365 6553 add_cache_prop(devi, label, size_str, size);
6366 6554 }
6367 6555
6368 6556 static void
6369 6557 add_amd_l2_cache(dev_info_t *devi, const char *label,
6370 6558 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6371 6559 {
6372 6560 if (size == 0 || assoc == 0 || line_size == 0)
6373 6561 return;
6374 6562 add_amd_l2_assoc(devi, label, assoc);
6375 6563 if (lines_per_tag != 0)
6376 6564 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6377 6565 add_cache_prop(devi, label, line_str, line_size);
6378 6566 add_cache_prop(devi, label, size_str, size * 1024);
6379 6567 }
6380 6568
6381 6569 static void
6382 6570 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6383 6571 {
6384 6572 struct cpuid_regs *cp;
6385 6573
6386 6574 if (cpi->cpi_xmaxeax < 0x80000005)
6387 6575 return;
6388 6576 cp = &cpi->cpi_extd[5];
6389 6577
6390 6578 /*
6391 6579 * 4M/2M L1 TLB configuration
6392 6580 *
6393 6581 * We report the size for 2M pages because AMD uses two
6394 6582 * TLB entries for one 4M page.
6395 6583 */
6396 6584 add_amd_tlb(devi, "dtlb-2M",
6397 6585 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6398 6586 add_amd_tlb(devi, "itlb-2M",
6399 6587 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6400 6588
6401 6589 /*
6402 6590 * 4K L1 TLB configuration
6403 6591 */
6404 6592
6405 6593 switch (cpi->cpi_vendor) {
6406 6594 uint_t nentries;
6407 6595 case X86_VENDOR_TM:
6408 6596 if (cpi->cpi_family >= 5) {
6409 6597 /*
6410 6598 * Crusoe processors have 256 TLB entries, but
6411 6599 * cpuid data format constrains them to only
6412 6600 * reporting 255 of them.
6413 6601 */
6414 6602 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6415 6603 nentries = 256;
6416 6604 /*
6417 6605 * Crusoe processors also have a unified TLB
6418 6606 */
6419 6607 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6420 6608 nentries);
6421 6609 break;
6422 6610 }
6423 6611 /*FALLTHROUGH*/
6424 6612 default:
6425 6613 add_amd_tlb(devi, itlb4k_str,
6426 6614 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6427 6615 add_amd_tlb(devi, dtlb4k_str,
6428 6616 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6429 6617 break;
6430 6618 }
6431 6619
6432 6620 /*
6433 6621 * data L1 cache configuration
6434 6622 */
6435 6623
6436 6624 add_amd_cache(devi, l1_dcache_str,
6437 6625 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6438 6626 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6439 6627
6440 6628 /*
6441 6629 * code L1 cache configuration
6442 6630 */
6443 6631
6444 6632 add_amd_cache(devi, l1_icache_str,
6445 6633 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6446 6634 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6447 6635
6448 6636 if (cpi->cpi_xmaxeax < 0x80000006)
6449 6637 return;
6450 6638 cp = &cpi->cpi_extd[6];
6451 6639
6452 6640 /* Check for a unified L2 TLB for large pages */
6453 6641
6454 6642 if (BITX(cp->cp_eax, 31, 16) == 0)
6455 6643 add_amd_l2_tlb(devi, "l2-tlb-2M",
6456 6644 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6457 6645 else {
6458 6646 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6459 6647 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6460 6648 add_amd_l2_tlb(devi, "l2-itlb-2M",
6461 6649 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6462 6650 }
6463 6651
6464 6652 /* Check for a unified L2 TLB for 4K pages */
6465 6653
6466 6654 if (BITX(cp->cp_ebx, 31, 16) == 0) {
6467 6655 add_amd_l2_tlb(devi, "l2-tlb-4K",
6468 6656 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6469 6657 } else {
6470 6658 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6471 6659 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6472 6660 add_amd_l2_tlb(devi, "l2-itlb-4K",
6473 6661 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6474 6662 }
6475 6663
6476 6664 add_amd_l2_cache(devi, l2_cache_str,
6477 6665 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6478 6666 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6479 6667 }
6480 6668
6481 6669 /*
6482 6670 * There are two basic ways that the x86 world describes its cache
6483 6671 * and tlb architecture - Intel's way and AMD's way.
6484 6672 *
6485 6673 * Return which flavor of cache architecture we should use
6486 6674 */
6487 6675 static int
6488 6676 x86_which_cacheinfo(struct cpuid_info *cpi)
6489 6677 {
6490 6678 switch (cpi->cpi_vendor) {
6491 6679 case X86_VENDOR_Intel:
6492 6680 if (cpi->cpi_maxeax >= 2)
6493 6681 return (X86_VENDOR_Intel);
6494 6682 break;
6495 6683 case X86_VENDOR_AMD:
6496 6684 /*
6497 6685 * The K5 model 1 was the first part from AMD that reported
6498 6686 * cache sizes via extended cpuid functions.
6499 6687 */
6500 6688 if (cpi->cpi_family > 5 ||
6501 6689 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6502 6690 return (X86_VENDOR_AMD);
6503 6691 break;
6504 6692 case X86_VENDOR_TM:
6505 6693 if (cpi->cpi_family >= 5)
6506 6694 return (X86_VENDOR_AMD);
6507 6695 /*FALLTHROUGH*/
6508 6696 default:
6509 6697 /*
6510 6698 * If they have extended CPU data for 0x80000005
6511 6699 * then we assume they have AMD-format cache
6512 6700 * information.
6513 6701 *
6514 6702 * If not, and the vendor happens to be Cyrix,
6515 6703 * then try our Cyrix-specific handler.
6516 6704 *
6517 6705 * If we're not Cyrix, then assume we're using Intel's
6518 6706 * table-driven format instead.
6519 6707 */
6520 6708 if (cpi->cpi_xmaxeax >= 0x80000005)
6521 6709 return (X86_VENDOR_AMD);
6522 6710 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6523 6711 return (X86_VENDOR_Cyrix);
6524 6712 else if (cpi->cpi_maxeax >= 2)
6525 6713 return (X86_VENDOR_Intel);
6526 6714 break;
6527 6715 }
6528 6716 return (-1);
6529 6717 }
6530 6718
6531 6719 void
6532 6720 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6533 6721 struct cpuid_info *cpi)
6534 6722 {
6535 6723 dev_info_t *cpu_devi;
6536 6724 int create;
6537 6725
6538 6726 cpu_devi = (dev_info_t *)dip;
6539 6727
6540 6728 /* device_type */
6541 6729 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6542 6730 "device_type", "cpu");
6543 6731
6544 6732 /* reg */
6545 6733 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6546 6734 "reg", cpu_id);
6547 6735
6548 6736 /* cpu-mhz, and clock-frequency */
6549 6737 if (cpu_freq > 0) {
6550 6738 long long mul;
6551 6739
6552 6740 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6553 6741 "cpu-mhz", cpu_freq);
6554 6742 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6555 6743 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6556 6744 "clock-frequency", (int)mul);
6557 6745 }
6558 6746
6559 6747 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6560 6748 return;
6561 6749 }
6562 6750
6563 6751 /* vendor-id */
6564 6752 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6565 6753 "vendor-id", cpi->cpi_vendorstr);
6566 6754
6567 6755 if (cpi->cpi_maxeax == 0) {
6568 6756 return;
6569 6757 }
6570 6758
6571 6759 /*
6572 6760 * family, model, and step
6573 6761 */
6574 6762 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6575 6763 "family", CPI_FAMILY(cpi));
6576 6764 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6577 6765 "cpu-model", CPI_MODEL(cpi));
6578 6766 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6579 6767 "stepping-id", CPI_STEP(cpi));
6580 6768
6581 6769 /* type */
6582 6770 switch (cpi->cpi_vendor) {
6583 6771 case X86_VENDOR_Intel:
6584 6772 create = 1;
6585 6773 break;
6586 6774 default:
6587 6775 create = 0;
6588 6776 break;
6589 6777 }
6590 6778 if (create)
6591 6779 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6592 6780 "type", CPI_TYPE(cpi));
6593 6781
6594 6782 /* ext-family */
6595 6783 switch (cpi->cpi_vendor) {
6596 6784 case X86_VENDOR_Intel:
6597 6785 case X86_VENDOR_AMD:
6598 6786 create = cpi->cpi_family >= 0xf;
6599 6787 break;
6600 6788 default:
6601 6789 create = 0;
6602 6790 break;
6603 6791 }
6604 6792 if (create)
6605 6793 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6606 6794 "ext-family", CPI_FAMILY_XTD(cpi));
6607 6795
6608 6796 /* ext-model */
6609 6797 switch (cpi->cpi_vendor) {
6610 6798 case X86_VENDOR_Intel:
6611 6799 create = IS_EXTENDED_MODEL_INTEL(cpi);
6612 6800 break;
6613 6801 case X86_VENDOR_AMD:
6614 6802 create = CPI_FAMILY(cpi) == 0xf;
6615 6803 break;
6616 6804 default:
6617 6805 create = 0;
6618 6806 break;
6619 6807 }
6620 6808 if (create)
6621 6809 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6622 6810 "ext-model", CPI_MODEL_XTD(cpi));
6623 6811
6624 6812 /* generation */
6625 6813 switch (cpi->cpi_vendor) {
6626 6814 case X86_VENDOR_AMD:
6627 6815 /*
6628 6816 * AMD K5 model 1 was the first part to support this
6629 6817 */
6630 6818 create = cpi->cpi_xmaxeax >= 0x80000001;
6631 6819 break;
6632 6820 default:
6633 6821 create = 0;
6634 6822 break;
6635 6823 }
6636 6824 if (create)
6637 6825 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6638 6826 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6639 6827
6640 6828 /* brand-id */
6641 6829 switch (cpi->cpi_vendor) {
6642 6830 case X86_VENDOR_Intel:
6643 6831 /*
6644 6832 * brand id first appeared on Pentium III Xeon model 8,
6645 6833 * and Celeron model 8 processors and Opteron
6646 6834 */
6647 6835 create = cpi->cpi_family > 6 ||
6648 6836 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6649 6837 break;
6650 6838 case X86_VENDOR_AMD:
6651 6839 create = cpi->cpi_family >= 0xf;
6652 6840 break;
6653 6841 default:
6654 6842 create = 0;
6655 6843 break;
6656 6844 }
6657 6845 if (create && cpi->cpi_brandid != 0) {
6658 6846 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6659 6847 "brand-id", cpi->cpi_brandid);
6660 6848 }
6661 6849
6662 6850 /* chunks, and apic-id */
6663 6851 switch (cpi->cpi_vendor) {
6664 6852 /*
6665 6853 * first available on Pentium IV and Opteron (K8)
6666 6854 */
6667 6855 case X86_VENDOR_Intel:
6668 6856 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6669 6857 break;
6670 6858 case X86_VENDOR_AMD:
6671 6859 create = cpi->cpi_family >= 0xf;
6672 6860 break;
6673 6861 default:
6674 6862 create = 0;
6675 6863 break;
6676 6864 }
6677 6865 if (create) {
6678 6866 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6679 6867 "chunks", CPI_CHUNKS(cpi));
6680 6868 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6681 6869 "apic-id", cpi->cpi_apicid);
6682 6870 if (cpi->cpi_chipid >= 0) {
6683 6871 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6684 6872 "chip#", cpi->cpi_chipid);
6685 6873 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6686 6874 "clog#", cpi->cpi_clogid);
6687 6875 }
6688 6876 }
6689 6877
6690 6878 /* cpuid-features */
6691 6879 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6692 6880 "cpuid-features", CPI_FEATURES_EDX(cpi));
6693 6881
6694 6882
6695 6883 /* cpuid-features-ecx */
6696 6884 switch (cpi->cpi_vendor) {
6697 6885 case X86_VENDOR_Intel:
6698 6886 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6699 6887 break;
6700 6888 case X86_VENDOR_AMD:
6701 6889 create = cpi->cpi_family >= 0xf;
6702 6890 break;
6703 6891 default:
6704 6892 create = 0;
6705 6893 break;
6706 6894 }
6707 6895 if (create)
6708 6896 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6709 6897 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6710 6898
6711 6899 /* ext-cpuid-features */
6712 6900 switch (cpi->cpi_vendor) {
6713 6901 case X86_VENDOR_Intel:
6714 6902 case X86_VENDOR_AMD:
6715 6903 case X86_VENDOR_Cyrix:
6716 6904 case X86_VENDOR_TM:
6717 6905 case X86_VENDOR_Centaur:
6718 6906 create = cpi->cpi_xmaxeax >= 0x80000001;
6719 6907 break;
6720 6908 default:
6721 6909 create = 0;
6722 6910 break;
6723 6911 }
6724 6912 if (create) {
6725 6913 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6726 6914 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6727 6915 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6728 6916 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6729 6917 }
6730 6918
6731 6919 /*
6732 6920 * Brand String first appeared in Intel Pentium IV, AMD K5
6733 6921 * model 1, and Cyrix GXm. On earlier models we try and
6734 6922 * simulate something similar .. so this string should always
6735 6923 * say -something- about the processor, however lame.
6736 6924 */
6737 6925 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6738 6926 "brand-string", cpi->cpi_brandstr);
6739 6927
6740 6928 /*
6741 6929 * Finally, cache and tlb information
6742 6930 */
6743 6931 switch (x86_which_cacheinfo(cpi)) {
6744 6932 case X86_VENDOR_Intel:
6745 6933 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6746 6934 break;
6747 6935 case X86_VENDOR_Cyrix:
6748 6936 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6749 6937 break;
6750 6938 case X86_VENDOR_AMD:
6751 6939 amd_cache_info(cpi, cpu_devi);
6752 6940 break;
6753 6941 default:
6754 6942 break;
6755 6943 }
6756 6944 }
6757 6945
6758 6946 struct l2info {
6759 6947 int *l2i_csz;
6760 6948 int *l2i_lsz;
6761 6949 int *l2i_assoc;
6762 6950 int l2i_ret;
6763 6951 };
6764 6952
6765 6953 /*
6766 6954 * A cacheinfo walker that fetches the size, line-size and associativity
6767 6955 * of the L2 cache
6768 6956 */
6769 6957 static int
6770 6958 intel_l2cinfo(void *arg, const struct cachetab *ct)
6771 6959 {
6772 6960 struct l2info *l2i = arg;
6773 6961 int *ip;
6774 6962
6775 6963 if (ct->ct_label != l2_cache_str &&
6776 6964 ct->ct_label != sl2_cache_str)
6777 6965 return (0); /* not an L2 -- keep walking */
6778 6966
6779 6967 if ((ip = l2i->l2i_csz) != NULL)
6780 6968 *ip = ct->ct_size;
6781 6969 if ((ip = l2i->l2i_lsz) != NULL)
6782 6970 *ip = ct->ct_line_size;
6783 6971 if ((ip = l2i->l2i_assoc) != NULL)
6784 6972 *ip = ct->ct_assoc;
6785 6973 l2i->l2i_ret = ct->ct_size;
6786 6974 return (1); /* was an L2 -- terminate walk */
6787 6975 }
6788 6976
6789 6977 /*
6790 6978 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6791 6979 *
6792 6980 * Unlike the associativity for the L1 cache and tlb where the 8 bit
6793 6981 * value is the associativity, the associativity for the L2 cache and
6794 6982 * tlb is encoded in the following table. The 4 bit L2 value serves as
6795 6983 * an index into the amd_afd[] array to determine the associativity.
6796 6984 * -1 is undefined. 0 is fully associative.
6797 6985 */
6798 6986
6799 6987 static int amd_afd[] =
6800 6988 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
6801 6989
6802 6990 static void
6803 6991 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6804 6992 {
6805 6993 struct cpuid_regs *cp;
6806 6994 uint_t size, assoc;
6807 6995 int i;
6808 6996 int *ip;
6809 6997
6810 6998 if (cpi->cpi_xmaxeax < 0x80000006)
6811 6999 return;
6812 7000 cp = &cpi->cpi_extd[6];
6813 7001
6814 7002 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6815 7003 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6816 7004 uint_t cachesz = size * 1024;
6817 7005 assoc = amd_afd[i];
6818 7006
6819 7007 ASSERT(assoc != -1);
6820 7008
6821 7009 if ((ip = l2i->l2i_csz) != NULL)
6822 7010 *ip = cachesz;
6823 7011 if ((ip = l2i->l2i_lsz) != NULL)
6824 7012 *ip = BITX(cp->cp_ecx, 7, 0);
6825 7013 if ((ip = l2i->l2i_assoc) != NULL)
6826 7014 *ip = assoc;
6827 7015 l2i->l2i_ret = cachesz;
6828 7016 }
6829 7017 }
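
To make the amd_afd[] decode concrete: a 4-bit field value of 6 means 8-way, 0xf means fully associative, and the -1 entries are reserved encodings. A minimal sketch using the same table (the sample field value is illustrative):

    #include <stdio.h>

    /* Same L2/L3 associativity decode table as amd_afd[] above. */
    static int afd[] =
    	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};

    int
    main(void)
    {
    	unsigned int field = 0x6;	/* illustrative 4-bit CPUID associativity field */
    	int assoc = afd[field & 0xf];

    	if (assoc == -1)
    		printf("reserved encoding\n");
    	else if (assoc == 0)
    		printf("fully associative\n");
    	else
    		printf("%d-way set associative\n", assoc);
    	return (0);
    }
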
6830 7018
6831 7019 int
6832 7020 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6833 7021 {
6834 7022 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6835 7023 struct l2info __l2info, *l2i = &__l2info;
6836 7024
6837 7025 l2i->l2i_csz = csz;
6838 7026 l2i->l2i_lsz = lsz;
6839 7027 l2i->l2i_assoc = assoc;
6840 7028 l2i->l2i_ret = -1;
6841 7029
6842 7030 switch (x86_which_cacheinfo(cpi)) {
6843 7031 case X86_VENDOR_Intel:
6844 7032 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6845 7033 break;
6846 7034 case X86_VENDOR_Cyrix:
6847 7035 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6848 7036 break;
6849 7037 case X86_VENDOR_AMD:
6850 7038 amd_l2cacheinfo(cpi, l2i);
6851 7039 break;
6852 7040 default:
6853 7041 break;
6854 7042 }
6855 7043 return (l2i->l2i_ret);
6856 7044 }
6857 7045
6858 7046 #if !defined(__xpv)
6859 7047
6860 7048 uint32_t *
6861 7049 cpuid_mwait_alloc(cpu_t *cpu)
6862 7050 {
6863 7051 uint32_t *ret;
6864 7052 size_t mwait_size;
6865 7053
6866 7054 ASSERT(cpuid_checkpass(CPU, 2));
6867 7055
6868 7056 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6869 7057 if (mwait_size == 0)
6870 7058 return (NULL);
6871 7059
6872 7060 /*
6873 7061 * kmem_alloc() returns cache line size aligned data for mwait_size
6874 7062 * allocations. mwait_size is currently cache line sized. Neither
6875 7063 * of these implementation details is guaranteed to be true in the
6876 7064 * future.
6877 7065 *
6878 7066 * First try allocating mwait_size as kmem_alloc() currently returns
6879 7067 * correctly aligned memory. If kmem_alloc() does not return
6880 7068 * mwait_size aligned memory, then use mwait_size ROUNDUP.
6881 7069 *
6882 7070 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6883 7071 * decide to free this memory.
6884 7072 */
6885 7073 ret = kmem_zalloc(mwait_size, KM_SLEEP);
6886 7074 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6887 7075 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6888 7076 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6889 7077 *ret = MWAIT_RUNNING;
6890 7078 return (ret);
6891 7079 } else {
6892 7080 kmem_free(ret, mwait_size);
6893 7081 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6894 7082 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6895 7083 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6896 7084 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6897 7085 *ret = MWAIT_RUNNING;
6898 7086 return (ret);
6899 7087 }
6900 7088 }
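
The fallback path above over-allocates by a factor of two and rounds the pointer up with P2ROUNDUP() so that a naturally aligned buffer of mwait_size is guaranteed to fit somewhere inside the allocation. A userland sketch of that alignment trick, with P2ROUNDUP re-expressed locally (valid only for power-of-two alignments, as in the kernel):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Round x up to the next multiple of the power-of-two 'align'. */
    #define P2ROUNDUP(x, align)	(-(-(uintptr_t)(x) & -(uintptr_t)(align)))

    int
    main(void)
    {
    	size_t mwait_size = 64;			/* illustrative monitor-line size */
    	void *raw = malloc(mwait_size * 2);	/* over-allocate, like the fallback path */

    	if (raw == NULL)
    		return (1);
    	uintptr_t aligned = P2ROUNDUP(raw, mwait_size);
    	printf("raw %p, aligned buffer at 0x%lx\n", raw, (unsigned long)aligned);
    	free(raw);
    	return (0);
    }
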
6901 7089
6902 7090 void
6903 7091 cpuid_mwait_free(cpu_t *cpu)
6904 7092 {
6905 7093 if (cpu->cpu_m.mcpu_cpi == NULL) {
6906 7094 return;
6907 7095 }
6908 7096
6909 7097 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6910 7098 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6911 7099 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6912 7100 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6913 7101 }
6914 7102
6915 7103 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6916 7104 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6917 7105 }
6918 7106
6919 7107 void
6920 7108 patch_tsc_read(int flag)
6921 7109 {
6922 7110 size_t cnt;
6923 7111
6924 7112 switch (flag) {
6925 7113 case TSC_NONE:
6926 7114 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6927 7115 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6928 7116 break;
6929 7117 case TSC_RDTSC_MFENCE:
6930 7118 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6931 7119 (void) memcpy((void *)tsc_read,
6932 7120 (void *)&_tsc_mfence_start, cnt);
6933 7121 break;
6934 7122 case TSC_RDTSC_LFENCE:
6935 7123 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6936 7124 (void) memcpy((void *)tsc_read,
6937 7125 (void *)&_tsc_lfence_start, cnt);
6938 7126 break;
6939 7127 case TSC_TSCP:
6940 7128 cnt = &_tscp_end - &_tscp_start;
6941 7129 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6942 7130 break;
6943 7131 default:
6944 7132 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6945 7133 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
6946 7134 break;
6947 7135 }
6948 7136 tsc_type = flag;
6949 7137 }
6950 7138
6951 7139 int
6952 7140 cpuid_deep_cstates_supported(void)
6953 7141 {
6954 7142 struct cpuid_info *cpi;
6955 7143 struct cpuid_regs regs;
6956 7144
6957 7145 ASSERT(cpuid_checkpass(CPU, 1));
6958 7146
6959 7147 cpi = CPU->cpu_m.mcpu_cpi;
6960 7148
6961 7149 if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6962 7150 return (0);
6963 7151
6964 7152 switch (cpi->cpi_vendor) {
6965 7153 case X86_VENDOR_Intel:
6966 7154 if (cpi->cpi_xmaxeax < 0x80000007)
6967 7155 return (0);
6968 7156
6969 7157 /*
6970 7158 * Does the TSC run at a constant rate in all ACPI C-states?
6971 7159 */
6972 7160 regs.cp_eax = 0x80000007;
6973 7161 (void) __cpuid_insn(&regs);
6974 7162 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6975 7163
6976 7164 default:
6977 7165 return (0);
6978 7166 }
6979 7167 }
6980 7168
6981 7169 #endif /* !__xpv */
6982 7170
6983 7171 void
6984 7172 post_startup_cpu_fixups(void)
6985 7173 {
6986 7174 #ifndef __xpv
6987 7175 /*
6988 7176 * Some AMD processors support C1E state. Entering this state will
6989 7177 * cause the local APIC timer to stop, which we can't deal with at
6990 7178 * this time.
6991 7179 */
6992 7180 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6993 7181 on_trap_data_t otd;
6994 7182 uint64_t reg;
6995 7183
6996 7184 if (!on_trap(&otd, OT_DATA_ACCESS)) {
6997 7185 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6998 7186 /* Disable C1E state if it is enabled by BIOS */
6999 7187 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7000 7188 AMD_ACTONCMPHALT_MASK) {
7001 7189 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7002 7190 AMD_ACTONCMPHALT_SHIFT);
7003 7191 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7004 7192 }
7005 7193 }
7006 7194 no_trap();
7007 7195 }
7008 7196 #endif /* !__xpv */
7009 7197 }
7010 7198
7011 7199 void
7012 7200 enable_pcid(void)
7013 7201 {
7014 7202 if (x86_use_pcid == -1)
7015 7203 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7016 7204
7017 7205 if (x86_use_invpcid == -1) {
7018 7206 x86_use_invpcid = is_x86_feature(x86_featureset,
7019 7207 X86FSET_INVPCID);
7020 7208 }
7021 7209
7022 7210 if (!x86_use_pcid)
7023 7211 return;
7024 7212
7025 7213 /*
7026 7214 * Intel says that on setting PCIDE, it immediately starts using the PCID
7027 7215 * bits; better make sure there's nothing there.
7028 7216 */
7029 7217 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7030 7218
7031 7219 setcr4(getcr4() | CR4_PCIDE);
7032 7220 }
7033 7221
7034 7222 /*
7035 7223 * Setup necessary registers to enable XSAVE feature on this processor.
7036 7224 * This function needs to be called early enough, so that no xsave/xrstor
7037 7225 * ops will execute on the processor before the MSRs are properly set up.
7038 7226 *
7039 7227 * Current implementation has the following assumption:
7040 7228 * - cpuid_pass1() is done, so that X86 features are known.
7041 7229 * - fpu_probe() is done, so that fp_save_mech is chosen.
7042 7230 */
7043 7231 void
7044 7232 xsave_setup_msr(cpu_t *cpu)
7045 7233 {
7046 7234 ASSERT(fp_save_mech == FP_XSAVE);
7047 7235 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7048 7236
7049 7237 /* Enable OSXSAVE in CR4. */
7050 7238 setcr4(getcr4() | CR4_OSXSAVE);
7051 7239 /*
7052 7240 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7053 7241 * correct value.
7054 7242 */
7055 7243 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7056 7244 setup_xfem();
7057 7245 }
7058 7246
7059 7247 /*
7060 7248 * Starting with the Westmere processor the local
7061 7249 * APIC timer will continue running in all C-states,
7062 7250 * including the deepest C-states.
7063 7251 */
7064 7252 int
7065 7253 cpuid_arat_supported(void)
7066 7254 {
7067 7255 struct cpuid_info *cpi;
7068 7256 struct cpuid_regs regs;
7069 7257
7070 7258 ASSERT(cpuid_checkpass(CPU, 1));
7071 7259 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7072 7260
7073 7261 cpi = CPU->cpu_m.mcpu_cpi;
7074 7262
7075 7263 switch (cpi->cpi_vendor) {
7076 7264 case X86_VENDOR_Intel:
7077 7265 /*
7078 7266 * Always-running Local APIC Timer is
7079 7267 * indicated by CPUID.6.EAX[2].
7080 7268 */
7081 7269 if (cpi->cpi_maxeax >= 6) {
7082 7270 regs.cp_eax = 6;
7083 7271 (void) cpuid_insn(NULL, &regs);
7084 7272 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7085 7273 } else {
7086 7274 return (0);
7087 7275 }
7088 7276 default:
7089 7277 return (0);
7090 7278 }
7091 7279 }
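
The check above boils down to reading leaf 6 and testing bit 2 of %eax. A hypothetical userland equivalent, assuming the GCC/Clang <cpuid.h> __get_cpuid() helper rather than the kernel's cpuid_insn():

    #include <stdio.h>
    #include <cpuid.h>

    int
    main(void)
    {
    	unsigned int eax, ebx, ecx, edx;

    	/* Leaf 6 (thermal/power); %eax bit 2 is the always-running APIC timer. */
    	if (__get_cpuid(6, &eax, &ebx, &ecx, &edx) == 0) {
    		printf("leaf 6 not available\n");
    		return (1);
    	}
    	printf("ARAT %ssupported\n", (eax & (1U << 2)) ? "" : "not ");
    	return (0);
    }
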
7092 7280
7093 7281 /*
7094 7282 * Check support for Intel ENERGY_PERF_BIAS feature
7095 7283 */
7096 7284 int
7097 7285 cpuid_iepb_supported(struct cpu *cp)
7098 7286 {
7099 7287 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7100 7288 struct cpuid_regs regs;
7101 7289
7102 7290 ASSERT(cpuid_checkpass(cp, 1));
7103 7291
7104 7292 if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7105 7293 !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7106 7294 return (0);
7107 7295 }
7108 7296
7109 7297 /*
7110 7298 * Intel ENERGY_PERF_BIAS MSR is indicated by
7111 7299 * capability bit CPUID.6.ECX.3
7112 7300 */
7113 7301 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7114 7302 return (0);
7115 7303
7116 7304 regs.cp_eax = 0x6;
7117 7305 (void) cpuid_insn(NULL, &regs);
7118 7306 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7119 7307 }
7120 7308
7121 7309 /*
7122 7310 * Check support for TSC deadline timer
7123 7311 *
7124 7312 * TSC deadline timer provides a superior software programming
7125 7313 * model over local APIC timer that eliminates "time drifts".
7126 7314 * Instead of specifying a relative time, software specifies an
7127 7315 * absolute time as the target at which the processor should
7128 7316 * generate a timer event.
7129 7317 */
7130 7318 int
7131 7319 cpuid_deadline_tsc_supported(void)
7132 7320 {
7133 7321 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7134 7322 struct cpuid_regs regs;
7135 7323
7136 7324 ASSERT(cpuid_checkpass(CPU, 1));
7137 7325 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7138 7326
7139 7327 switch (cpi->cpi_vendor) {
7140 7328 case X86_VENDOR_Intel:
7141 7329 if (cpi->cpi_maxeax >= 1) {
7142 7330 regs.cp_eax = 1;
7143 7331 (void) cpuid_insn(NULL, &regs);
7144 7332 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7145 7333 } else {
7146 7334 return (0);
7147 7335 }
7148 7336 default:
7149 7337 return (0);
7150 7338 }
7151 7339 }
7152 7340
7153 7341 #if defined(__amd64) && !defined(__xpv)
7154 7342 /*
7155 7343 * Patch in versions of bcopy for high performance Intel Nhm processors
7156 7344 * and later...
7157 7345 */
7158 7346 void
7159 7347 patch_memops(uint_t vendor)
7160 7348 {
7161 7349 size_t cnt, i;
7162 7350 caddr_t to, from;
7163 7351
7164 7352 if ((vendor == X86_VENDOR_Intel) &&
7165 7353 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7166 7354 cnt = &bcopy_patch_end - &bcopy_patch_start;
7167 7355 to = &bcopy_ck_size;
7168 7356 from = &bcopy_patch_start;
7169 7357 for (i = 0; i < cnt; i++) {
7170 7358 *to++ = *from++;
7171 7359 }
7172 7360 }
7173 7361 }
7174 7362 #endif /* __amd64 && !__xpv */
7175 7363
7176 7364 /*
7177 7365 * We're being asked to tell the system how many bits are required to represent
7178 7366 * the various thread and strand IDs. While it's tempting to derive this based
7179 7367 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7180 7368 * correct. Instead, this needs to be based on the number of bits that the APIC
7181 7369 * allows for these different configurations. We only update these to a larger
7182 7370 * value if we find one.
7183 7371 */
7184 7372 void
7185 7373 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7186 7374 {
7187 7375 struct cpuid_info *cpi;
7188 7376
7189 7377 VERIFY(cpuid_checkpass(CPU, 1));
7190 7378 cpi = cpu->cpu_m.mcpu_cpi;
7191 7379
7192 7380 if (cpi->cpi_ncore_bits > *core_nbits) {
7193 7381 *core_nbits = cpi->cpi_ncore_bits;
7194 7382 }
7195 7383
7196 7384 if (cpi->cpi_nthread_bits > *strand_nbits) {
7197 7385 *strand_nbits = cpi->cpi_nthread_bits;
7198 7386 }
7199 7387 }
7200 7388
7201 7389 void
7202 7390 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7203 7391 {
7204 7392 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7205 7393 struct cpuid_regs cp;
7206 7394
7207 7395 /*
7208 7396 * Reread the CPUID portions that we need for various security
7209 7397 * information.
7210 7398 */
7211 7399 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7212 7400 /*
7213 7401 * Check if we now have leaf 7 available to us.
7214 7402 */
7215 7403 if (cpi->cpi_maxeax < 7) {
7216 7404 bzero(&cp, sizeof (cp));
7217 7405 cp.cp_eax = 0;
7218 7406 cpi->cpi_maxeax = __cpuid_insn(&cp);
7219 7407 if (cpi->cpi_maxeax < 7)
7220 7408 return;
7221 7409 }
7222 7410
7223 7411 bzero(&cp, sizeof (cp));
7224 7412 cp.cp_eax = 7;
7225 7413 cp.cp_ecx = 0;
7226 7414 (void) __cpuid_insn(&cp);
7227 7415 cpi->cpi_std[7] = cp;
7228 7416 } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7229 7417 /* No xcpuid support */
7230 7418 if (cpi->cpi_family < 5 ||
7231 7419 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7232 7420 return;
7233 7421
7234 7422 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7235 7423 bzero(&cp, sizeof (cp));
7236 7424 cp.cp_eax = CPUID_LEAF_EXT_0;
7237 7425 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7238 7426 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7239 7427 return;
7240 7428 }
7241 7429 }
7242 7430
7243 7431 bzero(&cp, sizeof (cp));
7244 7432 cp.cp_eax = CPUID_LEAF_EXT_8;
7245 7433 (void) __cpuid_insn(&cp);
7246 7434 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7247 7435 cpi->cpi_extd[8] = cp;
7248 7436 } else {
7249 7437 /*
7250 7438 * Nothing to do here. Return an empty set which has already
7251 7439 * been zeroed for us.
7252 7440 */
7253 7441 return;
7254 7442 }
7255 7443 cpuid_scan_security(cpu, fset);
7256 7444 }
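/*
 * The leaves re-read above (standard leaf 7 on Intel, extended leaf 8 on
 * AMD) are the main inputs to cpuid_scan_security(); a microcode update
 * can cause previously-unenumerated bits in them to appear, which is why
 * they are refreshed here rather than reused from the earlier passes.
 */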
7257 7445
7258 7446 /* ARGSUSED */
7259 7447 static int
7260 7448 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7261 7449 {
7262 7450 uchar_t *fset;
7263 7451 boolean_t first_pass = (boolean_t)arg1;
7264 7452
7265 7453 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7266 7454 if (first_pass && CPU->cpu_id != 0)
7267 7455 return (0);
7268 7456 if (!first_pass && CPU->cpu_id == 0)
7269 7457 return (0);
7270 7458 cpuid_pass_ucode(CPU, fset);
7271 7459
7272 7460 return (0);
7273 7461 }
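/*
 * The cross-call argument is laid out as an NCPU-wide array of feature
 * bitmaps: each CPU records its results at offset
 * sizeof (x86_featureset) * cpu_id into the buffer allocated by
 * cpuid_post_ucodeadm() below.  arg1 selects the pass: only the boot CPU
 * rescans on the first pass, and everyone else does on the second.
 */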
7274 7462
7275 7463 /*
7276 7464  * After a microcode update where the version has changed, we need to rescan
7277 7465  * CPUID. To do this we check every CPU to make sure that it has the same
7278 7466  * microcode, then we perform a cross call to all such CPUs. It's the
7279 7467 * caller's job to make sure that no one else can end up doing an update while
7280 7468 * this is going on.
7281 7469 *
7282 7470 * We assume that the system is microcode capable if we're called.
7283 7471 */
7284 7472 void
7285 7473 cpuid_post_ucodeadm(void)
7286 7474 {
7287 7475 uint32_t rev;
7288 7476 int i;
7289 7477 struct cpu *cpu;
7290 7478 cpuset_t cpuset;
7291 7479 void *argdata;
7292 7480 uchar_t *f0;
7293 7481
7294 7482 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7295 7483
7296 7484 mutex_enter(&cpu_lock);
7297 7485 cpu = cpu_get(0);
7298 7486 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7299 7487 CPUSET_ONLY(cpuset, 0);
7300 7488 for (i = 1; i < max_ncpus; i++) {
7301 7489 if ((cpu = cpu_get(i)) == NULL)
7302 7490 continue;
7303 7491
7304 7492 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7305 7493 panic("post microcode update CPU %d has differing "
7306 7494 "microcode revision (%u) from CPU 0 (%u)",
7307 7495 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7308 7496 }
7309 7497 CPUSET_ADD(cpuset, i);
7310 7498 }
7311 7499
7312 7500 /*
7313 7501 * We do the cross calls in two passes. The first pass is only for the
7314 7502 * boot CPU. The second pass is for all of the other CPUs. This allows
7315 7503  * the boot CPU to go through and change behavior related to patching or
7316 7504  * whether or not Enhanced IBRS needs to be enabled, and then lets all
7317 7505  * other CPUs follow suit.
7318 7506 */
7319 7507 kpreempt_disable();
7320 7508 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7321 7509 cpuid_post_ucodeadm_xc);
7322 7510 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7323 7511 cpuid_post_ucodeadm_xc);
7324 7512 kpreempt_enable();
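	/*
	 * xc_sync() does not return until every CPU in cpuset has run the
	 * handler, so by this point the boot CPU and all other CPUs have
	 * completed their rescans and argdata holds each CPU's feature
	 * bitmap.
	 */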
7325 7513
7326 7514 /*
7327 7515  * OK, now look at each CPU and see if its feature set matches the boot CPU's.
7328 7516 */
7329 7517 f0 = argdata;
7330 7518 for (i = 1; i < max_ncpus; i++) {
7331 7519 uchar_t *fset;
7332 7520 if (!CPU_IN_SET(cpuset, i))
7333 7521 continue;
7334 7522
7335 7523 fset = (uchar_t *)((uintptr_t)argdata +
7336 7524 sizeof (x86_featureset) * i);
7337 7525
7338 7526 if (!compare_x86_featureset(f0, fset)) {
7339 7527 panic("Post microcode update CPU %d has "
7340 7528 "differing security feature (%p) set from CPU 0 "
7341 7529 "(%p), not appending to feature set", i,
7342 7530 (void *)fset, (void *)f0);
7343 7531 }
7344 7532 }
7345 7533
7346 7534 mutex_exit(&cpu_lock);
7347 7535
7348 7536 for (i = 0; i < NUM_X86_FEATURES; i++) {
7349 7537 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7350 7538 x86_feature_names[i]);
7351 7539 if (is_x86_feature(f0, i)) {
7352 7540 add_x86_feature(x86_featureset, i);
7353 7541 }
7354 7542 }
7355 7543 kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7356 7544 }
↓ open down ↓ |
4482 lines elided |
↑ open up ↑ |