--- old/usr/src/uts/i86pc/os/cpuid.c
+++ new/usr/src/uts/i86pc/os/cpuid.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 + * Copyright 2020 Joyent, Inc.
26 27 */
27 28 /*
28 29 * Copyright (c) 2010, Intel Corporation.
29 30 * All rights reserved.
30 31 */
31 32 /*
32 33 * Portions Copyright 2009 Advanced Micro Devices, Inc.
33 34 */
34 -/*
35 - * Copyright 2020 Joyent, Inc.
36 - */
37 35
38 36 /*
39 37 * CPU Identification logic
40 38 *
41 39 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 40 * with the identification of CPUs, their features, and their topologies. More
43 41 * specifically, this file helps drive the following:
44 42 *
45 43 * 1. Enumeration of features of the processor which are used by the kernel to
46 44 * determine what features to enable or disable. These may be instruction set
47 45 * enhancements or features that we use.
48 46 *
49 47 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 48 * will be told about through the auxiliary vector.
51 49 *
52 50 * 3. Understanding the physical topology of the CPU such as the number of
53 51 * caches, how many cores it has, whether or not it supports symmetric
54 52 * multi-processing (SMT), etc.
55 53 *
56 54 * ------------------------
57 55 * CPUID History and Basics
58 56 * ------------------------
59 57 *
60 58 * The cpuid instruction was added by Intel roughly around the time that the
61 59 * original Pentium was introduced. The purpose of cpuid was to tell in a
62 60 * programmatic fashion information about the CPU that previously was guessed
63 61 * at. For example, an important part of cpuid is that we can know what
64 62 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 63 * #UD, so this method allows a program (whether a user program or the kernel)
66 64 * to determine what exists without crashing or getting a SIGILL. Of course,
67 65 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 66 * name shows up first in cpuid for a reason.
69 67 *
70 68 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71 69 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 70 * its own meaning. The different leaves are broken down into different regions:
73 71 *
74 72 * [ 0, 7fffffff ] This region is called the 'basic'
75 73 * region. This region is generally defined
76 74 * by Intel, though some of the original
77 75 * portions have different meanings based
78 76 * on the manufacturer. These days, Intel
79 77 * adds most new features to this region.
80 78 * AMD adds non-Intel compatible
81 79 * information in the third, extended
82 80 * region. Intel uses this for everything
83 81 * including ISA extensions, CPU
84 82 * features, cache information, topology,
85 83 * and more.
86 84 *
87 85 * There is a hole carved out of this
88 86 * region which is reserved for
89 87 * hypervisors.
90 88 *
91 89 * [ 40000000, 4fffffff ] This region, which is found in the
92 90 * middle of the previous region, is
93 91 * explicitly promised to never be used by
94 92 * CPUs. Instead, it is used by hypervisors
95 93 * to communicate information about
96 94 * themselves to the operating system. The
97 95 * values and details are unique for each
98 96 * hypervisor.
99 97 *
100 98 * [ 80000000, ffffffff ] This region is called the 'extended'
101 99 * region. Some of the low leaves mirror
102 100 * parts of the basic leaves. This region
103 101 * has generally been used by AMD for
104 102 * various extensions. For example, AMD-
105 103 * specific information about caches,
106 104 * features, and topology are found in this
107 105 * region.
108 106 *
109 107 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110 108 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111 109 * the ranges, one of the primary things returned is the maximum valid leaf in
112 110 * that range. This allows for discovery of what range of CPUID is valid.
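
    [Editor's illustration] As a quick sketch of this discovery step, the following
    userland program issues the instruction for leaf 0 and leaf 0x80000000 and prints
    the maximum valid leaf in each range. The cpuid() wrapper is made up for this
    example (it is not part of this file) and simply wraps the instruction with
    GCC-style inline assembly.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative wrapper: issue cpuid for the requested leaf, sub-leaf 0. */
    static void
    cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
    {
            __asm__ __volatile__("cpuid"
                : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                : "0" (leaf), "2" (0U));
    }

    int
    main(void)
    {
            uint32_t eax, ebx, ecx, edx;

            cpuid(0, &eax, &ebx, &ecx, &edx);            /* max basic leaf in %eax */
            printf("max basic leaf:    0x%x\n", eax);

            cpuid(0x80000000u, &eax, &ebx, &ecx, &edx);  /* max extended leaf in %eax */
            printf("max extended leaf: 0x%x\n", eax);
            return (0);
    }
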
113 111 *
114 112 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 113 * unimplemented leaf. If the requested leaf is within the valid basic or
116 114 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 115 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 116 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 117 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 118 * an invalid extended leaf will return the information for leaf 3.
121 119 *
122 120 * Some leaves are broken down into sub-leaves. This means that the value
123 121 * depends on both the leaf asked for in %eax and a secondary register. For
124 122 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 123 * additional information. Or when getting topology information in leaf 0xb, the
126 124 * initial value in %ecx changes which level of the topology that you are
127 125 * getting information about.
128 126 *
129 127 * cpuid values are always kept to 32 bits regardless of whether or not the
130 128 * program is in 64-bit mode. When executing in 64-bit mode, the upper
131 129 * 32 bits of the register are always set to zero so that way the values are the
132 130 * same regardless of execution mode.
133 131 *
134 132 * ----------------------
135 133 * Identifying Processors
136 134 * ----------------------
137 135 *
138 136 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 137 * 0. Leaf 0 contains the processor's vendor information. This is done by
140 138 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141 139 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
142 140 *
143 141 * From there, a processor is identified by a combination of three different
144 142 * values:
145 143 *
146 144 * 1. Family
147 145 * 2. Model
148 146 * 3. Stepping
149 147 *
150 148 * Each vendor uses the family and model to uniquely identify a processor. The
151 149 * way that family and model are changed depends on the vendor. For example,
152 150 * Intel has been using family 0x6 for almost all of their processors since the
153 151 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 152 * identify the exact processor. Different models are often used for the client
155 153 * (consumer) and server parts. Even though each processor often has major
156 154 * architectural differences, they still are considered the same family by
157 155 * Intel.
158 156 *
159 157 * On the other hand, each major AMD architecture generally has its own family.
160 158 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
161 159 * family, the model number is used to help identify specific processors.
162 160 *
163 161 * The stepping is used to refer to a revision of a specific microprocessor. The
164 162 * term comes from equipment used to produce masks that are used to create
165 163 * integrated circuits.
166 164 *
167 165 * The information is present in leaf 1, %eax. In technical documentation you
168 166 * will see the terms extended model and extended family. The original family,
169 167 * model, and stepping fields were each 4 bits wide. If the values in either
170 168 * are 0xf, then one is to consult the extended model and extended family, which
171 169 * take previously reserved bits and allow for a larger number of models and add
172 170 * 0xf to them.
173 171 *
174 172 * When we process this information, we store the full family, model, and
175 173 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176 174 * cpi_step, respectively. Whenever you are performing comparisons with the
177 175 * family, model, and stepping, you should use these members and not the raw
178 176 * values from cpuid. If you must use the raw values from cpuid directly, you
179 177 * must make sure that you add the extended model and family to the base model
180 178 * and family.
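
    [Editor's illustration] A hedged sketch of the decoding described above follows.
    The field positions come from the vendor manuals; the exact conditions under
    which the extended fields are applied differ slightly between vendors, and the
    helper name is made up for this example rather than taken from this file.

    #include <stdint.h>

    /*
     * Illustrative decode of CPUID leaf 1, %eax into family/model/stepping.
     * The base fields are 4 bits wide; the extended family and model fields
     * are folded in roughly as the comment above describes.
     */
    static void
    decode_fms(uint32_t eax, uint32_t *family, uint32_t *model, uint32_t *stepping)
    {
            uint32_t fam = (eax >> 8) & 0xf;
            uint32_t mod = (eax >> 4) & 0xf;
            uint32_t xfam = (eax >> 20) & 0xff;
            uint32_t xmod = (eax >> 16) & 0xf;

            if (fam == 0xf) {
                    fam += xfam;
                    mod += xmod << 4;
            } else if (fam == 0x6) {
                    /* Intel also applies the extended model to family 0x6. */
                    mod += xmod << 4;
            }

            *family = fam;
            *model = mod;
            *stepping = eax & 0xf;
    }
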
181 179 *
182 180 * In general, we do not use information about the family, model, and stepping
183 181 * to determine whether or not a feature is present; that is generally driven by
184 182 * specific leaves. However, when something we care about on the processor is
185 183 * not considered 'architectural' meaning that it is specific to a set of
186 184 * processors and not promised in the architecture model to be consistent from
187 185 * generation to generation, then we will fall back on this information. The
188 186 * most common cases where this comes up are when we have to work around errata in
189 187 * the processor, are dealing with processor-specific features such as CPU
190 188 * performance counters, or we want to provide additional information for things
191 189 * such as fault management.
192 190 *
193 191 * While processors also do have a brand string, which is the name that people
194 192 * are familiar with when buying the processor, it is not meant for
195 193 * programmatic consumption. That is what the family, model, and stepping are
196 194 * for.
197 195 *
198 196 * ------------
199 197 * CPUID Passes
200 198 * ------------
201 199 *
202 200 * As part of performing feature detection, we break this into several different
203 201 * passes. The passes are as follows:
204 202 *
205 203 * Pass 0 This is a primordial pass done in locore.s to deal with
206 204 * Cyrix CPUs that don't support cpuid. The reality is that
207 205 * we likely don't run on them any more, but there is still
208 206 * logic for handling them.
209 207 *
210 208 * Pass 1 This is the primary pass and is responsible for doing a
211 209 * large number of different things:
212 210 *
213 211 * 1. Determining which vendor manufactured the CPU and the
214 212 * family, model, and stepping information.
215 213 *
216 214 * 2. Gathering a large number of feature flags to
217 215 * determine which features the CPU supports and which
218 216 * indicate things that we need to do other work in the OS
219 217 * to enable. Features detected this way are added to the
220 218 * x86_featureset which can be queried to
221 219 * determine what we should do. This includes processing
222 220 * all of the basic and extended CPU features that we care
223 221 * about.
224 222 *
225 223 * 3. Determining the CPU's topology. This includes
226 224 * information about how many cores and threads are present
227 225 * in the package. It also is responsible for figuring out
228 226 * which logical CPUs are potentially part of the same core
229 227 * and what other resources they might share. For more
230 228 * information see the 'Topology' section.
231 229 *
232 230 * 4. Determining the set of CPU security-specific features
233 231 * that we need to worry about and determining the
234 232 * appropriate set of workarounds.
235 233 *
236 234 * Pass 1 on the boot CPU occurs before KMDB is started.
237 235 *
238 236 * Pass 2 The second pass is done after startup(). Here, we check
239 237 * other miscellaneous features. Most of this is gathering
240 238 * additional basic and extended features that we'll use in
241 239 * later passes or for debugging support.
242 240 *
243 241 * Pass 3 The third pass occurs after the kernel memory allocator
244 242 * has been fully initialized. This gathers information
245 243 * where we might need dynamic memory available for our
246 244 * uses. This includes several varying width leaves that
247 245 * have cache information and the processor's brand string.
248 246 *
249 247 * Pass 4 The fourth and final normal pass is performed after the
250 248 * kernel has brought most everything online. This is
251 249 * invoked from post_startup(). In this pass, we go through
252 250 * the set of features that we have enabled and turn that
253 251 * into the hardware auxiliary vector features that
254 252 * userland receives. This is used by userland, primarily
255 253 * by the run-time link-editor (RTLD), though userland
256 254 * software could also refer to it directly.
257 255 *
258 256 * Microcode After a microcode update, we do a selective rescan of
259 257 * the cpuid leaves to determine what features have
260 258 * changed. Microcode updates can provide more details
261 259 * about security related features to deal with issues like
262 260 * Spectre and L1TF. On occasion, vendors have violated
263 261 * their contract and removed bits. However, we don't try
264 262 * to detect that because that puts us in a situation that
265 263 * we really can't deal with. As such, the only thing we
266 264 * rescan are security related features today. See
267 265 * cpuid_pass_ucode().
268 266 *
269 267 * All of the passes (except pass 0) are run on all CPUs. However, for the most
270 268 * part we only care about what the boot CPU says about this information and use
271 269 * the other CPUs as a rough guide to sanity check that we have the same feature
272 270 * set.
273 271 *
274 272 * We do not support running multiple logical CPUs with disjoint, let alone
275 273 * different, feature sets.
276 274 *
277 275 * ------------------
278 276 * Processor Topology
279 277 * ------------------
280 278 *
281 279 * One of the important things that we need to do is to understand the topology
282 280 * of the underlying processor. When we say topology in this case, we're trying
283 281 * to understand the relationship between the logical CPUs that the operating
284 282 * system sees and the underlying physical layout. Different logical CPUs may
285 283 * share different resources which can have important consequences for the
286 284 * performance of the system. For example, they may share caches, execution
287 285 * units, and more.
288 286 *
289 287 * The topology of the processor changes from generation to generation and
290 288 * vendor to vendor. Along with that, different vendors use different
291 289 * terminology, and the operating system itself uses occasionally overlapping
292 290 * terminology. It's important to understand what this topology looks like so
293 291 * one can understand the different things that we try to calculate and
294 292 * determine.
295 293 *
296 294 * To get started, let's talk about a little bit of terminology that we've used
297 295 * so far, is used throughout this file, and is fairly generic across multiple
298 296 * vendors:
299 297 *
300 298 * CPU
301 299 * A central processing unit (CPU) refers to a logical and/or virtual
302 300 * entity that the operating system can execute instructions on. The
303 301 * underlying resources for this CPU may be shared between multiple
304 302 * entities; however, to the operating system it is a discrete unit.
305 303 *
306 304 * PROCESSOR and PACKAGE
307 305 *
308 306 * Generally, when we use the term 'processor' on its own, we are referring
309 307 * to the physical entity that one buys and plugs into a board. However,
310 308 * because processor has been overloaded and one might see it used to mean
311 309 * multiple different levels, we will instead use the term 'package' for
312 310 * the rest of this file. The term package comes from the electrical
313 311 * engineering side and refers to the physical entity that encloses the
314 312 * electronics inside. Strictly speaking the package can contain more than
315 313 * just the CPU, for example, on many processors it may also have what's
316 314 * called an 'integrated graphical processing unit (GPU)'. Because the
317 315 * package can encapsulate multiple units, it is the largest physical unit
318 316 * that we refer to.
319 317 *
320 318 * SOCKET
321 319 *
322 320 * A socket refers to a unit on a system board (generally the motherboard)
323 321 * that can receive a package. A single package, or processor, is plugged
324 322 * into a single socket. A system may have multiple sockets. Often times,
325 323 * the term socket is used interchangeably with package and refers to the
326 324 * electrical component that has been plugged in, and not the receptacle itself.
327 325 *
328 326 * CORE
329 327 *
330 328 * A core refers to the physical instantiation of a CPU, generally, with a
331 329 * full set of hardware resources available to it. A package may contain
332 330 * multiple cores inside of it or it may just have a single one. A
333 331 * processor with more than one core is often referred to as 'multi-core'.
334 332 * In illumos, we will use the feature X86FSET_CMP to refer to a system
335 333 * that has 'multi-core' processors.
336 334 *
337 335 * A core may expose a single logical CPU to the operating system, or it
338 336 * may expose multiple CPUs, which we call threads, defined below.
339 337 *
340 338 * Some resources may still be shared by cores in the same package. For
341 339 * example, many processors will share the level 3 cache between cores.
342 340 * Some AMD generations share hardware resources between cores. For more
343 341 * information on that see the section 'AMD Topology'.
344 342 *
345 343 * THREAD and STRAND
346 344 *
347 345 * In this file, generally a thread refers to a hardware resource and not
348 346 * the operating system's logical abstraction. A thread is always exposed
349 347 * as an independent logical CPU to the operating system. A thread belongs
350 348 * to a specific core. A core may have more than one thread. When that is
351 349 * the case, the threads that are part of the same core are often referred
352 350 * to as 'siblings'.
353 351 *
354 352 * When multiple threads exist, this is generally referred to as
355 353 * simultaneous multi-threading (SMT). When Intel introduced this in their
356 354 * processors they called it hyper-threading (HT). When multiple threads
357 355 * are active in a core, they split the resources of the core. For example,
358 356 * two threads may share the same set of hardware execution units.
359 357 *
360 358 * The operating system often uses the term 'strand' to refer to a thread.
361 359 * This helps disambiguate it from the software concept.
362 360 *
363 361 * CHIP
364 362 *
365 363 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
366 364 * base meaning, it is used to refer to a single integrated circuit, which
367 365 * may or may not be the only thing in the package. In illumos, when you
368 366 * see the term 'chip' it is almost always referring to the same thing as
369 367 * the 'package'. However, many vendors may use chip to refer to one of
370 368 * many integrated circuits that have been placed in the package. As an
371 369 * example, see the subsequent definition.
372 370 *
373 371 * To try and keep things consistent, we will only use chip when referring
374 372 * to the entire integrated circuit package, with the exception of the
375 373 * definition of multi-chip module (because it is in the name) and use the
376 374 * term 'die' when we want the more general, potential sub-component
377 375 * definition.
378 376 *
379 377 * DIE
380 378 *
381 379 * A die refers to an integrated circuit. Inside of the package there may
382 380 * be a single die or multiple dies. This is sometimes called a 'chip' in
383 381 * vendor's parlance, but in this file, we use the term die to refer to a
384 382 * subcomponent.
385 383 *
386 384 * MULTI-CHIP MODULE
387 385 *
388 386 * A multi-chip module (MCM) refers to putting multiple distinct chips that
389 387 * are connected together in the same package. When a multi-chip design is
390 388 * used, generally each chip is manufactured independently and then joined
391 389 * together in the package. For example, on AMD's Zen microarchitecture
392 390 * (family 0x17), the package contains several dies (the second meaning of
393 391 * chip from above) that are connected together.
394 392 *
395 393 * CACHE
396 394 *
397 395 * A cache is a part of the processor that maintains copies of recently
398 396 * accessed memory. Caches are split into levels and then into types.
399 397 * Commonly there are one to three levels, called level one, two, and
400 398 * three. The lower the level, the smaller it is, the closer it is to the
401 399 * execution units of the CPU, and the faster it is to access. The layout
402 400 * and design of the cache come in many different flavors; consult other
403 401 * resources for a discussion of those.
404 402 *
405 403 * Caches are generally split into two types, the instruction and data
406 404 * cache. The caches contain what their names suggest, the instruction
407 405 * cache has executable program text, while the data cache has all other
408 406 * memory that the processor accesses. As of this writing, data is kept
409 407 * coherent between all of the caches on x86, so if one modifies program
410 408 * text before it is executed, that will be in the data cache, and the
411 409 * instruction cache will be synchronized with that change when the
412 410 * processor actually executes those instructions. This coherency also
413 411 * covers the fact that data could show up in multiple caches.
414 412 *
415 413 * Generally, the lowest level caches are specific to a core. However, the
416 414 * last layer cache is shared between some number of cores. The number of
417 415 * CPUs sharing this last level cache is important. This has implications
418 416 * for the choices that the scheduler makes, as accessing memory that might
419 417 * be in a remote cache after thread migration can be quite expensive.
420 418 *
421 419 * Sometimes, the word cache is abbreviated with a '$', because in US
422 420 * English the word cache is pronounced the same as cash. So L1D$ refers to
423 421 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
424 422 * in the rest of this theory statement for clarity.
425 423 *
426 424 * MEMORY CONTROLLER
427 425 *
428 426 * The memory controller is a component that provides access to DRAM. Each
429 427 * memory controller can access a set number of DRAM channels. Each channel
430 428 * can have a number of DIMMs (sticks of memory) associated with it. A
431 429 * given package may have more than one memory controller. The association
432 430 * of the memory controller to a group of cores is important as it is
433 431 * cheaper to access memory on the controller that you are associated with.
434 432 *
435 433 * NUMA
436 434 *
437 435 * NUMA or non-uniform memory access, describes a way that systems are
438 436 * built. On x86, any processor core can address all of the memory in the
439 437 * system. However, when using multiple sockets or possibly within a
440 438 * multi-chip module, some of that memory is physically closer and some of
441 439 * it is further. Memory that is further away is more expensive to access.
442 440 * Consider the following image of multiple sockets with memory:
443 441 *
444 442 * +--------+ +--------+
445 443 * | DIMM A | +----------+ +----------+ | DIMM D |
446 444 * +--------+-+ | | | | +-+------+-+
447 445 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
448 446 * +--------+-+ | | | | +-+------+-+
449 447 * | DIMM C | +----------+ +----------+ | DIMM F |
450 448 * +--------+ +--------+
451 449 *
452 450 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
453 451 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
454 452 * access DIMMs A-C and more expensive to access D-F as it has to go
455 453 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
456 454 * D-F are cheaper than A-C. While the socket form is the most common, when
457 455 * using multi-chip modules, this can also sometimes occur. For another
458 456 * example of this that's more involved, see the AMD topology section.
459 457 *
460 458 *
461 459 * Intel Topology
462 460 * --------------
463 461 *
464 462 * Most Intel processors since Nehalem (as of this writing the current gen
465 463 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
466 464 * the package is a single monolithic die. MCMs currently aren't used. Most
467 465 * parts have three levels of caches, with the L3 cache being shared between
468 466 * all of the cores on the package. The L1/L2 cache is generally specific to
469 467 * an individual core. The following image shows at a simplified level what
470 468 * this looks like. The memory controller is commonly part of something called
471 469 * the 'Uncore', that used to be separate physical chips that were not a part of
472 470 * the package, but are now part of the same chip.
473 471 *
474 472 * +-----------------------------------------------------------------------+
475 473 * | Package |
476 474 * | +-------------------+ +-------------------+ +-------------------+ |
477 475 * | | Core | | Core | | Core | |
478 476 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
479 477 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
480 478 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
481 479 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
482 480 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
483 481 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
484 482 * | | +--------------+ | | +--------------+ | | +--------------+ | |
485 483 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
486 484 * | | +--------------+ | | +--------------+ | | +--------------+ | |
487 485 * | +-------------------+ +-------------------+ +-------------------+ |
488 486 * | +-------------------------------------------------------------------+ |
489 487 * | | Shared L3 Cache | |
490 488 * | +-------------------------------------------------------------------+ |
491 489 * | +-------------------------------------------------------------------+ |
492 490 * | | Memory Controller | |
493 491 * | +-------------------------------------------------------------------+ |
494 492 * +-----------------------------------------------------------------------+
495 493 *
496 494 * A side effect of this current architecture is that what we care about from a
497 495 * scheduling and topology perspective is simplified. In general we care about
498 496 * understanding which logical CPUs are part of the same core and socket.
499 497 *
500 498 * To determine the relationship between threads and cores, Intel initially used
501 499 * the identifier in the advanced programmable interrupt controller (APIC). They
502 500 * also added cpuid leaf 4 to give additional information about the number of
503 501 * threads and CPUs in the processor. With the addition of x2apic (which
504 502 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
505 503 * additional cpuid topology leaf 0xB was added.
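
    [Editor's illustration] A hedged sketch of how leaf 0xB is commonly consumed
    follows; it is not the code used later in this file. It assumes a cpuid_count()
    wrapper (leaf in %eax, sub-leaf in %ecx), like the one sketched earlier, and the
    level-type and shift encodings documented by Intel.

    #include <stdint.h>

    /* Assumed wrapper, as sketched earlier: leaf in %eax, sub-leaf in %ecx. */
    extern void cpuid_count(uint32_t leaf, uint32_t subleaf,
        uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);

    /*
     * Walk the sub-leaves of leaf 0xB. %ecx[15:8] is the level type
     * (1 = SMT/thread, 2 = core, 0 = no further levels) and %eax[4:0] is the
     * number of low x2APIC ID bits consumed through that level.
     */
    static void
    x2apic_shifts(uint32_t *smt_shift, uint32_t *core_shift)
    {
            uint32_t subleaf;

            *smt_shift = *core_shift = 0;
            for (subleaf = 0; ; subleaf++) {
                    uint32_t eax, ebx, ecx, edx, type;

                    cpuid_count(0xb, subleaf, &eax, &ebx, &ecx, &edx);
                    type = (ecx >> 8) & 0xff;
                    if (type == 0)
                            break;
                    if (type == 1)
                            *smt_shift = eax & 0x1f;
                    else if (type == 2)
                            *core_shift = eax & 0x1f;
            }
    }

    With those shift counts, (x2apic_id >> smt_shift) identifies the core and
    (x2apic_id >> core_shift) identifies the package that a logical CPU belongs to.
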
506 504 *
507 505 * AMD Topology
508 506 * ------------
509 507 *
510 508 * When discussing AMD topology, we want to break this into three distinct
511 509 * generations of topology. There's the basic topology that has been used in
512 510 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
513 511 * with family 0x15 (Bulldozer), and there's the topology that was introduced
514 512 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
515 513 * talking about.
516 514 *
517 515 * Until the introduction of family 0x17 (Zen), AMD did not implement something
518 516 * that they considered SMT. Whether or not the AMD processors have SMT
519 517 * influences many things including scheduling and reliability, availability,
520 518 * and serviceability (RAS) features.
521 519 *
522 520 * NODE
523 521 *
524 522 * AMD uses the term node to refer to a die that contains a number of cores
525 523 * and I/O resources. Depending on the processor family and model, more
526 524 * than one node can be present in the package. When there is more than one
527 525 * node this indicates a multi-chip module. Usually each node has its own
528 526 * access to memory and I/O devices. This is important and generally
529 527 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
530 528 * result, we track this relationship in the operating system.
531 529 *
532 530 * In processors with an L3 cache, the L3 cache is generally shared across
533 531 * the entire node, though the way this is carved up varies from generation
534 532 * to generation.
535 533 *
536 534 * BULLDOZER
537 535 *
538 536 * Starting with the Bulldozer family (0x15) and continuing until the
539 537 * introduction of the Zen microarchitecture, AMD introduced the idea of a
540 538 * compute unit. In a compute unit, two traditional cores share a number of
541 539 * hardware resources. Critically, they share the FPU, L1 instruction
542 540 * cache, and the L2 cache. Several compute units were then combined inside
543 541 * of a single node. Because the integer execution units, L1 data cache,
544 542 * and some other resources were not shared between the cores, AMD never
545 543 * considered this to be SMT.
546 544 *
547 545 * ZEN
548 546 *
549 547 * The Zen family (0x17) uses a multi-chip module (MCM) design; the module
550 548 * is called Zeppelin. These modules are similar to the idea of nodes used
551 549 * previously. Each of these nodes has two DRAM channels which all of the
552 550 * cores in the node can access uniformly. These nodes are linked together
553 551 * in the package, creating a NUMA environment.
554 552 *
555 553 * The Zeppelin die itself contains two different 'core complexes'. Each
556 554 * core complex consists of four cores which each have two threads, for a
557 555 * total of 8 logical CPUs per complex. Unlike other generations,
558 556 * where all the logical CPUs in a given node share the L3 cache, here each
559 557 * core complex has its own shared L3 cache.
560 558 *
561 559 * A further thing that we need to consider is that in some configurations,
562 560 * particularly with the Threadripper line of processors, not every die
563 561 * actually has its memory controllers wired up to actual memory channels.
564 562 * This means that some cores have memory attached to them and others
565 563 * don't.
566 564 *
567 565 * To put Zen in perspective, consider the following images:
568 566 *
569 567 * +--------------------------------------------------------+
570 568 * | Core Complex |
571 569 * | +-------------------+ +-------------------+ +---+ |
572 570 * | | Core +----+ | | Core +----+ | | | |
573 571 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
574 572 * | | | Thread | +----+ | | | Thread | +----+ | | | |
575 573 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
576 574 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
577 575 * | | +--------+ +--+ | | +--------+ +--+ | | | |
578 576 * | +-------------------+ +-------------------+ | C | |
579 577 * | +-------------------+ +-------------------+ | a | |
580 578 * | | Core +----+ | | Core +----+ | | c | |
581 579 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
582 580 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
583 581 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
584 582 * | | | Thread | |L1| | | | Thread | |L1| | | | |
585 583 * | | +--------+ +--+ | | +--------+ +--+ | | | |
586 584 * | +-------------------+ +-------------------+ +---+ |
587 585 * | |
588 586 * +--------------------------------------------------------+
589 587 *
590 588 * This first image represents a single Zen core complex that consists of four
591 589 * cores.
592 590 *
593 591 *
594 592 * +--------------------------------------------------------+
595 593 * | Zeppelin Die |
596 594 * | +--------------------------------------------------+ |
597 595 * | | I/O Units (PCIe, SATA, USB, etc.) | |
598 596 * | +--------------------------------------------------+ |
599 597 * | HH |
600 598 * | +-----------+ HH +-----------+ |
601 599 * | | | HH | | |
602 600 * | | Core |==========| Core | |
603 601 * | | Complex |==========| Complex | |
604 602 * | | | HH | | |
605 603 * | +-----------+ HH +-----------+ |
606 604 * | HH |
607 605 * | +--------------------------------------------------+ |
608 606 * | | Memory Controller | |
609 607 * | +--------------------------------------------------+ |
610 608 * | |
611 609 * +--------------------------------------------------------+
612 610 *
613 611 * This image represents a single Zeppelin Die. Note how both core complexes
614 612 * are connected to the same memory controller and I/O units. While each core
615 613 * complex has its own L3 cache as seen in the first image, they both have
616 614 * uniform access to memory.
617 615 *
618 616 *
619 617 * PP PP
620 618 * PP PP
621 619 * +----------PP---------------------PP---------+
622 620 * | PP PP |
623 621 * | +-----------+ +-----------+ |
624 622 * | | | | | |
625 623 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
626 624 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
627 625 * | | | | | |
628 626 * | +-----------+ooo ...+-----------+ |
629 627 * | HH ooo ... HH |
630 628 * | HH oo.. HH |
631 629 * | HH ..oo HH |
632 630 * | HH ... ooo HH |
633 631 * | +-----------+... ooo+-----------+ |
634 632 * | | | | | |
635 633 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
636 634 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
637 635 * | | | | | |
638 636 * | +-----------+ +-----------+ |
639 637 * | PP PP |
640 638 * +----------PP---------------------PP---------+
641 639 * PP PP
642 640 * PP PP
643 641 *
644 642 * This image represents a single Zen package. In this example, it has four
645 643 * Zeppelin dies, though some configurations only have a single one. In this
646 644 * example, each die is directly connected to the next. Also, each die is
647 645 * represented as being connected to memory by the 'M' character and connected
648 646 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
649 647 * die is made up of two core complexes, we have multiple different NUMA
650 648 * domains that we care about for these systems.
651 649 *
652 650 * CPUID LEAVES
653 651 *
654 652 * There are a few different CPUID leaves that we can use to try and understand
655 653 * the actual state of the world. As part of the introduction of family 0xf, AMD
656 654 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
657 655 * processors that are in the system. Because families before Zen didn't have
658 656 * SMT, this was always the number of cores that were in the system. However, it
659 657 * should always be thought of as the number of logical threads to be consistent
660 658 * between generations. In addition we also get the size of the APIC ID that is
661 659 * used to represent the number of logical processors. This is important for
662 660 * deriving topology information.
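
    [Editor's illustration] A hedged decode of that leaf's %ecx output follows. The
    bit positions are taken from AMD's public documentation and should be verified
    against the APM; the helper name is invented for this sketch.

    #include <stdint.h>

    /* Illustrative decode of the %ecx output of CPUID leaf 0x80000008. */
    static void
    decode_leaf_80000008(uint32_t ecx, uint32_t *nthreads, uint32_t *coreid_bits)
    {
            *nthreads = (ecx & 0xff) + 1;        /* logical CPUs in the package */
            *coreid_bits = (ecx >> 12) & 0xf;    /* APIC ID bits for cores/threads */

            /*
             * A size of zero means the field is unimplemented and the width
             * must instead be derived from the thread count.
             */
    }
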
663 661 *
664 662 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
665 663 * bit between Bulldozer and later families, but it is quite useful in
666 664 * determining the topology information. Because this information has changed
667 665 * across family generations, it's worth calling out what these mean
668 666 * explicitly. The registers have the following meanings:
669 667 *
670 668 * %eax The APIC ID. The entire register is defined to have a 32-bit
671 669 * APIC ID, even though on systems without x2apic support, it will
672 670 * be limited to 8 bits.
673 671 *
674 672 * %ebx On Bulldozer-era systems this contains information about the
675 673 * number of cores that are in a compute unit (cores that share
676 674 * resources). It also contains a per-package compute unit ID that
677 675 * identifies which compute unit the logical CPU is a part of.
678 676 *
679 677 * On Zen-era systems this instead contains the number of threads
680 678 * per core and the ID of the core that the logical CPU is a part
681 679 * of. Note that this ID is unique only to the package; it is not
682 680 * globally unique across the entire system.
683 681 *
684 682 * %ecx This contains the number of nodes that exist in the package. It
685 683 * also contains an ID that identifies which node the logical CPU
686 684 * is a part of.
687 685 *
688 686 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
689 687 * cache layout to determine which logical CPUs are sharing which caches.
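
    [Editor's illustration] To make the register descriptions above concrete, here is
    a hedged sketch of decoding leaf 0x8000001E on a Zen-era part. The bit positions
    follow AMD's documentation and should be checked against the APM; Bulldozer-era
    parts report compute-unit information in %ebx instead. The function name is
    invented for this sketch.

    #include <stdint.h>

    static void
    decode_leaf_8000001e(uint32_t eax, uint32_t ebx, uint32_t ecx,
        uint32_t *core_id, uint32_t *threads_per_core,
        uint32_t *node_id, uint32_t *nodes_per_pkg)
    {
            (void) eax;                              /* %eax is the 32-bit APIC ID */
            *core_id = ebx & 0xff;                   /* unique within the package */
            *threads_per_core = ((ebx >> 8) & 0xff) + 1;
            *node_id = ecx & 0xff;
            *nodes_per_pkg = ((ecx >> 8) & 0x7) + 1;
    }
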
690 688 *
691 689 * illumos Topology
692 690 * ----------------
693 691 *
694 692 * Based on the above we synthesize the information into several different
695 693 * variables that we store in the 'struct cpuid_info'. We'll go into the details
696 694 * of what each member is supposed to represent and their uniqueness. In
697 695 * general, there are two levels of uniqueness that we care about. We care about
698 696 * an ID that is globally unique. That means that it will be unique across all
699 697 * entities in the system. For example, the default logical CPU ID is globally
700 698 * unique. On the other hand, there is some information that we only care about
701 699 * being unique within the context of a single package / socket. Here are the
702 700 * variables that we keep track of and their meaning.
703 701 *
704 702 * Several of the values that represent an identifier, with the exception
705 703 * of cpi_apicid, are allowed to be synthetic.
706 704 *
707 705 *
708 706 * cpi_apicid
709 707 *
710 708 * This is the value of the CPU's APIC id. This should be the full 32-bit
711 709 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
712 710 * APIC ID. This value is globally unique between all logical CPUs across
713 711 * all packages. This is usually required by the APIC.
714 712 *
715 713 * cpi_chipid
716 714 *
717 715 * This value indicates the ID of the package that the logical CPU is a
718 716 * part of. This value is allowed to be synthetic. It is usually derived by
719 717 * taking the CPU's APIC ID and determining how many bits are used to
720 718 * represent CPU cores in the package. All logical CPUs that are part of
721 719 * the same package must have the same value.
722 720 *
723 721 * cpi_coreid
724 722 *
725 723 * This represents the ID of a CPU core. Two logical CPUs should only have
726 724 * the same cpi_coreid value if they are part of the same core. These
727 725 * values may be synthetic. On systems that support SMT, this value is
728 726 * usually derived from the APIC ID, otherwise it is often synthetic and
729 727 * just set to the value of the cpu_id in the cpu_t.
730 728 *
731 729 * cpi_pkgcoreid
732 730 *
733 731 * This is similar to the cpi_coreid in that logical CPUs that are part of
734 732 * the same core should have the same ID. The main difference is that these
735 733 * values are only required to be unique to a given socket.
736 734 *
737 735 * cpi_clogid
738 736 *
739 737 * This represents the logical ID of a logical CPU. This value should be
740 738 * unique within a given socket for each logical CPU. This is allowed to be
741 739 * synthetic, though it is usually based off of the CPU's APIC ID. The
742 740 * broader system expects that logical CPUs that are part of the same
743 741 * core have contiguous numbers. For example, if there were two threads per
744 742 * core, then their IDs divided by two should be the same and, modulo two,
745 743 * the first should be zero and the second one. For example, IDs 4 and 5
746 744 * indicate two logical CPUs that are part of the same core. But IDs 5 and
747 745 * 6 represent two logical CPUs that are part of different cores.
748 746 *
749 747 * While it is common for the cpi_coreid and the cpi_clogid to be derived
750 748 * from the same source, strictly speaking, they don't have to be and the
751 749 * two values should be considered logically independent. One should not
752 750 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
753 751 * some kind of relationship. While this is tempting, we've seen cases on
754 752 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
755 753 *
756 754 * cpi_ncpu_per_chip
757 755 *
758 756 * This value indicates the total number of logical CPUs that exist in the
759 757 * physical package. Critically, this is not the number of logical CPUs
760 758 * that exist for just the single core.
761 759 *
762 760 * This value should be the same for all logical CPUs in the same package.
763 761 *
764 762 * cpi_ncore_per_chip
765 763 *
766 764 * This value indicates the total number of physical CPU cores that exist
767 765 * in the package. The system compares this value with cpi_ncpu_per_chip to
768 766 * determine if simultaneous multi-threading (SMT) is enabled. When
769 767 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
770 768 * the X86FSET_HTT feature is not set. If this value is greater than one,
771 769 * than we consider the processor to have the feature X86FSET_CMP, to
772 770 * indicate that there is support for more than one core.
773 771 *
774 772 * This value should be the same for all logical CPUs in the same package.
775 773 *
776 774 * cpi_procnodes_per_pkg
777 775 *
778 776 * This value indicates the number of 'nodes' that exist in the package.
779 777 * When processors are actually a multi-chip module, this represents the
780 778 * number of such modules that exist in the package. Currently, on Intel
781 779 * based systems this member is always set to 1.
782 780 *
783 781 * This value should be the same for all logical CPUs in the same package.
784 782 *
785 783 * cpi_procnodeid
786 784 *
787 785 * This value indicates the ID of the node that the logical CPU is a part
788 786 * of. All logical CPUs that are in the same node must have the same value
789 787 * here. This value must be unique across all of the packages in the
790 788 * system. On Intel based systems, this is currently set to the value in
791 789 * cpi_chipid because there is only one node.
792 790 *
793 791 * cpi_cores_per_compunit
794 792 *
795 793 * This value indicates the number of cores that are part of a compute
796 794 * unit. See the AMD topology section for this. This member only has real
797 795 * meaning currently for AMD Bulldozer family processors. For all other
798 796 * processors, this should currently be set to 1.
799 797 *
800 798 * cpi_compunitid
801 799 *
802 800 * This indicates the compute unit that the logical CPU belongs to. For
803 801 * processors without AMD Bulldozer-style compute units this should be set
804 802 * to the value of cpi_coreid.
805 803 *
806 804 * cpi_ncpu_shr_last_cache
807 805 *
808 806 * This indicates the number of logical CPUs that are sharing the same last
809 807 * level cache. This value should be the same for all CPUs that are sharing
810 808 * that cache. The last cache refers to the cache that is closest to memory
811 809 * and furthest away from the CPU.
812 810 *
813 811 * cpi_last_lvl_cacheid
814 812 *
815 813 * This indicates the ID of the last cache that the logical CPU uses. This
816 814 * cache is often shared between multiple logical CPUs and is the cache
817 815 * that is closest to memory and furthest away from the CPU. This value
818 816 * should be the same for a group of logical CPUs only if they actually
819 817 * share the same last level cache. IDs should not overlap between
820 818 * packages.
821 819 *
822 820 * cpi_ncore_bits
823 821 *
824 822 * This indicates the number of bits that are required to represent all of
825 823 * the cores in the system. As cores are derived based on their APIC IDs,
826 824 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
827 825 * this value to be larger than the actual number of IDs that are present
828 826 * in the system. This is used to size tables by the CMI framework. It is
829 827 * only filled in for Intel and AMD CPUs.
830 828 *
831 829 * cpi_nthread_bits
832 830 *
833 831 * This indicates the number of bits required to represent all of the IDs
834 832 * that cover the logical CPUs that exist on a given core. It's OK for this
835 833 * value to be larger than the actual number of IDs that are present in the
836 834 * system. This is used to size tables by the CMI framework. It is
837 835 * only filled in for Intel and AMD CPUs.
838 836 *
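
    [Editor's illustration] As a generic sketch (not the derivation used later in
    this file) of how widths such as cpi_nthread_bits and cpi_ncore_bits are
    typically used, an APIC ID can be split into thread, core, and package
    components; the function name is invented for this sketch.

    #include <stdint.h>

    static void
    split_apicid(uint32_t apicid, uint32_t nthread_bits, uint32_t ncore_bits,
        uint32_t *thread_id, uint32_t *core_id, uint32_t *pkg_id)
    {
            *thread_id = apicid & ((1U << nthread_bits) - 1);
            *core_id = (apicid >> nthread_bits) & ((1U << ncore_bits) - 1);
            *pkg_id = apicid >> (nthread_bits + ncore_bits);
    }
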
839 837 * -----------
840 838 * Hypervisors
841 839 * -----------
842 840 *
843 841 * If trying to manage the differences between vendors wasn't bad enough, it can
844 842 * get worse thanks to our friend hardware virtualization. Hypervisors are given
845 843 * the ability to interpose on all cpuid instructions and change them to suit
846 844 * their purposes. In general, this is necessary as the hypervisor wants to be
847 845 * able to present a more uniform set of features or not necessarily give the
848 846 * guest operating system kernel knowledge of all features so it can be
849 847 * more easily migrated between systems.
850 848 *
851 849 * When it comes to trying to determine topology information, this can be a
852 850 * double edged sword. When a hypervisor doesn't actually implement a cpuid
853 851 * leaf, it'll often return all zeros. Because of that, you'll often see various
854 852 * checks scattered about that verify fields are non-zero before we assume we
855 853 * can use them.
856 854 *
857 855 * When it comes to topology information, the hypervisor is often incentivized
858 856 * to lie to you about topology. This is because it doesn't always actually
859 857 * guarantee that topology at all. The topology path we take in the system
860 858 * depends on how the CPU advertises itself. If it advertises itself as an Intel
861 859 * or AMD CPU, then we basically do our normal path. However, when they don't
862 860 * use an actual vendor, then that usually turns into multiple one-core CPUs
863 861 * that we enumerate, which are often on different sockets. The actual behavior
864 862 * depends greatly on what the hypervisor actually exposes to us.
865 863 *
866 864 * --------------------
867 865 * Exposing Information
868 866 * --------------------
869 867 *
870 868 * We expose CPUID information in three different forms in the system.
871 869 *
872 870 * The first is through the x86_featureset variable. This is used in conjunction
873 871 * with the is_x86_feature() function. This is queried by x86-specific functions
874 872 * to determine which features are or aren't present in the system and to make
875 873 * decisions based upon them. For example, users of this include everything from
876 874 * parts of the system dedicated to reliability, availability, and
877 875 * serviceability (RAS), to making decisions about how to handle security
878 876 * mitigations, to various x86-specific drivers. General purpose or
879 877 * architecture independent drivers should never be calling this function.
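
    [Editor's illustration] A minimal example of this first form as kernel code
    typically uses it; X86FSET_AVX is just an arbitrary feature flag chosen for
    illustration.

    #include <sys/x86_archext.h>

    if (is_x86_feature(x86_featureset, X86FSET_AVX)) {
            /* The CPU advertises AVX; the kernel may rely on it. */
    }
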
880 878 *
881 879 * The second means is through the auxiliary vector. The auxiliary vector is a
882 880 * series of tagged data that the kernel passes down to a user program when it
883 881 * begins executing. This information is used to indicate to programs what
884 882 * instruction set extensions are present. For example, information about the
885 883 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 884 * since user programs cannot make use of it. However, things like the AVX
887 885 * instruction sets are. Programs use this information to make run-time
888 886 * decisions about what features they should use. As an example, the run-time
889 887 * link-editor (rtld) can relocate different functions depending on the hardware
890 888 * support available.
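
    [Editor's illustration] For this second form, a userland program typically
    consults the hardware capability words rather than issuing cpuid itself. A
    hedged sketch using getisax(2) and one of the AV_386_* flags follows; flag
    availability varies by platform and release.

    #include <sys/auxv.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t hwcap[2] = { 0, 0 };

            (void) getisax(hwcap, 2);
            if (hwcap[0] & AV_386_AVX)
                    printf("AVX is advertised through the aux vector\n");
            return (0);
    }
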
891 889 *
892 890 * The final form is through a series of accessor functions that all have the
893 891 * form cpuid_get*. This is used by a number of different subsystems in the
894 892 * kernel to determine more detailed information about what we're running on,
895 893 * topology information, etc. Some of these subsystems include processor groups
896 894 * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 895 * microcode, and performance monitoring. These functions all ASSERT that the
898 896 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 897 * are rearranged, then this needs to be adjusted.
900 898 *
901 899 * -----------------------------------------------
902 900 * Speculative Execution CPU Side Channel Security
903 901 * -----------------------------------------------
904 902 *
905 903 * With the advent of the Spectre and Meltdown attacks which exploit speculative
906 904 * execution in the CPU to create side channels, there have been a number of
907 905 * different attacks and corresponding issues that the operating system needs to
908 906 * mitigate against. The following is a common, but not exhaustive, list of
909 907 * issues that we know about and for which we have done some work, or still
910 908 * need to do more work, in the system to mitigate against:
911 909 *
912 910 * - Spectre v1
913 911 * - swapgs (Spectre v1 variant)
914 912 * - Spectre v2
915 913 * - Meltdown (Spectre v3)
916 914 * - Rogue Register Read (Spectre v3a)
917 915 * - Speculative Store Bypass (Spectre v4)
918 916 * - ret2spec, SpectreRSB
919 917 * - L1 Terminal Fault (L1TF)
920 918 * - Microarchitectural Data Sampling (MDS)
921 919 *
922 920 * Each of these requires different sets of mitigations and has different attack
923 921 * surfaces. For the most part, this discussion is about protecting the kernel
924 922 * from non-kernel executing environments such as user processes and hardware
925 923 * virtual machines. Unfortunately, there are a number of user vs. user
926 924 * scenarios that exist with these. The rest of this section will describe the
927 925 * overall approach that the system has taken to address these as well as their
928 926 * shortcomings. Unfortunately, not all of the above have been handled today.
929 927 *
930 928 * SPECTRE v2, ret2spec, SpectreRSB
931 929 *
932 930 * The second variant of the spectre attack focuses on performing branch target
933 931 * injection. This generally impacts indirect call instructions in the system.
934 932 * There are three different ways to mitigate this issue that are commonly
935 933 * described today:
936 934 *
937 935 * 1. Using Indirect Branch Restricted Speculation (IBRS).
938 936 * 2. Using Retpolines and RSB Stuffing
939 937 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
940 938 *
941 939 * IBRS uses a feature added to microcode to restrict speculation, among other
942 940 * things. This form of mitigation has not been used as it has been generally
943 941 * seen as too expensive and requires reactivation upon various transitions in
944 942 * the system.
945 943 *
946 944 * As a less impactful alternative to IBRS, retpolines were developed by
947 945 * Google. These basically require one to replace indirect calls with a specific
948 946 * trampoline that will cause speculation to fail and break the attack.
949 947 * Retpolines require compiler support. We always build with retpolines in the
950 948 * external thunk mode. This means that a traditional indirect call is replaced
951 949 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
952 950 * of this is that all indirect function calls are performed through a register.
953 951 *
954 952 * We have to use a common external location of the thunk and not inline it into
955 953 * the callsite so that way we can have a single place to patch these functions.
956 954 * As it turns out, we actually have three different forms of retpolines that
957 955 * exist in the system:
958 956 *
959 957 * 1. A full retpoline
960 958 * 2. An AMD-specific optimized retpoline
961 959 * 3. A no-op version
962 960 *
963 961 * The first one is used in the general case. The second one is used if we can
964 962 * determine that we're on an AMD system and we can successfully toggle the
965 963 * lfence serializing MSR that exists on the platform. Basically with this
966 964 * present, an lfence is sufficient and we don't need to do anywhere near as
967 965 * complicated a dance to successfully use retpolines.
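
    [Editor's illustration] A hedged sketch of that toggle follows. The MSR number
    (0xc0011029, often called DE_CFG) and bit 1 as the lfence-dispatch-serializing
    control come from public AMD documentation and should be verified; real kernel
    code must also tolerate a #GP on parts that lack the MSR, which this sketch
    omits. rdmsr()/wrmsr() are the usual kernel accessors.

    uint64_t val = rdmsr(0xc0011029);           /* AMD DE_CFG (assumed) */

    wrmsr(0xc0011029, val | (1ULL << 1));       /* request serializing lfence */
    if ((rdmsr(0xc0011029) & (1ULL << 1)) != 0) {
            /* lfence serializes; the cheaper AMD retpoline variant is usable. */
    }
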
968 966 *
969 967 * The third form described above is the most curious. It turns out that the way
970 968 * that retpolines are implemented is that they rely on how speculation is
971 969 * performed on a 'ret' instruction. Intel has continued to optimize this
972 970 * process (which is partly why we need to have return stack buffer stuffing,
973 971 * but more on that in a bit) and in processors starting with Cascade Lake
974 972 * on the server side, it's dangerous to rely on retpolines. Instead, a new
975 973 * mechanism has been introduced called Enhanced IBRS (EIBRS).
976 974 *
977 975 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
978 976 * physical core. However, if this is the case, we don't want to use retpolines
979 977 * any more. Therefore if EIBRS is present, we end up turning each retpoline
980 978 * function (called a thunk) into a jmp instruction. This means that we're still
981 979 * paying the cost of an extra jump to the external thunk, but it gives us
982 980 * flexibility and the ability to have a single kernel image that works across a
983 981 * wide variety of systems and hardware features.
984 982 *
985 983 * Unfortunately, this alone is insufficient. First, Skylake systems have
986 984 * additional speculation for the Return Stack Buffer (RSB), which is used to
987 985 * return from call instructions and which retpolines take advantage of. However,
988 986 * this problem is not just limited to Skylake and is actually more pernicious.
989 987 * The SpectreRSB paper introduces several more problems that can arise with
990 988 * dealing with this. The RSB can be poisoned just like the indirect branch
991 989 * predictor. This means that one needs to clear the RSB when transitioning
992 990 * between two different privilege domains. Some examples include:
993 991 *
994 992 * - Switching between two different user processes
995 993 * - Going between user land and the kernel
996 994 * - Returning to the kernel from a hardware virtual machine
997 995 *
998 996 * Mitigating this involves combining a couple of different things. The first is
999 997 * SMEP (supervisor mode execution protection) which was introduced in Ivy
1000 998 * Bridge. When an RSB entry refers to a user address and we're executing in the
1001 999 * kernel, speculation through it will be stopped when SMEP is enabled. This
1002 1000 * protects against a number of the different cases that we would normally be
1003 1001 * worried about such as when we enter the kernel from user land.
1004 1002 *
1005 1003 * To prevent against additional manipulation of the RSB from other contexts
1006 1004 * such as a non-root VMX context attacking the kernel we first look to enhanced
1007 1005 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1008 1006 * need to do to protect the kernel at this time.
1009 1007 *
1010 1008 * On CPUs without EIBRS we need to manually overwrite the contents of the
1011 1009 * return stack buffer. We do this through the x86_rsb_stuff() function.
1012 1010 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1013 1011 * disabled when enhanced IBRS is present because Intel claims on such systems
1014 1012 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1015 1013 * to user attacks via the RSB.
1016 1014 *
1017 1015 * If SMEP is not present, then we would have to stuff the RSB every time we
1018 1016 * transitioned from user mode to the kernel, which isn't very practical right
1019 1017 * now.
1020 1018 *
1021 1019 * To fully protect against these classes of user to user and vmx to vmx
1022 1020 * attacks, we would also need to allow them to opt into performing an Indirect
1023 1021 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1024 1022 *
1025 1023 * By default, the system will enable RSB stuffing and the required variant of
1026 1024 * retpolines and store that information in the x86_spectrev2_mitigation value.
1027 1025 * This will be evaluated after a microcode update as well, though it is
1028 1026 * expected that microcode updates will not take away features. This may mean
1029 1027 * that a late loaded microcode may not end up in the optimal configuration
1030 1028 * (though this should be rare).
1031 1029 *
1032 1030 * Currently we do not build kmdb with retpolines or perform any additional side
1033 1031 * channel security mitigations for it. One complication with kmdb is that it
1034 1032 * requires its own retpoline thunks and it would need to adjust itself based on
1035 1033 * what the kernel does. The threat model of kmdb is more limited and therefore
1036 1034 * it may make more sense to investigate using prediction barriers as the whole
1037 1035 * system is only executing a single instruction at a time while in kmdb.
1038 1036 *
1039 1037 * SPECTRE v1, v4
1040 1038 *
1041 1039 * The v1 and v4 variants of spectre are not currently mitigated in the
1042 1040 * system and require other classes of changes to occur in the code.
1043 1041 *
1044 1042 * SPECTRE v1 (SWAPGS VARIANT)
1045 1043 *
1046 1044 * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1047 1045 * can generally affect any branch-dependent code. The swapgs issue is one
1048 1046 * variant of this. If we are coming in from userspace, we can have code like
1049 1047 * this:
1050 1048 *
1051 1049 * cmpw $KCS_SEL, REGOFF_CS(%rsp)
1052 1050 * je 1f
1053 1051 * movq $0, REGOFF_SAVFP(%rsp)
1054 1052 * swapgs
1055 1053 * 1:
1056 1054 * movq %gs:CPU_THREAD, %rax
1057 1055 *
1058 1056 * If an attacker can cause a mis-speculation of the branch here, we could skip
1059 1057 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1060 1058 * load. If subsequent code can act as the usual Spectre cache gadget, this
1061 1059 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1062 1060 * any use of the %gs override.
1063 1061 *
1064 1062 * The other case is also an issue: if we're coming into a trap from kernel
1065 1063 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1066 1064 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1067 1065 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1068 1066 * case, and the fix is the same in both cases (an lfence at the branch target
1069 1067 * 1: in this example), we'll just do it unconditionally.
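 *
 * Concretely, the mitigated form of the example above is simply:
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax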
1070 1068 *
1071 1069 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1072 1070 * harder for user-space to actually set a useful %gsbase value: although it's
1073 1071 * not clear, it might still be feasible via lwp_setprivate(), so we
1074 1072 * mitigate anyway.
1075 1073 *
1076 1074 * MELTDOWN
1077 1075 *
1078 1076 * Meltdown, or spectre v3, allowed a user process to read any data in their
1079 1077 * address space regardless of whether or not the page tables in question
1080 1078 * allowed the user to have the ability to read them. The solution to meltdown
1081 1079 * is kernel page table isolation. In this world, there are two page tables that
1082 1080 * are used for a process, one in user land and one in the kernel. To implement
1083 1081 * this we use per-CPU page tables and switch between the user and kernel
1084 1082 * variants when entering and exiting the kernel. For more information about
1085 1083 * this process and how the trampolines work, please see the big theory
1086 1084 * statements and additional comments in:
1087 1085 *
1088 1086 * - uts/i86pc/ml/kpti_trampolines.s
1089 1087 * - uts/i86pc/vm/hat_i86.c
1090 1088 *
1091 1089 * While Meltdown only impacted Intel systems and there are also Intel systems
1092 1090 * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1093 1091 * kernel page table isolation enabled. While this may at first seem weird, an
1094 1092 * important thing to remember is that you can't speculatively read an address
1095 1093 * if it's never in your page table at all. Having user processes without kernel
1096 1094 * pages present provides us with an important layer of defense in the kernel
1097 1095 * against any other side channel attacks that exist and have yet to be
1098 1096 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1099 1097 * default, no matter the x86 system.
1100 1098 *
1101 1099 * L1 TERMINAL FAULT
1102 1100 *
1103 1101 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1104 1102 * execution uses page table entries. Effectively, it is two different problems.
1105 1103 * The first is that it ignores the not present bit in the page table entries
1106 1104 * when performing speculative execution. This means that something can
1107 1105 * speculatively read the listed physical address if it's present in the L1
1108 1106 * cache under certain conditions (see Intel's documentation for the full set of
1109 1107 * conditions). Secondly, this can be used to bypass hardware virtualization
1110 1108 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1111 1109 * instructions.
1112 1110 *
1113 1111 * For the non-hardware virtualized case, this is relatively easy to deal with.
1114 1112 * We must make sure that all unmapped pages have an address of zero. This means
1115 1113 * that they could read the first 4k of physical memory; however, we never use
1116 1114 * that first page in the operating system and always skip putting it in our
1117 1115 * memory map, even if firmware tells us we can use it. Other systems try to put
1118 1116 * extra metadata in the address and reserved bits, which made this approach
1119 1117 * problematic for them, but we do not.
1120 1118 *
1121 1119 * For hardware virtual machines things are more complicated. Because they can
1122 1120 * construct their own page tables, it isn't hard for them to perform this
1123 1121 * attack against any physical address. The one wrinkle is that this physical
1124 1122 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1125 1123 * to flush the L1 data cache. We wrap this up in the function
1126 1124 * spec_uarch_flush(). This function is also used in the mitigation of
1127 1125 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1128 1126 * hypervisors such as KVM or bhyve are responsible for performing this before
1129 1127 * entering the guest.
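 *
 * A minimal sketch of the L1D flush portion (illustrative C only; the real
 * spec_uarch_flush() is assembly and the MSR/feature names here are assumed
 * rather than quoted from the headers):
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_FLUSH_CMD))
 *		wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);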
1130 1128 *
1131 1129 * Because this attack takes place in the L1 cache, there's another wrinkle
1132 1130 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1133 1131 * designs. This means that when a thread enters a hardware virtualized context
1134 1132 * and flushes the L1 data cache, the other thread on the processor may then go
1135 1133 * ahead and put new data in it that can be potentially attacked. While one
1136 1134 * solution is to disable SMT on the system, another option that is available is
1137 1135 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1138 1136 * goes through and makes sure that if an HVM is being scheduled on one thread,
1139 1137 * then whatever is on the other thread is from the same hardware virtual machine.
1140 1138 * If an interrupt comes in or the guest exits to the broader system, then the
1141 1139 * other SMT thread will be kicked out.
1142 1140 *
1143 1141 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1144 1142 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1145 1143 * perform L1TF related mitigations.
1146 1144 *
1147 1145 * MICROARCHITECTURAL DATA SAMPLING
1148 1146 *
1149 1147 * Microarchitectural data sampling (MDS) is a combination of four discrete
1150 1148 * vulnerabilities that are similar issues affecting various parts of the CPU's
1151 1149 * microarchitectural implementation around load, store, and fill buffers.
1152 1150 * Specifically it is made up of the following subcomponents:
1153 1151 *
1154 1152 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1155 1153 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1156 1154 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1157 1155 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1158 1156 *
1159 1157 * To begin addressing these, Intel has introduced another feature in microcode
1160 1158 * called MD_CLEAR. This overloads the verw instruction so that executing it in
1161 1159 * a particular way also flushes the state of the affected buffers. The L1TF
1162 1160 * L1D flush mechanism is also
1163 1161 * updated when this microcode is present to flush this state.
1164 1162 *
1165 1163 * Primarily we need to flush this state whenever we transition from the kernel
1166 1164 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1167 1165 * little bit different. Here the structures are statically sized when a logical
1168 1166 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1169 1167 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1170 1168 * mwait, or another ACPI method. To perform these flushes, we call
1171 1169 * x86_md_clear() at all of these transition points.
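 *
 * With the MD_CLEAR microcode, the flush itself is just a verw against a
 * valid selector stored in memory; x86_md_clear() is roughly of the
 * following form (illustrative sketch only):
 *
 *	subq	$8, %rsp
 *	movw	%ds, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp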
1172 1170 *
1173 1171 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1174 1172 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1175 1173 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1176 1174 * a no-op.
1177 1175 *
1178 1176 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1179 1177 * particular, everything we've discussed above is only valid for a single
1180 1178 * thread executing on a core. In the case where you have hyper-threading
1181 1179 * present, this attack can be performed between threads. The theoretical fix
1182 1180 * for this is to ensure that both threads are always in the same security
1183 1181 * domain. This means that they are executing in the same ring and mutually
1184 1182 * trust each other. Practically speaking, this would mean that a system call
1185 1183 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1186 1184 * Rather than implement this, we recommend that one disables hyper-threading
1187 1185 * through the use of psradm -aS.
1188 1186 *
1189 1187 * TSX ASYNCHRONOUS ABORT
1190 1188 *
1191 1189 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1192 1190 * behaves like MDS, but leverages Intel's transactional instructions as another
1193 1191 * vector. Effectively, when a transaction hits one of these cases (unmapped
1194 1192 * page, various cache snoop activity, etc.) then the same data can be exposed
1195 1193 * as in the case of MDS. This means that you can attack your twin.
1196 1194 *
1197 1195 * Intel has described that there are two different ways that we can mitigate
1198 1196 * this problem on affected processors:
1199 1197 *
1200 1198 * 1) We can use the same techniques used to deal with MDS. Flushing the
1201 1199 * microarchitectural buffers and disabling hyperthreading will mitigate
1202 1200 * this in the same way.
1203 1201 *
1204 1202 * 2) Using microcode to disable TSX.
1205 1203 *
1206 1204 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1207 1205 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1208 1206 * That's OK as we're already doing all such mitigations. On the other hand,
1209 1207 * processors with MDS_NO are all supposed to receive microcode updates that
1210 1208 * enumerate support for disabling TSX. In general, we'd rather use this method
1211 1209 * when available as it doesn't require disabling hyperthreading to be
1212 1210 * effective. Currently we are essentially relying on microcode for processors
1213 1211 * that enumerate MDS_NO.
1214 1212 *
1215 1213 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1216 1214 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1217 1215 * different powers. The first allows us to cause all transactions to
1218 1216 * immediately abort. The second gives us a means of disabling TSX completely,
1219 1217 * which includes removing it from cpuid. If we have support for this in
1220 1218 * microcode during the first cpuid pass, then we'll disable TSX completely such
1221 1219 * that user land never has a chance to observe the bit. However, if we are late
1222 1220 * loading the microcode, then we must use the functionality to cause
1223 1221 * transactions to automatically abort. This is necessary for user land's sake.
1224 1222 * Once a program sees a cpuid bit, it must not be taken away.
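 *
 * As a sketch of the two powers (illustrative only; the MSR and bit names
 * here are assumed rather than quoted from the headers):
 *
 *	if (still early enough to hide the cpuid bit)
 *		wrmsr(MSR_IA32_TSX_CTRL,
 *		    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_CPUID_CLEAR);
 *	else
 *		wrmsr(MSR_IA32_TSX_CTRL, IA32_TSX_CTRL_RTM_DISABLE);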
1225 1223 *
1226 1224 * We track whether or not we should do this based on what cpuid pass we're in.
1227 1225 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1228 1226 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1229 1227 * should happen twice. Once in the normal cpuid_pass1() code and then a second
1230 - * time after we do the initial microcode update.
1228 + * time after we do the initial microcode update. As a result we need to be
1229 + * careful in cpuid_apply_tsx() to only use the MSR if we've loaded a suitable
1230 + * microcode on the current CPU (which happens prior to cpuid_pass_ucode()).
1231 1231 *
1232 1232 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1233 1233 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1234 1234 * unfortunate feature in a number of ways, and taking the opportunity to
1235 1235 * finally be able to turn it off is likely to be of benefit in the future.
1236 1236 *
1237 1237 * SUMMARY
1238 1238 *
1239 1239 * The following table attempts to summarize the mitigations for various issues
1240 1240 * and what's done in various places:
1241 1241 *
1242 1242 * - Spectre v1: Not currently mitigated
1243 1243 * - swapgs: lfences after swapgs paths
1244 1244 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1245 1245 * - Meltdown: Kernel Page Table Isolation
1246 1246 * - Spectre v3a: Updated CPU microcode
1247 1247 * - Spectre v4: Not currently mitigated
1248 1248 * - SpectreRSB: SMEP and RSB Stuffing
1249 1249 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1250 1250 * - MDS: x86_md_clear, requires microcode, disabling SMT
1251 1251 * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1252 1252 *
1253 1253 * The following table indicates the x86 feature set bits that indicate that a
1254 1254 * given problem has been solved or a notable feature is present:
1255 1255 *
1256 1256 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1257 1257 * - MDS_NO: All forms of MDS
1258 1258 * - TAA_NO: TAA
1259 1259 */
1260 1260
1261 1261 #include <sys/types.h>
1262 1262 #include <sys/archsystm.h>
1263 1263 #include <sys/x86_archext.h>
1264 1264 #include <sys/kmem.h>
1265 1265 #include <sys/systm.h>
1266 1266 #include <sys/cmn_err.h>
1267 1267 #include <sys/sunddi.h>
1268 1268 #include <sys/sunndi.h>
1269 1269 #include <sys/cpuvar.h>
1270 1270 #include <sys/processor.h>
1271 1271 #include <sys/sysmacros.h>
1272 1272 #include <sys/pg.h>
1273 1273 #include <sys/fp.h>
1274 1274 #include <sys/controlregs.h>
1275 1275 #include <sys/bitmap.h>
1276 1276 #include <sys/auxv_386.h>
1277 1277 #include <sys/memnode.h>
1278 1278 #include <sys/pci_cfgspace.h>
1279 1279 #include <sys/comm_page.h>
1280 1280 #include <sys/mach_mmu.h>
1281 1281 #include <sys/ucode.h>
1282 1282 #include <sys/tsc.h>
1283 1283 #include <sys/kobj.h>
1284 1284 #include <sys/asm_misc.h>
1285 1285
1286 1286 #ifdef __xpv
1287 1287 #include <sys/hypervisor.h>
1288 1288 #else
1289 1289 #include <sys/ontrap.h>
1290 1290 #endif
1291 1291
1292 1292 uint_t x86_vendor = X86_VENDOR_IntelClone;
1293 1293 uint_t x86_type = X86_TYPE_OTHER;
1294 1294 uint_t x86_clflush_size = 0;
1295 1295
1296 1296 #if defined(__xpv)
1297 1297 int x86_use_pcid = 0;
1298 1298 int x86_use_invpcid = 0;
1299 1299 #else
1300 1300 int x86_use_pcid = -1;
1301 1301 int x86_use_invpcid = -1;
1302 1302 #endif
1303 1303
1304 1304 typedef enum {
1305 1305 X86_SPECTREV2_RETPOLINE,
1306 1306 X86_SPECTREV2_RETPOLINE_AMD,
1307 1307 X86_SPECTREV2_ENHANCED_IBRS,
1308 1308 X86_SPECTREV2_DISABLED
1309 1309 } x86_spectrev2_mitigation_t;
1310 1310
1311 1311 uint_t x86_disable_spectrev2 = 0;
1312 1312 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1313 1313 X86_SPECTREV2_RETPOLINE;
1314 1314
1315 1315 /*
1316 1316 * The mitigation status for TAA:
1317 1317 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1318 1318 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1319 1319 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1320 1320 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1321 1321 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1322 1322 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1323 1323 */
1324 1324 typedef enum {
1325 1325 X86_TAA_NOTHING,
1326 1326 X86_TAA_DISABLED,
1327 1327 X86_TAA_MD_CLEAR,
1328 1328 X86_TAA_TSX_FORCE_ABORT,
1329 1329 X86_TAA_TSX_DISABLE,
1330 1330 X86_TAA_HW_MITIGATED
1331 1331 } x86_taa_mitigation_t;
1332 1332
1333 1333 uint_t x86_disable_taa = 0;
1334 1334 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1335 1335
1336 1336 uint_t pentiumpro_bug4046376;
1337 1337
1338 1338 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1339 1339
1340 1340 static char *x86_feature_names[NUM_X86_FEATURES] = {
1341 1341 "lgpg",
1342 1342 "tsc",
1343 1343 "msr",
1344 1344 "mtrr",
1345 1345 "pge",
1346 1346 "de",
1347 1347 "cmov",
1348 1348 "mmx",
1349 1349 "mca",
1350 1350 "pae",
1351 1351 "cv8",
1352 1352 "pat",
1353 1353 "sep",
1354 1354 "sse",
1355 1355 "sse2",
1356 1356 "htt",
1357 1357 "asysc",
1358 1358 "nx",
1359 1359 "sse3",
1360 1360 "cx16",
1361 1361 "cmp",
1362 1362 "tscp",
1363 1363 "mwait",
1364 1364 "sse4a",
1365 1365 "cpuid",
1366 1366 "ssse3",
1367 1367 "sse4_1",
1368 1368 "sse4_2",
1369 1369 "1gpg",
1370 1370 "clfsh",
1371 1371 "64",
1372 1372 "aes",
1373 1373 "pclmulqdq",
1374 1374 "xsave",
1375 1375 "avx",
1376 1376 "vmx",
1377 1377 "svm",
1378 1378 "topoext",
1379 1379 "f16c",
1380 1380 "rdrand",
1381 1381 "x2apic",
1382 1382 "avx2",
1383 1383 "bmi1",
1384 1384 "bmi2",
1385 1385 "fma",
1386 1386 "smep",
1387 1387 "smap",
1388 1388 "adx",
1389 1389 "rdseed",
1390 1390 "mpx",
1391 1391 "avx512f",
1392 1392 "avx512dq",
1393 1393 "avx512pf",
1394 1394 "avx512er",
1395 1395 "avx512cd",
1396 1396 "avx512bw",
1397 1397 "avx512vl",
1398 1398 "avx512fma",
1399 1399 "avx512vbmi",
1400 1400 "avx512_vpopcntdq",
1401 1401 "avx512_4vnniw",
1402 1402 "avx512_4fmaps",
1403 1403 "xsaveopt",
1404 1404 "xsavec",
1405 1405 "xsaves",
1406 1406 "sha",
1407 1407 "umip",
1408 1408 "pku",
1409 1409 "ospke",
1410 1410 "pcid",
1411 1411 "invpcid",
1412 1412 "ibrs",
1413 1413 "ibpb",
1414 1414 "stibp",
1415 1415 "ssbd",
1416 1416 "ssbd_virt",
1417 1417 "rdcl_no",
1418 1418 "ibrs_all",
1419 1419 "rsba",
1420 1420 "ssb_no",
1421 1421 "stibp_all",
1422 1422 "flush_cmd",
1423 1423 "l1d_vmentry_no",
1424 1424 "fsgsbase",
1425 1425 "clflushopt",
1426 1426 "clwb",
1427 1427 "monitorx",
1428 1428 "clzero",
1429 1429 "xop",
1430 1430 "fma4",
1431 1431 "tbm",
1432 1432 "avx512_vnni",
1433 1433 "amd_pcec",
1434 1434 "md_clear",
1435 1435 "mds_no",
1436 1436 "core_thermal",
1437 1437 "pkg_thermal",
1438 1438 "tsx_ctrl",
1439 1439 "taa_no"
1440 1440 };
1441 1441
1442 1442 boolean_t
1443 1443 is_x86_feature(void *featureset, uint_t feature)
1444 1444 {
1445 1445 ASSERT(feature < NUM_X86_FEATURES);
1446 1446 return (BT_TEST((ulong_t *)featureset, feature));
1447 1447 }
1448 1448
1449 1449 void
1450 1450 add_x86_feature(void *featureset, uint_t feature)
1451 1451 {
1452 1452 ASSERT(feature < NUM_X86_FEATURES);
1453 1453 BT_SET((ulong_t *)featureset, feature);
1454 1454 }
1455 1455
1456 1456 void
1457 1457 remove_x86_feature(void *featureset, uint_t feature)
1458 1458 {
1459 1459 ASSERT(feature < NUM_X86_FEATURES);
1460 1460 BT_CLEAR((ulong_t *)featureset, feature);
1461 1461 }
1462 1462
1463 1463 boolean_t
1464 1464 compare_x86_featureset(void *setA, void *setB)
1465 1465 {
1466 1466 /*
1467 1467 * We assume that the unused bits of the bitmap are always zero.
1468 1468 */
1469 1469 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1470 1470 return (B_TRUE);
1471 1471 } else {
1472 1472 return (B_FALSE);
1473 1473 }
1474 1474 }
1475 1475
1476 1476 void
1477 1477 print_x86_featureset(void *featureset)
1478 1478 {
1479 1479 uint_t i;
1480 1480
1481 1481 for (i = 0; i < NUM_X86_FEATURES; i++) {
1482 1482 if (is_x86_feature(featureset, i)) {
1483 1483 cmn_err(CE_CONT, "?x86_feature: %s\n",
1484 1484 x86_feature_names[i]);
1485 1485 }
1486 1486 }
1487 1487 }
1488 1488
1489 1489 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1490 1490 static size_t xsave_state_size = 0;
1491 1491 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1492 1492 boolean_t xsave_force_disable = B_FALSE;
1493 1493 extern int disable_smap;
1494 1494
1495 1495 /*
1496 1496 * This is set to platform type we are running on.
1497 1497 */
1498 1498 static int platform_type = -1;
1499 1499
1500 1500 #if !defined(__xpv)
1501 1501 /*
1502 1502 * Variable to patch if hypervisor platform detection needs to be
1503 1503 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1504 1504 */
1505 1505 int enable_platform_detection = 1;
1506 1506 #endif
1507 1507
1508 1508 /*
1509 1509 * monitor/mwait info.
1510 1510 *
1511 1511 * size_actual and buf_actual are the real address and size allocated to get
1512 1512 * proper mwait_buf alignement. buf_actual and size_actual should be passed
1513 1513 * proper mwait_buf alignment. buf_actual and size_actual should be passed
1514 1514 * processor cache-line alignment, but this is not guarantied in the furture.
1515 1515 * processor cache-line alignment, but this is not guaranteed in the future.
1516 1516 struct mwait_info {
1517 1517 size_t mon_min; /* min size to avoid missed wakeups */
1518 1518 size_t mon_max; /* size to avoid false wakeups */
1519 1519 size_t size_actual; /* size actually allocated */
1520 1520 void *buf_actual; /* memory actually allocated */
1521 1521 uint32_t support; /* processor support of monitor/mwait */
1522 1522 };
1523 1523
1524 1524 /*
1525 1525 * xsave/xrestor info.
1526 1526 *
1527 1527 * This structure contains HW feature bits and the size of the xsave save area.
1528 1528 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1529 1529 * (xsave_state) to describe the xsave layout. However, at runtime the
1530 1530 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1531 1531 * xsave_state structure simply represents the legacy layout of the beginning
1532 1532 * of the xsave area.
1533 1533 */
1534 1534 struct xsave_info {
1535 1535 uint32_t xsav_hw_features_low; /* Supported HW features */
1536 1536 uint32_t xsav_hw_features_high; /* Supported HW features */
1537 1537 size_t xsav_max_size; /* max size save area for HW features */
1538 1538 size_t ymm_size; /* AVX: size of ymm save area */
1539 1539 size_t ymm_offset; /* AVX: offset for ymm save area */
1540 1540 size_t bndregs_size; /* MPX: size of bndregs save area */
1541 1541 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1542 1542 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1543 1543 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1544 1544 size_t opmask_size; /* AVX512: size of opmask save */
1545 1545 size_t opmask_offset; /* AVX512: offset for opmask save */
1546 1546 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1547 1547 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1548 1548 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1549 1549 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1550 1550 };
1551 1551
1552 1552
1553 1553 /*
1554 1554 * These constants determine how many of the elements of the
1555 1555 * cpuid we cache in the cpuid_info data structure; the
1556 1556 * remaining elements are accessible via the cpuid instruction.
1557 1557 */
1558 1558
1559 1559 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1560 1560 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */
1561 1561
1562 1562 /*
1563 1563 * See the big theory statement for a more detailed explanation of what some of
1564 1564 * these members mean.
1565 1565 */
1566 1566 struct cpuid_info {
1567 1567 uint_t cpi_pass; /* last pass completed */
1568 1568 /*
1569 1569 * standard function information
1570 1570 */
1571 1571 uint_t cpi_maxeax; /* fn 0: %eax */
1572 1572 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1573 1573 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1574 1574
1575 1575 uint_t cpi_family; /* fn 1: extended family */
1576 1576 uint_t cpi_model; /* fn 1: extended model */
1577 1577 uint_t cpi_step; /* fn 1: stepping */
1578 1578 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1579 1579 /* AMD: package/socket # */
1580 1580 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1581 1581 int cpi_clogid; /* fn 1: %ebx: thread # */
1582 1582 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1583 1583 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1584 1584 uint_t cpi_ncache; /* fn 2: number of elements */
1585 1585 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1586 1586 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1587 1587 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1588 1588 /* Intel fn: 4, AMD fn: 8000001d */
1589 1589 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
1590 1590 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1591 1591 /*
1592 1592 * extended function information
1593 1593 */
1594 1594 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1595 1595 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1596 1596 uint8_t cpi_pabits; /* fn 0x80000006: %eax */
1597 1597 uint8_t cpi_vabits; /* fn 0x80000006: %eax */
1598 1598 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1599 1599 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1600 1600
1601 1601 id_t cpi_coreid; /* same coreid => strands share core */
1602 1602 int cpi_pkgcoreid; /* core number within single package */
1603 1603 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1604 1604 /* Intel: fn 4: %eax[31-26] */
1605 1605
1606 1606 /*
1607 1607 * These values represent the number of bits that are required to store
1608 1608 * information about the number of cores and threads.
1609 1609 */
1610 1610 uint_t cpi_ncore_bits;
1611 1611 uint_t cpi_nthread_bits;
1612 1612 /*
1613 1613 * supported feature information
1614 1614 */
1615 1615 uint32_t cpi_support[6];
1616 1616 #define STD_EDX_FEATURES 0
1617 1617 #define AMD_EDX_FEATURES 1
1618 1618 #define TM_EDX_FEATURES 2
1619 1619 #define STD_ECX_FEATURES 3
1620 1620 #define AMD_ECX_FEATURES 4
1621 1621 #define STD_EBX_FEATURES 5
1622 1622 /*
1623 1623 * Synthesized information, where known.
1624 1624 */
1625 1625 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1626 1626 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1627 1627 uint32_t cpi_socket; /* Chip package/socket type */
1628 1628
1629 1629 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1630 1630 uint32_t cpi_apicid;
1631 1631 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1632 1632 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1633 1633 /* Intel: 1 */
1634 1634 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1635 1635 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1636 1636
1637 1637 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */
1638 1638 };
1639 1639
1640 1640
1641 1641 static struct cpuid_info cpuid_info0;
1642 1642
1643 1643 /*
1644 1644 * These bit fields are defined by the Intel Application Note AP-485
1645 1645 * "Intel Processor Identification and the CPUID Instruction"
1646 1646 */
1647 1647 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1648 1648 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1649 1649 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1650 1650 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1651 1651 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1652 1652 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1653 1653
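/*
 * For reference, the effective (display) family and model combine the base
 * and extended fields roughly as follows. This is an illustrative sketch
 * only; the kernel's actual derivation is vendor-aware and handled in the
 * cpuid pass logic.
 *
 *	uint_t base_family = CPI_FAMILY(cpi);
 *	uint_t family = base_family;
 *	uint_t model = CPI_MODEL(cpi);
 *
 *	if (base_family == 0xf)
 *		family += CPI_FAMILY_XTD(cpi);
 *	if (base_family == 0x6 || base_family == 0xf)
 *		model += CPI_MODEL_XTD(cpi) << 4;
 */
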
1654 1654 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1655 1655 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1656 1656 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1657 1657 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1658 1658 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1659 1659 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1660 1660 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1661 1661
1662 1662 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1663 1663 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1664 1664 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1665 1665 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1666 1666
1667 1667 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1668 1668 #define CPI_XMAXEAX_MAX 0x80000100
1669 1669 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1670 1670 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1671 1671
1672 1672 /*
1673 1673 * Function 4 (Deterministic Cache Parameters) macros
1674 1674 * Defined by Intel Application Note AP-485
1675 1675 */
1676 1676 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1677 1677 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1678 1678 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1679 1679 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1680 1680 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1681 1681 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1682 1682 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1683 1683
1684 1684 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1685 1685 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1686 1686 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1687 1687
1688 1688 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1689 1689
1690 1690 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
1691 1691
1692 1692
1693 1693 /*
1694 1694 * A couple of shorthand macros to identify "later" P6-family chips
1695 1695 * like the Pentium M and Core. First, the "older" P6-based stuff
1696 1696 * (loosely defined as "pre-Pentium-4"):
1697 1697 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1698 1698 */
1699 1699 #define IS_LEGACY_P6(cpi) ( \
1700 1700 cpi->cpi_family == 6 && \
1701 1701 (cpi->cpi_model == 1 || \
1702 1702 cpi->cpi_model == 3 || \
1703 1703 cpi->cpi_model == 5 || \
1704 1704 cpi->cpi_model == 6 || \
1705 1705 cpi->cpi_model == 7 || \
1706 1706 cpi->cpi_model == 8 || \
1707 1707 cpi->cpi_model == 0xA || \
1708 1708 cpi->cpi_model == 0xB) \
1709 1709 )
1710 1710
1711 1711 /* A "new F6" is everything with family 6 that's not the above */
1712 1712 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1713 1713
1714 1714 /* Extended family/model support */
1715 1715 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1716 1716 cpi->cpi_family >= 0xf)
1717 1717
1718 1718 /*
1719 1719 * Info for monitor/mwait idle loop.
1720 1720 *
1721 1721 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1722 1722 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1723 1723 * 2006.
1724 1724 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1725 1725 * Documentation Updates" #33633, Rev 2.05, December 2006.
1726 1726 */
1727 1727 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
1728 1728 #define MWAIT_EXTENSIONS (0x00000002) /* extension supported */
1729 1729 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
1730 1730 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1731 1731 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
1732 1732 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
1733 1733 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1734 1734 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1735 1735 /*
1736 1736 * Number of sub-cstates for a given c-state.
1737 1737 */
1738 1738 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
1739 1739 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1740 1740
1741 1741 /*
1742 1742 * XSAVE leaf 0xD enumeration
1743 1743 */
1744 1744 #define CPUID_LEAFD_2_YMM_OFFSET 576
1745 1745 #define CPUID_LEAFD_2_YMM_SIZE 256
1746 1746
1747 1747 /*
1748 1748 * Common extended leaf names to cut down on typos.
1749 1749 */
1750 1750 #define CPUID_LEAF_EXT_0 0x80000000
1751 1751 #define CPUID_LEAF_EXT_8 0x80000008
1752 1752 #define CPUID_LEAF_EXT_1d 0x8000001d
1753 1753 #define CPUID_LEAF_EXT_1e 0x8000001e
1754 1754
1755 1755 /*
1756 1756 * Functions we consume from cpuid_subr.c; don't publish these in a header
1757 1757 * file to try and keep people using the expected cpuid_* interfaces.
1758 1758 */
1759 1759 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1760 1760 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1761 1761 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1762 1762 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1763 1763 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1764 1764
1765 1765 /*
1766 1766 * Apply various platform-dependent restrictions where the
1767 1767 * underlying platform restrictions mean the CPU can be marked
1768 1768 * as less capable than its cpuid instruction would imply.
1769 1769 */
1770 1770 #if defined(__xpv)
1771 1771 static void
1772 1772 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1773 1773 {
1774 1774 switch (eax) {
1775 1775 case 1: {
1776 1776 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1777 1777 0 : CPUID_INTC_EDX_MCA;
1778 1778 cp->cp_edx &=
1779 1779 ~(mcamask |
1780 1780 CPUID_INTC_EDX_PSE |
1781 1781 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1782 1782 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1783 1783 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1784 1784 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1785 1785 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1786 1786 break;
1787 1787 }
1788 1788
1789 1789 case 0x80000001:
1790 1790 cp->cp_edx &=
1791 1791 ~(CPUID_AMD_EDX_PSE |
1792 1792 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1793 1793 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1794 1794 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1795 1795 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1796 1796 CPUID_AMD_EDX_TSCP);
1797 1797 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1798 1798 break;
1799 1799 default:
1800 1800 break;
1801 1801 }
1802 1802
1803 1803 switch (vendor) {
1804 1804 case X86_VENDOR_Intel:
1805 1805 switch (eax) {
1806 1806 case 4:
1807 1807 /*
1808 1808 * Zero out the (ncores-per-chip - 1) field
1809 1809 */
1810 1810 cp->cp_eax &= 0x03fffffff;
1811 1811 break;
1812 1812 default:
1813 1813 break;
1814 1814 }
1815 1815 break;
1816 1816 case X86_VENDOR_AMD:
1817 1817 switch (eax) {
1818 1818
1819 1819 case 0x80000001:
1820 1820 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1821 1821 break;
1822 1822
1823 1823 case CPUID_LEAF_EXT_8:
1824 1824 /*
1825 1825 * Zero out the (ncores-per-chip - 1) field
1826 1826 */
1827 1827 cp->cp_ecx &= 0xffffff00;
1828 1828 break;
1829 1829 default:
1830 1830 break;
1831 1831 }
1832 1832 break;
1833 1833 default:
1834 1834 break;
1835 1835 }
1836 1836 }
1837 1837 #else
1838 1838 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
1839 1839 #endif
1840 1840
1841 1841 /*
1842 1842 * Some undocumented ways of patching the results of the cpuid
1843 1843 * instruction to permit running Solaris 10 on future cpus that
1844 1844 * we don't currently support. Could be set to non-zero values
1845 1845 * via settings in eeprom.
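 *
 * For example (illustrative only), a feature bit could be masked out of the
 * advertised %ecx features by adding a line such as the following to
 * /etc/system:
 *
 *	set cpuid_feature_ecx_exclude = 0x00000001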
1846 1846 */
1847 1847
1848 1848 uint32_t cpuid_feature_ecx_include;
1849 1849 uint32_t cpuid_feature_ecx_exclude;
1850 1850 uint32_t cpuid_feature_edx_include;
1851 1851 uint32_t cpuid_feature_edx_exclude;
1852 1852
1853 1853 /*
1854 1854 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1855 1855 */
1856 1856 void
1857 1857 cpuid_alloc_space(cpu_t *cpu)
1858 1858 {
1859 1859 /*
1860 1860 * By convention, cpu0 is the boot cpu, which is set up
1861 1861 * before memory allocation is available. All other cpus get
1862 1862 * their cpuid_info struct allocated here.
1863 1863 */
1864 1864 ASSERT(cpu->cpu_id != 0);
1865 1865 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1866 1866 cpu->cpu_m.mcpu_cpi =
1867 1867 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1868 1868 }
1869 1869
1870 1870 void
1871 1871 cpuid_free_space(cpu_t *cpu)
1872 1872 {
1873 1873 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1874 1874 int i;
1875 1875
1876 1876 ASSERT(cpi != NULL);
1877 1877 ASSERT(cpi != &cpuid_info0);
1878 1878
1879 1879 /*
1880 1880 * Free up any cache leaf related dynamic storage. The first entry was
1881 1881 * cached from the standard cpuid storage, so we should not free it.
1882 1882 */
1883 1883 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1884 1884 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1885 1885 if (cpi->cpi_cache_leaf_size > 0)
1886 1886 kmem_free(cpi->cpi_cache_leaves,
1887 1887 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1888 1888
1889 1889 kmem_free(cpi, sizeof (*cpi));
1890 1890 cpu->cpu_m.mcpu_cpi = NULL;
1891 1891 }
1892 1892
1893 1893 #if !defined(__xpv)
1894 1894 /*
1895 1895 * Determine the type of the underlying platform. This is used to customize
1896 1896 * initialization of various subsystems (e.g. TSC). determine_platform() must
1897 1897 * only ever be called once to prevent two processors from seeing different
1898 1898 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1899 1899 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1900 1900 */
1901 1901 void
1902 1902 determine_platform(void)
1903 1903 {
1904 1904 struct cpuid_regs cp;
1905 1905 uint32_t base;
1906 1906 uint32_t regs[4];
1907 1907 char *hvstr = (char *)regs;
1908 1908
1909 1909 ASSERT(platform_type == -1);
1910 1910
1911 1911 platform_type = HW_NATIVE;
1912 1912
1913 1913 if (!enable_platform_detection)
1914 1914 return;
1915 1915
1916 1916 /*
1917 1917 * If Hypervisor CPUID bit is set, try to determine hypervisor
1918 1918 * vendor signature, and set platform type accordingly.
1919 1919 *
1920 1920 * References:
1921 1921 * http://lkml.org/lkml/2008/10/1/246
1922 1922 * http://kb.vmware.com/kb/1009458
1923 1923 */
1924 1924 cp.cp_eax = 0x1;
1925 1925 (void) __cpuid_insn(&cp);
1926 1926 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1927 1927 cp.cp_eax = 0x40000000;
1928 1928 (void) __cpuid_insn(&cp);
1929 1929 regs[0] = cp.cp_ebx;
1930 1930 regs[1] = cp.cp_ecx;
1931 1931 regs[2] = cp.cp_edx;
1932 1932 regs[3] = 0;
1933 1933 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1934 1934 platform_type = HW_XEN_HVM;
1935 1935 return;
1936 1936 }
1937 1937 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1938 1938 platform_type = HW_VMWARE;
1939 1939 return;
1940 1940 }
1941 1941 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1942 1942 platform_type = HW_KVM;
1943 1943 return;
1944 1944 }
1945 1945 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1946 1946 platform_type = HW_BHYVE;
1947 1947 return;
1948 1948 }
1949 1949 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1950 1950 platform_type = HW_MICROSOFT;
1951 1951 } else {
1952 1952 /*
1953 1953 * Check older VMware hardware versions. VMware hypervisor is
1954 1954 * detected by performing an IN operation to VMware hypervisor
1955 1955 * port and checking that value returned in %ebx is VMware
1956 1956 * hypervisor magic value.
1957 1957 *
1958 1958 * References: http://kb.vmware.com/kb/1009458
1959 1959 */
1960 1960 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1961 1961 if (regs[1] == VMWARE_HVMAGIC) {
1962 1962 platform_type = HW_VMWARE;
1963 1963 return;
1964 1964 }
1965 1965 }
1966 1966
1967 1967 /*
1968 1968 * Check Xen hypervisor. In a fully virtualized domain,
1969 1969 * Xen's pseudo-cpuid function returns a string representing the
1970 1970 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1971 1971 * supported cpuid function. We need at least a (base + 2) leaf value
1972 1972 * to do what we want to do. Try different base values, since the
1973 1973 * hypervisor might use a different one depending on whether Hyper-V
1974 1974 * emulation is switched on by default or not.
1975 1975 */
1976 1976 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1977 1977 cp.cp_eax = base;
1978 1978 (void) __cpuid_insn(&cp);
1979 1979 regs[0] = cp.cp_ebx;
1980 1980 regs[1] = cp.cp_ecx;
1981 1981 regs[2] = cp.cp_edx;
1982 1982 regs[3] = 0;
1983 1983 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1984 1984 cp.cp_eax >= (base + 2)) {
1985 1985 platform_type &= ~HW_NATIVE;
1986 1986 platform_type |= HW_XEN_HVM;
1987 1987 return;
1988 1988 }
1989 1989 }
1990 1990 }
1991 1991
1992 1992 int
1993 1993 get_hwenv(void)
1994 1994 {
1995 1995 ASSERT(platform_type != -1);
1996 1996 return (platform_type);
1997 1997 }
1998 1998
1999 1999 int
2000 2000 is_controldom(void)
2001 2001 {
2002 2002 return (0);
2003 2003 }
2004 2004
2005 2005 #else
2006 2006
2007 2007 int
2008 2008 get_hwenv(void)
2009 2009 {
2010 2010 return (HW_XEN_PV);
2011 2011 }
2012 2012
2013 2013 int
2014 2014 is_controldom(void)
2015 2015 {
2016 2016 return (DOMAIN_IS_INITDOMAIN(xen_info));
2017 2017 }
2018 2018
2019 2019 #endif /* __xpv */
2020 2020
2021 2021 /*
2022 2022 * Make sure that we have gathered all of the CPUID leaves that we might need to
2023 2023 * determine topology. We assume that the standard leaf 1 has already been done
2024 2024 * and that xmaxeax has already been calculated.
2025 2025 */
2026 2026 static void
2027 2027 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2028 2028 {
2029 2029 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2030 2030
2031 2031 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2032 2032 struct cpuid_regs *cp;
2033 2033
2034 2034 cp = &cpi->cpi_extd[8];
2035 2035 cp->cp_eax = CPUID_LEAF_EXT_8;
2036 2036 (void) __cpuid_insn(cp);
2037 2037 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2038 2038 }
2039 2039
2040 2040 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2041 2041 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2042 2042 struct cpuid_regs *cp;
2043 2043
2044 2044 cp = &cpi->cpi_extd[0x1e];
2045 2045 cp->cp_eax = CPUID_LEAF_EXT_1e;
2046 2046 (void) __cpuid_insn(cp);
2047 2047 }
2048 2048 }
2049 2049
2050 2050 /*
2051 2051 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2052 2052 * it to everything else. If not, and we're on an AMD system where 8000001e is
2053 2053 * valid, then we use that. Otherwise, we fall back to the default value for the
2054 2054 * APIC ID in leaf 1.
2055 2055 */
2056 2056 static uint32_t
2057 2057 cpuid_gather_apicid(struct cpuid_info *cpi)
2058 2058 {
2059 2059 /*
2060 2060 * Leaf B changes based on the arguments to it. Because we don't cache
2061 2061 * it, we need to gather it again.
2062 2062 */
2063 2063 if (cpi->cpi_maxeax >= 0xB) {
2064 2064 struct cpuid_regs regs;
2065 2065 struct cpuid_regs *cp;
2066 2066
2067 2067 cp = &regs;
2068 2068 cp->cp_eax = 0xB;
2069 2069 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2070 2070 (void) __cpuid_insn(cp);
2071 2071
2072 2072 if (cp->cp_ebx != 0) {
2073 2073 return (cp->cp_edx);
2074 2074 }
2075 2075 }
2076 2076
2077 2077 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2078 2078 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2079 2079 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2080 2080 return (cpi->cpi_extd[0x1e].cp_eax);
2081 2081 }
2082 2082
2083 2083 return (CPI_APIC_ID(cpi));
2084 2084 }
2085 2085
2086 2086 /*
2087 2087 * For AMD processors, attempt to calculate the number of chips and cores that
2088 2088 * exist. The way that we do this varies based on the generation, because the
2089 2089 * generations themselves have changed dramatically.
2090 2090 *
2091 2091 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2092 2092 * However, with the advent of family 17h (Zen) it actually tells us the number
2093 2093 * of threads, so we need to look at leaf 0x8000001e if available to determine
2094 2094 * its value. Otherwise, for all prior families, the number of enabled cores is
2095 2095 * the same as threads.
2096 2096 *
2097 2097 * If we do not have leaf 0x80000008, then we assume that this processor does
2098 2098 * not have anything. AMD's older CPUID specification says there's no reason to
2099 2099 * fall back to leaf 1.
2100 2100 *
2101 2101 * In some virtualization cases we will not have leaf 8000001e or it will be
2102 2102 * zero. When that happens we assume the number of threads is one.
2103 2103 */
2104 2104 static void
2105 2105 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2106 2106 {
2107 2107 uint_t nthreads, nthread_per_core;
2108 2108
2109 2109 nthreads = nthread_per_core = 1;
2110 2110
2111 2111 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2112 2112 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2113 2113 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2114 2114 nthreads = CPI_CPU_COUNT(cpi);
2115 2115 }
2116 2116
2117 2117 /*
2118 2118 * For us to have threads, and know about it, we have to be at least at
2119 2119 * family 17h and have the cpuid bit that says we have extended
2120 2120 * topology.
2121 2121 */
2122 2122 if (cpi->cpi_family >= 0x17 &&
2123 2123 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2124 2124 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2125 2125 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2126 2126 }
2127 2127
2128 2128 *ncpus = nthreads;
2129 2129 *ncores = nthreads / nthread_per_core;
2130 2130 }
2131 2131
2132 2132 /*
2133 2133 * Seed the initial values for the cores and threads for an Intel based
2134 2134 * processor. These values will be overwritten if we detect that the processor
2135 2135 * supports CPUID leaf 0xb.
2136 2136 */
2137 2137 static void
2138 2138 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2139 2139 {
2140 2140 /*
2141 2141 * Only seed the number of physical cores from the first level leaf 4
2142 2142 * information. The number of threads there indicate how many share the
2143 2143 * L1 cache, which may or may not have anything to do with the number of
2144 2144 * logical CPUs per core.
2145 2145 */
2146 2146 if (cpi->cpi_maxeax >= 4) {
2147 2147 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2148 2148 } else {
2149 2149 *ncores = 1;
2150 2150 }
2151 2151
2152 2152 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2153 2153 *ncpus = CPI_CPU_COUNT(cpi);
2154 2154 } else {
2155 2155 *ncpus = *ncores;
2156 2156 }
2157 2157 }
2158 2158
2159 2159 static boolean_t
2160 2160 cpuid_leafB_getids(cpu_t *cpu)
2161 2161 {
2162 2162 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2163 2163 struct cpuid_regs regs;
2164 2164 struct cpuid_regs *cp;
2165 2165
2166 2166 if (cpi->cpi_maxeax < 0xB)
2167 2167 return (B_FALSE);
2168 2168
2169 2169 cp = &regs;
2170 2170 cp->cp_eax = 0xB;
2171 2171 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2172 2172
2173 2173 (void) __cpuid_insn(cp);
2174 2174
2175 2175 /*
2176 2176 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2177 2177 * indicates that the extended topology enumeration leaf is
2178 2178 * available.
2179 2179 */
2180 2180 if (cp->cp_ebx != 0) {
2181 2181 uint32_t x2apic_id = 0;
2182 2182 uint_t coreid_shift = 0;
2183 2183 uint_t ncpu_per_core = 1;
2184 2184 uint_t chipid_shift = 0;
2185 2185 uint_t ncpu_per_chip = 1;
2186 2186 uint_t i;
2187 2187 uint_t level;
2188 2188
2189 2189 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2190 2190 cp->cp_eax = 0xB;
2191 2191 cp->cp_ecx = i;
2192 2192
2193 2193 (void) __cpuid_insn(cp);
2194 2194 level = CPI_CPU_LEVEL_TYPE(cp);
2195 2195
2196 2196 if (level == 1) {
2197 2197 x2apic_id = cp->cp_edx;
2198 2198 coreid_shift = BITX(cp->cp_eax, 4, 0);
2199 2199 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2200 2200 } else if (level == 2) {
2201 2201 x2apic_id = cp->cp_edx;
2202 2202 chipid_shift = BITX(cp->cp_eax, 4, 0);
2203 2203 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2204 2204 }
2205 2205 }
2206 2206
2207 2207 /*
2208 2208 * cpi_apicid is taken care of in cpuid_gather_apicid.
2209 2209 */
2210 2210 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2211 2211 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2212 2212 ncpu_per_core;
2213 2213 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2214 2214 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2215 2215 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2216 2216 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2217 2217 cpi->cpi_procnodeid = cpi->cpi_chipid;
2218 2218 cpi->cpi_compunitid = cpi->cpi_coreid;
2219 2219
2220 2220 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2221 2221 cpi->cpi_nthread_bits = coreid_shift;
2222 2222 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2223 2223 }
2224 2224
2225 2225 return (B_TRUE);
2226 2226 } else {
2227 2227 return (B_FALSE);
2228 2228 }
2229 2229 }
2230 2230
2231 2231 static void
2232 2232 cpuid_intel_getids(cpu_t *cpu, void *feature)
2233 2233 {
2234 2234 uint_t i;
2235 2235 uint_t chipid_shift = 0;
2236 2236 uint_t coreid_shift = 0;
2237 2237 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2238 2238
2239 2239 /*
2240 2240 * There are no compute units or processor nodes currently on Intel.
2241 2241 * Always set these to one.
2242 2242 */
2243 2243 cpi->cpi_procnodes_per_pkg = 1;
2244 2244 cpi->cpi_cores_per_compunit = 1;
2245 2245
2246 2246 /*
2247 2247 * If cpuid Leaf B is present, use that to try and get this information.
2248 2248 * It will be the most accurate for Intel CPUs.
2249 2249 */
2250 2250 if (cpuid_leafB_getids(cpu))
2251 2251 return;
2252 2252
2253 2253 /*
2254 2254 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2255 2255 * and ncore_per_chip. These represent the largest power of two values
2256 2256 * that we need to cover all of the IDs in the system. Therefore, we use
2257 2257 * those values to seed the number of bits needed to cover information
2258 2258 * in the case when leaf B is not available. These values will probably
2259 2259 * be larger than required, but that's OK.
2260 2260 */
2261 2261 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2262 2262 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2263 2263
2264 2264 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2265 2265 chipid_shift++;
2266 2266
2267 2267 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2268 2268 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2269 2269
2270 2270 if (is_x86_feature(feature, X86FSET_CMP)) {
2271 2271 /*
2272 2272 * Multi-core (and possibly multi-threaded)
2273 2273 * processors.
2274 2274 */
2275 2275 uint_t ncpu_per_core;
2276 2276 if (cpi->cpi_ncore_per_chip == 1)
2277 2277 ncpu_per_core = cpi->cpi_ncpu_per_chip;
2278 2278 else if (cpi->cpi_ncore_per_chip > 1)
2279 2279 ncpu_per_core = cpi->cpi_ncpu_per_chip /
2280 2280 cpi->cpi_ncore_per_chip;
2281 2281 /*
2282 2282 * 8bit APIC IDs on dual core Pentiums
2283 2283 * look like this:
2284 2284 *
2285 2285 * +-----------------------+------+------+
2286 2286 * | Physical Package ID | MC | HT |
2287 2287 * +-----------------------+------+------+
2288 2288 * <------- chipid -------->
2289 2289 * <------- coreid --------------->
2290 2290 * <--- clogid -->
2291 2291 * <------>
2292 2292 * pkgcoreid
2293 2293 *
2294 2294 * Where the number of bits necessary to
2295 2295 * represent MC and HT fields together equals
2296 2296 * to the minimum number of bits necessary to
2297 2297 * store the value of cpi->cpi_ncpu_per_chip.
2298 2298 * Of those bits, the MC part uses the number
2299 2299 * of bits necessary to store the value of
2300 2300 * cpi->cpi_ncore_per_chip.
2301 2301 */
2302 2302 for (i = 1; i < ncpu_per_core; i <<= 1)
2303 2303 coreid_shift++;
2304 2304 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2305 2305 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2306 2306 } else if (is_x86_feature(feature, X86FSET_HTT)) {
2307 2307 /*
2308 2308 * Single-core multi-threaded processors.
2309 2309 */
2310 2310 cpi->cpi_coreid = cpi->cpi_chipid;
2311 2311 cpi->cpi_pkgcoreid = 0;
2312 2312 } else {
2313 2313 /*
2314 2314 * Single-core single-thread processors.
2315 2315 */
2316 2316 cpi->cpi_coreid = cpu->cpu_id;
2317 2317 cpi->cpi_pkgcoreid = 0;
2318 2318 }
2319 2319 cpi->cpi_procnodeid = cpi->cpi_chipid;
2320 2320 cpi->cpi_compunitid = cpi->cpi_coreid;
2321 2321 }
2322 2322
2323 2323 /*
2324 2324 * Historically, AMD has had CMP chips with only a single thread per core.
2325 2325 * However, starting in family 17h (Zen), this has changed and they now have
2326 2326 * multiple threads. Our internal core id needs to be a unique value.
2327 2327 *
2328 2328 * To determine the core id of an AMD system, if we're from a family before 17h,
2329 2329 * then we just use the cpu id, as that gives us a good value that will be
2330 2330 * unique for each core. If instead, we're on family 17h or later, then we need
2331 2331 * to do something more complicated. CPUID leaf 0x8000001e can tell us
2332 2332 * how many threads are in the system. Based on that, we'll shift the APIC ID.
2333 2333 * We can't use the normal core id in that leaf as it's only unique within the
2334 2334 * socket, which is perfect for cpi_pkgcoreid, but not us.
2335 2335 */
2336 2336 static id_t
2337 2337 cpuid_amd_get_coreid(cpu_t *cpu)
2338 2338 {
2339 2339 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2340 2340
2341 2341 if (cpi->cpi_family >= 0x17 &&
2342 2342 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2343 2343 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2344 2344 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2345 2345 if (nthreads > 1) {
2346 2346 VERIFY3U(nthreads, ==, 2);
2347 2347 return (cpi->cpi_apicid >> 1);
2348 2348 }
2349 2349 }
2350 2350
2351 2351 return (cpu->cpu_id);
2352 2352 }
2353 2353
2354 2354 /*
2355 2355 * Assigning IDs on AMD is more challenging. This is notable because of the
2356 2356 * following two facts:
2357 2357 *
2358 2358 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
2359 2359 * also no way to get an actual unique core id from the system. As such, we
2360 2360 * synthesize this case by using cpu->cpu_id. This scheme does not,
2361 2361 * however, guarantee that sibling cores of a chip will have sequential
2362 2362 * coreids starting at a multiple of the number of cores per chip - that is
2363 2363 * usually the case, but if the ACPI MADT table is presented in a different
2364 2364 * order then we need to perform a few more gymnastics for the pkgcoreid.
2365 2365 *
2366 2366 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2367 2367 * called compute units. These compute units share the L1I cache, L2 cache,
2368 2368 * and the FPU. To deal with this, a new topology leaf was added in
2369 2369 * 0x8000001e. However, parts of this leaf have different meanings
2370 2370 * once we get to family 0x17.
2371 2371 */
2372 2372
2373 2373 static void
2374 2374 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2375 2375 {
2376 2376 int i, first_half, coreidsz;
2377 2377 uint32_t nb_caps_reg;
2378 2378 uint_t node2_1;
2379 2379 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2380 2380 struct cpuid_regs *cp;
2381 2381
2382 2382 /*
2383 2383 * Calculate the core id (this comes from hardware in family 0x17 if it
2384 2384 * hasn't been stripped by virtualization). We always set the compute
2385 2385 * unit id to the same value. Also, initialize the default number of
2386 2386 * cores per compute unit and nodes per package. This will be
2387 2387 * overwritten when we know information about a particular family.
2388 2388 */
2389 2389 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2390 2390 cpi->cpi_compunitid = cpi->cpi_coreid;
2391 2391 cpi->cpi_cores_per_compunit = 1;
2392 2392 cpi->cpi_procnodes_per_pkg = 1;
2393 2393
2394 2394 /*
2395 2395 * To construct the logical ID, we need to determine how many APIC IDs
2396 2396 * are dedicated to the cores and threads. This is provided for us in
2397 2397 * 0x80000008. However, if it's not present (say due to virtualization),
2398 2398 * then we assume it's one. This should be present on all 64-bit AMD
2399 2399 * processors. It was added in family 0xf (Hammer).
2400 2400 */
2401 2401 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2402 2402 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2403 2403
2404 2404 /*
2405 2405  * In AMD parlance, a chip is really a node, while illumos
2406 2406 * uses chip as equivalent to socket/package.
2407 2407 */
2408 2408 if (coreidsz == 0) {
2409 2409 /* Use legacy method */
2410 2410 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2411 2411 coreidsz++;
2412 2412 if (coreidsz == 0)
2413 2413 coreidsz = 1;
2414 2414 }
2415 2415 } else {
2416 2416 /* Assume single-core part */
2417 2417 coreidsz = 1;
2418 2418 }
2419 2419 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2420 2420
2421 2421 /*
2422 2422 * The package core ID varies depending on the family. While it may be
2423 2423 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2424 2424 * this value is the core id in the given node. For non-virtualized
2425 2425 * family 17h, we need to take the logical core id and shift off the
2426 2426 * threads like we do when getting the core id. Otherwise, we can use
2427 2427 * the clogid as is. When family 17h is virtualized, the clogid should
2428 2428  * be sufficient: if we don't have valid data in the leaf, then we
2429 2429  * won't think we have SMT, in which case the cpi_clogid is all we
2430 2430  * need.
2431 2431 */
2432 2432 if (cpi->cpi_family >= 0x17 &&
2433 2433 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2434 2434 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2435 2435 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2436 2436 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2437 2437 if (nthreads > 1) {
2438 2438 VERIFY3U(nthreads, ==, 2);
2439 2439 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2440 2440 } else {
2441 2441 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2442 2442 }
2443 2443 } else {
2444 2444 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2445 2445 }
2446 2446
2447 2447 /*
2448 2448 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2449 2449 * (bulldozer) or newer, then we can derive all of this from leaf
2450 2450 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2451 2451 */
2452 2452 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2453 2453 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2454 2454 cp = &cpi->cpi_extd[0x1e];
2455 2455
2456 2456 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2457 2457 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2458 2458
2459 2459 /*
2460 2460 * For Bulldozer-era CPUs, recalculate the compute unit
2461 2461 * information.
2462 2462 */
2463 2463 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2464 2464 cpi->cpi_cores_per_compunit =
2465 2465 BITX(cp->cp_ebx, 15, 8) + 1;
2466 2466 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2467 2467 (cpi->cpi_ncore_per_chip /
2468 2468 cpi->cpi_cores_per_compunit) *
2469 2469 (cpi->cpi_procnodeid /
2470 2470 cpi->cpi_procnodes_per_pkg);
2471 2471 }
2472 2472 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2473 2473 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2474 2474 } else if (cpi->cpi_family == 0x10) {
2475 2475 /*
2476 2476 * See if we are a multi-node processor.
2477 2477  * All processors in the system have the same number of nodes.
2478 2478 */
2479 2479 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2480 2480 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2481 2481 /* Single-node */
2482 2482 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2483 2483 coreidsz);
2484 2484 } else {
2485 2485
2486 2486 /*
2487 2487 * Multi-node revision D (2 nodes per package
2488 2488 * are supported)
2489 2489 */
2490 2490 cpi->cpi_procnodes_per_pkg = 2;
2491 2491
2492 2492 first_half = (cpi->cpi_pkgcoreid <=
2493 2493 (cpi->cpi_ncore_per_chip/2 - 1));
2494 2494
2495 2495 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2496 2496 /* We are BSP */
2497 2497 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2498 2498 } else {
2499 2499
2500 2500 /* We are AP */
2501 2501 /* NodeId[2:1] bits to use for reading F3xe8 */
2502 2502 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2503 2503
2504 2504 nb_caps_reg =
2505 2505 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2506 2506
2507 2507 /*
2508 2508 * Check IntNodeNum bit (31:30, but bit 31 is
2509 2509 * always 0 on dual-node processors)
2510 2510 */
2511 2511 if (BITX(nb_caps_reg, 30, 30) == 0)
2512 2512 cpi->cpi_procnodeid = node2_1 +
2513 2513 !first_half;
2514 2514 else
2515 2515 cpi->cpi_procnodeid = node2_1 +
2516 2516 first_half;
2517 2517 }
2518 2518 }
2519 2519 } else {
2520 2520 cpi->cpi_procnodeid = 0;
2521 2521 }
2522 2522
2523 2523 cpi->cpi_chipid =
2524 2524 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2525 2525
2526 2526 cpi->cpi_ncore_bits = coreidsz;
2527 2527 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2528 2528 cpi->cpi_ncore_per_chip);
2529 2529 }
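
The interplay of coreidsz, clogid, and pkgcoreid can be followed with hypothetical leaf values; this sketch only mirrors the family 17h, two-threads-per-core path and is not kernel code.

#include <stdio.h>

#define BITX(u, h, l)   (((u) >> (l)) & ((1U << ((h) - (l) + 1)) - 1))

int
main(void)
{
        unsigned int ext8_ecx = 0x00004007; /* hypothetical 0x80000008 %ecx */
        unsigned int apicid = 0x13;         /* hypothetical APIC ID */
        unsigned int coreidsz = BITX(ext8_ecx, 15, 12); /* ApicIdCoreIdSize */
        unsigned int clogid = apicid & ((1U << coreidsz) - 1);

        /* With SMT (two threads), strip the thread bit to get pkgcoreid. */
        printf("coreidsz=%u clogid=0x%x pkgcoreid=%u\n",
            coreidsz, clogid, clogid >> 1);
        return (0);
}
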
2530 2530
2531 2531 static void
2532 2532 spec_uarch_flush_noop(void)
2533 2533 {
2534 2534 }
2535 2535
2536 2536 /*
2537 2537  * When microcode that mitigates MDS is present, this wrmsr also performs the
2538 2538  * MDS-related micro-architectural state flush that would otherwise be done by
2539 2539  * calling x86_md_clear().
2540 2540 */
2541 2541 static void
2542 2542 spec_uarch_flush_msr(void)
2543 2543 {
2544 2544 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2545 2545 }
2546 2546
2547 2547 /*
2548 2548 * This function points to a function that will flush certain
2549 2549 * micro-architectural state on the processor. This flush is used to mitigate
2550 2550 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2551 2551 * function can point to one of three functions:
2552 2552 *
2553 2553  * - A noop, used either because we are vulnerable but do not have microcode
2554 2554  *   available to help mitigate the issue, or because we are not vulnerable
2555 2555  *   at all.
2556 2556 *
2557 2557  * - spec_uarch_flush_msr, which will issue an L1D flush and, if microcode to
2558 2558  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2559 2559  *   however, it only flushes the MDS-related micro-architectural state on the
2560 2560  *   current hyperthread; it does not do anything for its twin.
2561 2561 *
2562 2562 * - x86_md_clear which will flush the MDS related state. This is done when we
2563 2563 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2564 2564 * (RDCL_NO is set).
2565 2565 */
2566 2566 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2567 2567
2568 2568 static void
2569 2569 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2570 2570 {
2571 2571 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2572 2572
2573 2573 /*
2574 2574 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2575 2575 * has been fixed in hardware, it doesn't cover everything related to
2576 2576 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2577 2577 * need to mitigate this.
2578 2578 */
2579 2579 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2580 2580 is_x86_feature(featureset, X86FSET_MDS_NO)) {
2581 2581 return;
2582 2582 }
2583 2583
2584 2584 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2585 2585 const uint8_t nop = NOP_INSTR;
2586 2586 uint8_t *md = (uint8_t *)x86_md_clear;
2587 2587
2588 2588 *md = nop;
2589 2589 }
2590 2590
2591 2591 membar_producer();
2592 2592 }
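
The patch here only makes sense if x86_md_clear begins with a return instruction that the NOP overwrites, which appears to be the intent; the idea can be modeled on a plain byte array (the real routine patches live kernel text, which this sketch does not attempt).

#include <stdio.h>
#include <stdint.h>

#define NOP_INSTR       0x90    /* one-byte x86 nop */

/* Hypothetical stub: a leading ret (0xc3) keeps the body disabled. */
static uint8_t md_clear_stub[] = { 0xc3, 0x90, 0x90, 0x90 };

int
main(void)
{
        int have_md_clear = 1;  /* assumed feature-bit outcome */

        if (have_md_clear)
                md_clear_stub[0] = NOP_INSTR;   /* let execution fall through */

        printf("first byte now 0x%02x\n", md_clear_stub[0]);
        return (0);
}
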
2593 2593
2594 2594 static void
2595 2595 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2596 2596 {
2597 2597 boolean_t need_l1d, need_mds;
2598 2598 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2599 2599
2600 2600 /*
2601 2601 * If we're not on Intel or we've mitigated both RDCL and MDS in
2602 2602 * hardware, then there's nothing left for us to do for enabling the
2603 2603 * flush. We can also go ahead and say that SMT exclusion is
2604 2604 * unnecessary.
2605 2605 */
2606 2606 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2607 2607 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2608 2608 is_x86_feature(featureset, X86FSET_MDS_NO))) {
2609 2609 extern int smt_exclusion;
2610 2610 smt_exclusion = 0;
2611 2611 spec_uarch_flush = spec_uarch_flush_noop;
2612 2612 membar_producer();
2613 2613 return;
2614 2614 }
2615 2615
2616 2616 /*
2617 2617  * The locations where we need to perform an L1D flush are the same ones
2618 2618  * where we need to mitigate MDS. When verw support is present in
2619 2619  * microcode, the L1D flush will take care of doing that as well.
2620 2620 * However, if we have a system where RDCL_NO is present, but we don't
2621 2621 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2622 2622 * L1D flush.
2623 2623 */
2624 2624 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2625 2625 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2626 2626 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2627 2627 need_l1d = B_TRUE;
2628 2628 } else {
2629 2629 need_l1d = B_FALSE;
2630 2630 }
2631 2631
2632 2632 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2633 2633 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2634 2634 need_mds = B_TRUE;
2635 2635 } else {
2636 2636 need_mds = B_FALSE;
2637 2637 }
2638 2638
2639 2639 if (need_l1d) {
2640 2640 spec_uarch_flush = spec_uarch_flush_msr;
2641 2641 } else if (need_mds) {
2642 2642 spec_uarch_flush = x86_md_clear;
2643 2643 } else {
2644 2644 /*
2645 2645 * We have no hardware mitigations available to us.
2646 2646 */
2647 2647 spec_uarch_flush = spec_uarch_flush_noop;
2648 2648 }
2649 2649 membar_producer();
2650 2650 }
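
A minimal sketch of the same select-then-call-through-a-pointer pattern, with stand-in handlers and hypothetical feature results; nothing below is the kernel's API.

#include <stdio.h>

static void flush_noop(void) { }
static void flush_msr(void)  { printf("L1D flush via MSR\n"); }
static void flush_verw(void) { printf("verw-based MD clear\n"); }

/* Default to the noop, just as spec_uarch_flush starts out above. */
static void (*uarch_flush)(void) = flush_noop;

int
main(void)
{
        int need_l1d = 1, need_mds = 0; /* hypothetical outcome of the checks */

        if (need_l1d)
                uarch_flush = flush_msr;
        else if (need_mds)
                uarch_flush = flush_verw;
        else
                uarch_flush = flush_noop;

        uarch_flush();  /* callers never care which handler was chosen */
        return (0);
}
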
2651 2651
2652 2652 /*
2653 2653 * We default to enabling RSB mitigations.
2654 2654 */
2655 2655 static void
2656 2656 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2657 2657 {
2658 2658 const uint8_t ret = RET_INSTR;
2659 2659 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2660 2660
2661 2661 switch (mit) {
2662 2662 case X86_SPECTREV2_ENHANCED_IBRS:
2663 2663 case X86_SPECTREV2_DISABLED:
2664 2664 *stuff = ret;
2665 2665 break;
2666 2666 default:
2667 2667 break;
2668 2668 }
2669 2669 }
2670 2670
2671 2671 static void
2672 2672 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2673 2673 {
2674 2674 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2675 2675 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2676 2676 "_r14", "_r15" };
2677 2677 const uint_t nthunks = ARRAY_SIZE(thunks);
2678 2678 const char *type;
2679 2679 uint_t i;
2680 2680
2681 2681 if (mit == x86_spectrev2_mitigation)
2682 2682 return;
2683 2683
2684 2684 switch (mit) {
2685 2685 case X86_SPECTREV2_RETPOLINE:
2686 2686 type = "gen";
2687 2687 break;
2688 2688 case X86_SPECTREV2_RETPOLINE_AMD:
2689 2689 type = "amd";
2690 2690 break;
2691 2691 case X86_SPECTREV2_ENHANCED_IBRS:
2692 2692 case X86_SPECTREV2_DISABLED:
2693 2693 type = "jmp";
2694 2694 break;
2695 2695 default:
2696 2696 panic("asked to update retpoline state with unknown state!");
2697 2697 }
2698 2698
2699 2699 for (i = 0; i < nthunks; i++) {
2700 2700 uintptr_t source, dest;
2701 2701 int ssize, dsize;
2702 2702 char sourcebuf[64], destbuf[64];
2703 2703 size_t len;
2704 2704
2705 2705 (void) snprintf(destbuf, sizeof (destbuf),
2706 2706 "__x86_indirect_thunk%s", thunks[i]);
2707 2707 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2708 2708 "__x86_indirect_thunk_%s%s", type, thunks[i]);
2709 2709
2710 2710 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2711 2711 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2712 2712 VERIFY3U(source, !=, 0);
2713 2713 VERIFY3U(dest, !=, 0);
2714 2714 VERIFY3S(dsize, >=, ssize);
2715 2715 bcopy((void *)source, (void *)dest, ssize);
2716 2716 }
2717 2717 }
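
Only the symbol-name pairing is shown in this sketch; kobj_getelfsym() and the text copy are left out, and the thunk list is abbreviated.

#include <stdio.h>

int
main(void)
{
        const char *thunks[] = { "_rax", "_rbx", "_rcx" };  /* abbreviated */
        const char *type = "gen";       /* flavor picked for this boot */
        char src[64], dst[64];
        unsigned int i;

        for (i = 0; i < sizeof (thunks) / sizeof (thunks[0]); i++) {
                (void) snprintf(dst, sizeof (dst),
                    "__x86_indirect_thunk%s", thunks[i]);
                (void) snprintf(src, sizeof (src),
                    "__x86_indirect_thunk_%s%s", type, thunks[i]);
                printf("%s <- %s\n", dst, src);
        }
        return (0);
}
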
2718 2718
2719 2719 static void
2720 2720 cpuid_enable_enhanced_ibrs(void)
2721 2721 {
2722 2722 uint64_t val;
2723 2723
2724 2724 val = rdmsr(MSR_IA32_SPEC_CTRL);
2725 2725 val |= IA32_SPEC_CTRL_IBRS;
2726 2726 wrmsr(MSR_IA32_SPEC_CTRL, val);
2727 2727 }
2728 2728
2729 2729 #ifndef __xpv
2730 2730 /*
2731 2731 * Determine whether or not we can use the AMD optimized retpoline
2732 2732 * functionality. We use this when we know we're on an AMD system and we can
2733 2733 * successfully verify that lfence is dispatch serializing.
2734 2734 */
2735 2735 static boolean_t
2736 2736 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2737 2737 {
2738 2738 uint64_t val;
2739 2739 on_trap_data_t otd;
2740 2740
2741 2741 if (cpi->cpi_vendor != X86_VENDOR_AMD)
2742 2742 return (B_FALSE);
2743 2743
2744 2744 /*
2745 2745 * We need to determine whether or not lfence is serializing. It always
2746 2746 * is on families 0xf and 0x11. On others, it's controlled by
2747 2747 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2748 2748  * crazy old family, don't try to do anything.
2749 2749 */
2750 2750 if (cpi->cpi_family < 0xf)
2751 2751 return (B_FALSE);
2752 2752 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2753 2753 return (B_TRUE);
2754 2754
2755 2755 /*
2756 2756 * While it may be tempting to use get_hwenv(), there are no promises
2757 2757  * that a hypervisor will actually declare itself as such in a
2758 2758 * friendly way. As such, try to read and set the MSR. If we can then
2759 2759 * read back the value we set (it wasn't just set to zero), then we go
2760 2760 * for it.
2761 2761 */
2762 2762 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2763 2763 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2764 2764 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2765 2765 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2766 2766 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2767 2767 } else {
2768 2768 val = 0;
2769 2769 }
2770 2770 no_trap();
2771 2771
2772 2772 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2773 2773 return (B_TRUE);
2774 2774 return (B_FALSE);
2775 2775 }
2776 2776 #endif /* !__xpv */
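
The write-then-read-back probe can be sketched against a fake register; the bit position below is a placeholder, not the architectural value.

#include <stdio.h>
#include <stdint.h>

#define LFENCE_DISPATCH (1ULL << 1)     /* placeholder bit, not authoritative */

/* Fake MSR storage standing in for rdmsr()/wrmsr() in this illustration. */
static uint64_t fake_msr;
static uint64_t rd(void) { return (fake_msr); }
static void wr(uint64_t v) { fake_msr = v; /* a hypervisor may ignore this */ }

int
main(void)
{
        uint64_t val = rd();

        /* Try to set the bit, then trust only what reads back. */
        wr(val | LFENCE_DISPATCH);
        val = rd();

        printf("lfence dispatch serializing: %s\n",
            (val & LFENCE_DISPATCH) != 0 ? "yes" : "no");
        return (0);
}
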
2777 2777
2778 2778 /*
2779 2779 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2780 2780 * we can disable TSX, we do so.
2781 2781 *
2782 2782 * This determination is done only on the boot CPU, potentially after loading
2783 2783 * updated microcode.
2784 2784 */
2785 2785 static void
2786 2786 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2787 2787 {
2788 2788 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2789 2789
2790 2790 VERIFY(cpu->cpu_id == 0);
2791 2791
2792 2792 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2793 2793 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2794 2794 return;
2795 2795 }
2796 2796
2797 2797 if (x86_disable_taa) {
2798 2798 x86_taa_mitigation = X86_TAA_DISABLED;
2799 2799 return;
2800 2800 }
2801 2801
2802 2802 /*
2803 2803 * If we do not have the ability to disable TSX, then our only
2804 2804 * mitigation options are in hardware (TAA_NO), or by using our existing
2805 2805 * MDS mitigation as described above. The latter relies upon us having
2806 2806 * configured MDS mitigations correctly! This includes disabling SMT if
2807 2807 * we want to cross-CPU-thread protection.
2808 2808  * we want cross-CPU-thread protection.
2809 2809 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2810 2810 /*
2811 2811 * It's not clear whether any parts will enumerate TAA_NO
2812 2812 * *without* TSX_CTRL, but let's mark it as such if we see this.
2813 2813 */
2814 2814 if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2815 2815 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2816 2816 return;
2817 2817 }
2818 2818
2819 2819 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2820 2820 !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2821 2821 x86_taa_mitigation = X86_TAA_MD_CLEAR;
2822 2822 } else {
2823 2823 x86_taa_mitigation = X86_TAA_NOTHING;
2824 2824 }
2825 2825 return;
2826 2826 }
2827 2827
2828 2828 /*
2829 2829 * We have TSX_CTRL, but we can only fully disable TSX if we're early
2830 2830 * enough in boot.
2831 2831 *
2832 2832 * Otherwise, we'll fall back to causing transactions to abort as our
2833 2833 * mitigation. TSX-using code will always take the fallback path.
2834 2834 */
2835 2835 if (cpi->cpi_pass < 4) {
2836 2836 x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2837 2837 } else {
2838 2838 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2839 2839 }
2840 2840 }
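
The decision flow above, minus the vendor and x86_disable_taa early exits, can be condensed into a compact sketch with hypothetical feature bits.

#include <stdio.h>

typedef enum { TAA_NOTHING, TAA_MD_CLEAR, TAA_TSX_DISABLE,
        TAA_TSX_FORCE_ABORT, TAA_HW_MITIGATED } taa_t;

int
main(void)
{
        /* Hypothetical feature bits; early_boot stands in for cpi_pass < 4. */
        int taa_no = 0, tsx_ctrl = 1, md_clear = 1, mds_no = 0, early_boot = 1;
        taa_t mit;

        if (!tsx_ctrl)
                mit = taa_no ? TAA_HW_MITIGATED :
                    ((md_clear && !mds_no) ? TAA_MD_CLEAR : TAA_NOTHING);
        else
                mit = early_boot ? TAA_TSX_DISABLE : TAA_TSX_FORCE_ABORT;

        printf("mitigation=%d\n", (int)mit);
        return (0);
}
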
2841 2841
2842 +/*
2843 + * As mentioned, we should only touch the MSR when we've got suitable
2844 + * microcode loaded on this CPU.
2845 + */
2842 2846 static void
2843 -cpuid_apply_tsx(x86_taa_mitigation_t taa)
2847 +cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
2844 2848 {
2845 2849 uint64_t val;
2846 2850
2847 2851 switch (taa) {
2848 2852 case X86_TAA_TSX_DISABLE:
2853 + if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2854 + return;
2849 2855 val = rdmsr(MSR_IA32_TSX_CTRL);
2850 2856 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2851 2857 wrmsr(MSR_IA32_TSX_CTRL, val);
2852 2858 break;
2853 2859 case X86_TAA_TSX_FORCE_ABORT:
2860 + if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2861 + return;
2854 2862 val = rdmsr(MSR_IA32_TSX_CTRL);
2855 2863 val |= IA32_TSX_CTRL_RTM_DISABLE;
2856 2864 wrmsr(MSR_IA32_TSX_CTRL, val);
2857 2865 break;
2858 2866 case X86_TAA_HW_MITIGATED:
2859 2867 case X86_TAA_MD_CLEAR:
2860 2868 case X86_TAA_DISABLED:
2861 2869 case X86_TAA_NOTHING:
2862 2870 break;
2863 2871 }
2864 2872 }
2865 2873
2866 2874 static void
2867 2875 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2868 2876 {
2869 2877 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2870 2878 x86_spectrev2_mitigation_t v2mit;
2871 2879
2872 2880 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2873 2881 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2874 2882 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2875 2883 add_x86_feature(featureset, X86FSET_IBPB);
2876 2884 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2877 2885 add_x86_feature(featureset, X86FSET_IBRS);
2878 2886 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2879 2887 add_x86_feature(featureset, X86FSET_STIBP);
2880 2888 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2881 2889 add_x86_feature(featureset, X86FSET_STIBP_ALL);
2882 2890 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2883 2891 add_x86_feature(featureset, X86FSET_SSBD);
2884 2892 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2885 2893 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2886 2894 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2887 2895 add_x86_feature(featureset, X86FSET_SSB_NO);
2888 2896 /*
2889 2897 * Don't enable enhanced IBRS unless we're told that we should
2890 2898 * prefer it and it has the same semantics as Intel. This is
2891 2899 * split into two bits rather than a single one.
2892 2900 */
2893 2901 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2894 2902 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2895 2903 add_x86_feature(featureset, X86FSET_IBRS_ALL);
2896 2904 }
2897 2905
2898 2906 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2899 2907 cpi->cpi_maxeax >= 7) {
2900 2908 struct cpuid_regs *ecp;
2901 2909 ecp = &cpi->cpi_std[7];
2902 2910
2903 2911 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2904 2912 add_x86_feature(featureset, X86FSET_MD_CLEAR);
2905 2913 }
2906 2914
2907 2915 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2908 2916 add_x86_feature(featureset, X86FSET_IBRS);
2909 2917 add_x86_feature(featureset, X86FSET_IBPB);
2910 2918 }
2911 2919
2912 2920 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2913 2921 add_x86_feature(featureset, X86FSET_STIBP);
2914 2922 }
2915 2923
2916 2924 /*
2917 2925 * Don't read the arch caps MSR on xpv where we lack the
2918 2926 * on_trap().
2919 2927 */
2920 2928 #ifndef __xpv
2921 2929 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2922 2930 on_trap_data_t otd;
2923 2931
2924 2932 /*
2925 2933 * Be paranoid and assume we'll get a #GP.
2926 2934 */
2927 2935 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2928 2936 uint64_t reg;
2929 2937
2930 2938 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2931 2939 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2932 2940 add_x86_feature(featureset,
2933 2941 X86FSET_RDCL_NO);
2934 2942 }
2935 2943 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2936 2944 add_x86_feature(featureset,
2937 2945 X86FSET_IBRS_ALL);
2938 2946 }
2939 2947 if (reg & IA32_ARCH_CAP_RSBA) {
2940 2948 add_x86_feature(featureset,
2941 2949 X86FSET_RSBA);
2942 2950 }
2943 2951 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2944 2952 add_x86_feature(featureset,
2945 2953 X86FSET_L1D_VM_NO);
2946 2954 }
2947 2955 if (reg & IA32_ARCH_CAP_SSB_NO) {
2948 2956 add_x86_feature(featureset,
2949 2957 X86FSET_SSB_NO);
2950 2958 }
2951 2959 if (reg & IA32_ARCH_CAP_MDS_NO) {
2952 2960 add_x86_feature(featureset,
2953 2961 X86FSET_MDS_NO);
2954 2962 }
2955 2963 if (reg & IA32_ARCH_CAP_TSX_CTRL) {
2956 2964 add_x86_feature(featureset,
2957 2965 X86FSET_TSX_CTRL);
2958 2966 }
2959 2967 if (reg & IA32_ARCH_CAP_TAA_NO) {
2960 2968 add_x86_feature(featureset,
2961 2969 X86FSET_TAA_NO);
2962 2970 }
2963 2971 }
2964 2972 no_trap();
2965 2973 }
2966 2974 #endif /* !__xpv */
2967 2975
2968 2976 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2969 2977 add_x86_feature(featureset, X86FSET_SSBD);
2970 2978
2971 2979 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2972 2980 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2973 2981 }
2974 2982
2975 2983 /*
2976 2984 * Take care of certain mitigations on the non-boot CPU. The boot CPU
2977 2985 * will have already run this function and determined what we need to
2978 2986 * do. This gives us a hook for per-HW thread mitigations such as
2979 - * enhanced IBRS, or disabling TSX. For TSX disabling, we need to be
2980 - * careful that we've had a chance to load ucode that enables the new
2981 - * MSRs.
2987 + * enhanced IBRS, or disabling TSX.
2982 2988 */
2983 2989 if (cpu->cpu_id != 0) {
2984 2990 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2985 2991 cpuid_enable_enhanced_ibrs();
2986 2992 }
2987 2993
2988 - if (cpi->cpi_pass >= 1)
2989 - cpuid_apply_tsx(x86_taa_mitigation);
2994 + cpuid_apply_tsx(x86_taa_mitigation, featureset);
2990 2995 return;
2991 2996 }
2992 2997
2993 2998 /*
2994 2999 * Go through and initialize various security mechanisms that we should
2995 3000 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
2996 3001 * TAA.
2997 3002 */
2998 3003
2999 3004 /*
3000 3005 * By default we've come in with retpolines enabled. Check whether we
3001 3006 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3002 3007 * by default, but disabled if we are using enhanced IBRS.
3003 3008 */
3004 3009 if (x86_disable_spectrev2 != 0) {
3005 3010 v2mit = X86_SPECTREV2_DISABLED;
3006 3011 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3007 3012 cpuid_enable_enhanced_ibrs();
3008 3013 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3009 3014 #ifndef __xpv
3010 3015 } else if (cpuid_use_amd_retpoline(cpi)) {
3011 3016 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
3012 3017 #endif /* !__xpv */
3013 3018 } else {
3014 3019 v2mit = X86_SPECTREV2_RETPOLINE;
3015 3020 }
3016 3021
3017 3022 cpuid_patch_retpolines(v2mit);
3018 3023 cpuid_patch_rsb(v2mit);
3019 3024 x86_spectrev2_mitigation = v2mit;
3020 3025 membar_producer();
3021 3026
3022 3027 /*
3023 3028 * We need to determine what changes are required for mitigating L1TF
3024 3029 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3025 3030 * is required.
3026 3031 *
3027 3032 * If any of these are present, then we need to flush u-arch state at
3028 3033 * various points. For MDS, we need to do so whenever we change to a
3029 3034 * lesser privilege level or we are halting the CPU. For L1TF we need to
3030 3035 * flush the L1D cache at VM entry. When we have microcode that handles
3031 3036 * MDS, the L1D flush also clears the other u-arch state that the
3032 3037 * md_clear does.
3033 3038 */
3034 3039
3035 3040 /*
3036 3041 * Update whether or not we need to be taking explicit action against
3037 3042 * MDS.
3038 3043 */
3039 3044 cpuid_update_md_clear(cpu, featureset);
3040 3045
3041 3046 /*
3042 3047 * Determine whether SMT exclusion is required and whether or not we
3043 3048 * need to perform an l1d flush.
3044 3049 */
3045 3050 cpuid_update_l1d_flush(cpu, featureset);
3046 3051
3047 3052 /*
3048 3053 * Determine what our mitigation strategy should be for TAA and then
3049 3054 * also apply TAA mitigations.
3050 3055 */
3051 3056 cpuid_update_tsx(cpu, featureset);
3052 - cpuid_apply_tsx(x86_taa_mitigation);
3057 + cpuid_apply_tsx(x86_taa_mitigation, featureset);
3053 3058 }
3054 3059
3055 3060 /*
3056 3061 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3057 3062 */
3058 3063 void
3059 3064 setup_xfem(void)
3060 3065 {
3061 3066 uint64_t flags = XFEATURE_LEGACY_FP;
3062 3067
3063 3068 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3064 3069
3065 3070 if (is_x86_feature(x86_featureset, X86FSET_SSE))
3066 3071 flags |= XFEATURE_SSE;
3067 3072
3068 3073 if (is_x86_feature(x86_featureset, X86FSET_AVX))
3069 3074 flags |= XFEATURE_AVX;
3070 3075
3071 3076 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3072 3077 flags |= XFEATURE_AVX512;
3073 3078
3074 3079 set_xcr(XFEATURE_ENABLED_MASK, flags);
3075 3080
3076 3081 xsave_bv_all = flags;
3077 3082 }
3078 3083
3079 3084 static void
3080 3085 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
3081 3086 {
3082 3087 struct cpuid_info *cpi;
3083 3088
3084 3089 cpi = cpu->cpu_m.mcpu_cpi;
3085 3090
3086 3091 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3087 3092 cpuid_gather_amd_topology_leaves(cpu);
3088 3093 }
3089 3094
3090 3095 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3091 3096
3092 3097 /*
3093 3098 * Before we can calculate the IDs that we should assign to this
3094 3099 * processor, we need to understand how many cores and threads it has.
3095 3100 */
3096 3101 switch (cpi->cpi_vendor) {
3097 3102 case X86_VENDOR_Intel:
3098 3103 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3099 3104 &cpi->cpi_ncore_per_chip);
3100 3105 break;
3101 3106 case X86_VENDOR_AMD:
3102 3107 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3103 3108 &cpi->cpi_ncore_per_chip);
3104 3109 break;
3105 3110 default:
3106 3111 /*
3107 3112 * If we have some other x86 compatible chip, it's not clear how
3108 3113 * they would behave. The most common case is virtualization
3109 3114 * today, though there are also 64-bit VIA chips. Assume that
3110 3115 * all we can get is the basic Leaf 1 HTT information.
3111 3116 */
3112 3117 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3113 3118 cpi->cpi_ncore_per_chip = 1;
3114 3119 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3115 3120 }
3116 3121 break;
3117 3122 }
3118 3123
3119 3124 /*
3120 3125 * Based on the calculated number of threads and cores, potentially
3121 3126 * assign the HTT and CMT features.
3122 3127 */
3123 3128 if (cpi->cpi_ncore_per_chip > 1) {
3124 3129 add_x86_feature(featureset, X86FSET_CMP);
3125 3130 }
3126 3131
3127 3132 if (cpi->cpi_ncpu_per_chip > 1 &&
3128 3133 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3129 3134 add_x86_feature(featureset, X86FSET_HTT);
3130 3135 }
3131 3136
3132 3137 /*
3133 3138  * Now that this has been set up, we need to go through and calculate all
3134 3139  * of the rest of the parameters that exist. If we think the CPU doesn't
3135 3140 * have either SMT (HTT) or CMP, then we basically go through and fake
3136 3141 * up information in some way. The most likely case for this is
3137 3142 * virtualization where we have a lot of partial topology information.
3138 3143 */
3139 3144 if (!is_x86_feature(featureset, X86FSET_HTT) &&
3140 3145 !is_x86_feature(featureset, X86FSET_CMP)) {
3141 3146 /*
3142 3147 * This is a single core, single-threaded processor.
3143 3148 */
3144 3149 cpi->cpi_procnodes_per_pkg = 1;
3145 3150 cpi->cpi_cores_per_compunit = 1;
3146 3151 cpi->cpi_compunitid = 0;
3147 3152 cpi->cpi_chipid = -1;
3148 3153 cpi->cpi_clogid = 0;
3149 3154 cpi->cpi_coreid = cpu->cpu_id;
3150 3155 cpi->cpi_pkgcoreid = 0;
3151 3156 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3152 3157 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3153 3158 } else {
3154 3159 cpi->cpi_procnodeid = cpi->cpi_chipid;
3155 3160 }
3156 3161 } else {
3157 3162 switch (cpi->cpi_vendor) {
3158 3163 case X86_VENDOR_Intel:
3159 3164 cpuid_intel_getids(cpu, featureset);
3160 3165 break;
3161 3166 case X86_VENDOR_AMD:
3162 3167 cpuid_amd_getids(cpu, featureset);
3163 3168 break;
3164 3169 default:
3165 3170 /*
3166 3171 * In this case, it's hard to say what we should do.
3167 3172 * We're going to model them to the OS as single core
3168 3173 * threads. We don't have a good identifier for them, so
3169 3174 * we're just going to use the cpu id all on a single
3170 3175 * chip.
3171 3176 *
3172 3177 * This case has historically been different from the
3173 3178 * case above where we don't have HTT or CMP. While they
3174 3179 * could be combined, we've opted to keep it separate to
3175 3180 * minimize the risk of topology changes in weird cases.
3176 3181 */
3177 3182 cpi->cpi_procnodes_per_pkg = 1;
3178 3183 cpi->cpi_cores_per_compunit = 1;
3179 3184 cpi->cpi_chipid = 0;
3180 3185 cpi->cpi_coreid = cpu->cpu_id;
3181 3186 cpi->cpi_clogid = cpu->cpu_id;
3182 3187 cpi->cpi_pkgcoreid = cpu->cpu_id;
3183 3188 cpi->cpi_procnodeid = cpi->cpi_chipid;
3184 3189 cpi->cpi_compunitid = cpi->cpi_coreid;
3185 3190 break;
3186 3191 }
3187 3192 }
3188 3193 }
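
A compact check of the CMP/HTT rules above, with made-up thread and core counts.

#include <stdio.h>

int
main(void)
{
        /* Hypothetical counts, as a cpuid enumeration might report them. */
        unsigned int ncpu_per_chip = 8, ncore_per_chip = 4;
        int cmp = ncore_per_chip > 1;
        int htt = ncpu_per_chip > 1 && ncpu_per_chip != ncore_per_chip;

        printf("CMP=%d HTT=%d\n", cmp, htt);    /* prints CMP=1 HTT=1 */
        return (0);
}
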
3189 3194
3190 3195 /*
3191 3196  * Gather relevant CPU features from leaf 6, which covers thermal information. We
3192 3197 * always gather leaf 6 if it's supported; however, we only look for features on
3193 3198 * Intel systems as AMD does not currently define any of the features we look
3194 3199 * for below.
3195 3200 */
3196 3201 static void
3197 3202 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3198 3203 {
3199 3204 struct cpuid_regs *cp;
3200 3205 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3201 3206
3202 3207 if (cpi->cpi_maxeax < 6) {
3203 3208 return;
3204 3209 }
3205 3210
3206 3211 cp = &cpi->cpi_std[6];
3207 3212 cp->cp_eax = 6;
3208 3213 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3209 3214 (void) __cpuid_insn(cp);
3210 3215 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3211 3216
3212 3217 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3213 3218 return;
3214 3219 }
3215 3220
3216 3221 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3217 3222 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3218 3223 }
3219 3224
3220 3225 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3221 3226 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3222 3227 }
3223 3228 }
3224 3229
3225 3230 void
3226 3231 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3227 3232 {
3228 3233 uint32_t mask_ecx, mask_edx;
3229 3234 struct cpuid_info *cpi;
3230 3235 struct cpuid_regs *cp;
3231 3236 int xcpuid;
3232 3237 #if !defined(__xpv)
3233 3238 extern int idle_cpu_prefer_mwait;
3234 3239 #endif
3235 3240
3236 3241 /*
3237 3242 * Space statically allocated for BSP, ensure pointer is set
3238 3243 */
3239 3244 if (cpu->cpu_id == 0) {
3240 3245 if (cpu->cpu_m.mcpu_cpi == NULL)
3241 3246 cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3242 3247 }
3243 3248
3244 3249 add_x86_feature(featureset, X86FSET_CPUID);
3245 3250
3246 3251 cpi = cpu->cpu_m.mcpu_cpi;
3247 3252 ASSERT(cpi != NULL);
3248 3253 cp = &cpi->cpi_std[0];
3249 3254 cp->cp_eax = 0;
3250 3255 cpi->cpi_maxeax = __cpuid_insn(cp);
3251 3256 {
3252 3257 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3253 3258 *iptr++ = cp->cp_ebx;
3254 3259 *iptr++ = cp->cp_edx;
3255 3260 *iptr++ = cp->cp_ecx;
3256 3261 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3257 3262 }
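        /*
         * The registers interleave the vendor string in %ebx, %edx, %ecx
         * order; for example, "GenuineIntel" is assembled from "Genu",
         * "ineI", and "ntel".
         */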
3258 3263
3259 3264 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3260 3265 x86_vendor = cpi->cpi_vendor; /* for compatibility */
3261 3266
3262 3267 /*
3263 3268 * Limit the range in case of weird hardware
3264 3269 */
3265 3270 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3266 3271 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3267 3272 if (cpi->cpi_maxeax < 1)
3268 3273 goto pass1_done;
3269 3274
3270 3275 cp = &cpi->cpi_std[1];
3271 3276 cp->cp_eax = 1;
3272 3277 (void) __cpuid_insn(cp);
3273 3278
3274 3279 /*
3275 3280 * Extract identifying constants for easy access.
3276 3281 */
3277 3282 cpi->cpi_model = CPI_MODEL(cpi);
3278 3283 cpi->cpi_family = CPI_FAMILY(cpi);
3279 3284
3280 3285 if (cpi->cpi_family == 0xf)
3281 3286 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3282 3287
3283 3288 /*
3284 3289 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3285 3290 * Intel, and presumably everyone else, uses model == 0xf, as
3286 3291 * one would expect (max value means possible overflow). Sigh.
3287 3292 */
3288 3293
3289 3294 switch (cpi->cpi_vendor) {
3290 3295 case X86_VENDOR_Intel:
3291 3296 if (IS_EXTENDED_MODEL_INTEL(cpi))
3292 3297 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3293 3298 break;
3294 3299 case X86_VENDOR_AMD:
3295 3300 if (CPI_FAMILY(cpi) == 0xf)
3296 3301 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3297 3302 break;
3298 3303 default:
3299 3304 if (cpi->cpi_model == 0xf)
3300 3305 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3301 3306 break;
3302 3307 }
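        /*
         * Worked example (hypothetical raw values): base family 0xf with an
         * extended family of 0x8 yields cpi_family 0x17; on AMD that also
         * pulls in the extended model, so base model 0x1 with extended model
         * 0x7 yields cpi_model 0x71.
         */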
3303 3308
3304 3309 cpi->cpi_step = CPI_STEP(cpi);
3305 3310 cpi->cpi_brandid = CPI_BRANDID(cpi);
3306 3311
3307 3312 /*
3308 3313 * *default* assumptions:
3309 3314 * - believe %edx feature word
3310 3315 * - ignore %ecx feature word
3311 3316 * - 32-bit virtual and physical addressing
3312 3317 */
3313 3318 mask_edx = 0xffffffff;
3314 3319 mask_ecx = 0;
3315 3320
3316 3321 cpi->cpi_pabits = cpi->cpi_vabits = 32;
3317 3322
3318 3323 switch (cpi->cpi_vendor) {
3319 3324 case X86_VENDOR_Intel:
3320 3325 if (cpi->cpi_family == 5)
3321 3326 x86_type = X86_TYPE_P5;
3322 3327 else if (IS_LEGACY_P6(cpi)) {
3323 3328 x86_type = X86_TYPE_P6;
3324 3329 pentiumpro_bug4046376 = 1;
3325 3330 /*
3326 3331 * Clear the SEP bit when it was set erroneously
3327 3332 */
3328 3333 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3329 3334 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3330 3335 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3331 3336 x86_type = X86_TYPE_P4;
3332 3337 /*
3333 3338 * We don't currently depend on any of the %ecx
3334 3339 * features until Prescott, so we'll only check
3335 3340 * this from P4 onwards. We might want to revisit
3336 3341 * that idea later.
3337 3342 */
3338 3343 mask_ecx = 0xffffffff;
3339 3344 } else if (cpi->cpi_family > 0xf)
3340 3345 mask_ecx = 0xffffffff;
3341 3346 /*
3342 3347 * We don't support MONITOR/MWAIT if leaf 5 is not available
3343 3348 * to obtain the monitor linesize.
3344 3349 */
3345 3350 if (cpi->cpi_maxeax < 5)
3346 3351 mask_ecx &= ~CPUID_INTC_ECX_MON;
3347 3352 break;
3348 3353 case X86_VENDOR_IntelClone:
3349 3354 default:
3350 3355 break;
3351 3356 case X86_VENDOR_AMD:
3352 3357 #if defined(OPTERON_ERRATUM_108)
3353 3358 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3354 3359 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3355 3360 cpi->cpi_model = 0xc;
3356 3361 } else
3357 3362 #endif
3358 3363 if (cpi->cpi_family == 5) {
3359 3364 /*
3360 3365 * AMD K5 and K6
3361 3366 *
3362 3367 * These CPUs have an incomplete implementation
3363 3368 * of MCA/MCE which we mask away.
3364 3369 */
3365 3370 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3366 3371
3367 3372 /*
3368 3373 * Model 0 uses the wrong (APIC) bit
3369 3374 * to indicate PGE. Fix it here.
3370 3375 */
3371 3376 if (cpi->cpi_model == 0) {
3372 3377 if (cp->cp_edx & 0x200) {
3373 3378 cp->cp_edx &= ~0x200;
3374 3379 cp->cp_edx |= CPUID_INTC_EDX_PGE;
3375 3380 }
3376 3381 }
3377 3382
3378 3383 /*
3379 3384 * Early models had problems w/ MMX; disable.
3380 3385 */
3381 3386 if (cpi->cpi_model < 6)
3382 3387 mask_edx &= ~CPUID_INTC_EDX_MMX;
3383 3388 }
3384 3389
3385 3390 /*
3386 3391 * For newer families, SSE3 and CX16, at least, are valid;
3387 3392 * enable all
3388 3393 */
3389 3394 if (cpi->cpi_family >= 0xf)
3390 3395 mask_ecx = 0xffffffff;
3391 3396 /*
3392 3397 * We don't support MONITOR/MWAIT if leaf 5 is not available
3393 3398 * to obtain the monitor linesize.
3394 3399 */
3395 3400 if (cpi->cpi_maxeax < 5)
3396 3401 mask_ecx &= ~CPUID_INTC_ECX_MON;
3397 3402
3398 3403 #if !defined(__xpv)
3399 3404 /*
3400 3405 * AMD has not historically used MWAIT in the CPU's idle loop.
3401 3406 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3402 3407 * know for certain that in at least family 17h, per AMD, mwait
3403 3408 * is preferred. Families in-between are less certain.
3404 3409 */
3405 3410 if (cpi->cpi_family < 0x17) {
3406 3411 idle_cpu_prefer_mwait = 0;
3407 3412 }
3408 3413 #endif
3409 3414
3410 3415 break;
3411 3416 case X86_VENDOR_TM:
3412 3417 /*
3413 3418 * workaround the NT workaround in CMS 4.1
3414 3419 */
3415 3420 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3416 3421 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3417 3422 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3418 3423 break;
3419 3424 case X86_VENDOR_Centaur:
3420 3425 /*
3421 3426 * workaround the NT workarounds again
3422 3427 */
3423 3428 if (cpi->cpi_family == 6)
3424 3429 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3425 3430 break;
3426 3431 case X86_VENDOR_Cyrix:
3427 3432 /*
3428 3433 * We rely heavily on the probing in locore
3429 3434 * to actually figure out what parts, if any,
3430 3435 * of the Cyrix cpuid instruction to believe.
3431 3436 */
3432 3437 switch (x86_type) {
3433 3438 case X86_TYPE_CYRIX_486:
3434 3439 mask_edx = 0;
3435 3440 break;
3436 3441 case X86_TYPE_CYRIX_6x86:
3437 3442 mask_edx = 0;
3438 3443 break;
3439 3444 case X86_TYPE_CYRIX_6x86L:
3440 3445 mask_edx =
3441 3446 CPUID_INTC_EDX_DE |
3442 3447 CPUID_INTC_EDX_CX8;
3443 3448 break;
3444 3449 case X86_TYPE_CYRIX_6x86MX:
3445 3450 mask_edx =
3446 3451 CPUID_INTC_EDX_DE |
3447 3452 CPUID_INTC_EDX_MSR |
3448 3453 CPUID_INTC_EDX_CX8 |
3449 3454 CPUID_INTC_EDX_PGE |
3450 3455 CPUID_INTC_EDX_CMOV |
3451 3456 CPUID_INTC_EDX_MMX;
3452 3457 break;
3453 3458 case X86_TYPE_CYRIX_GXm:
3454 3459 mask_edx =
3455 3460 CPUID_INTC_EDX_MSR |
3456 3461 CPUID_INTC_EDX_CX8 |
3457 3462 CPUID_INTC_EDX_CMOV |
3458 3463 CPUID_INTC_EDX_MMX;
3459 3464 break;
3460 3465 case X86_TYPE_CYRIX_MediaGX:
3461 3466 break;
3462 3467 case X86_TYPE_CYRIX_MII:
3463 3468 case X86_TYPE_VIA_CYRIX_III:
3464 3469 mask_edx =
3465 3470 CPUID_INTC_EDX_DE |
3466 3471 CPUID_INTC_EDX_TSC |
3467 3472 CPUID_INTC_EDX_MSR |
3468 3473 CPUID_INTC_EDX_CX8 |
3469 3474 CPUID_INTC_EDX_PGE |
3470 3475 CPUID_INTC_EDX_CMOV |
3471 3476 CPUID_INTC_EDX_MMX;
3472 3477 break;
3473 3478 default:
3474 3479 break;
3475 3480 }
3476 3481 break;
3477 3482 }
3478 3483
3479 3484 #if defined(__xpv)
3480 3485 /*
3481 3486 * Do not support MONITOR/MWAIT under a hypervisor
3482 3487 */
3483 3488 mask_ecx &= ~CPUID_INTC_ECX_MON;
3484 3489 /*
3485 3490 * Do not support XSAVE under a hypervisor for now
3486 3491 */
3487 3492 xsave_force_disable = B_TRUE;
3488 3493
3489 3494 #endif /* __xpv */
3490 3495
3491 3496 if (xsave_force_disable) {
3492 3497 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3493 3498 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3494 3499 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3495 3500 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3496 3501 }
3497 3502
3498 3503 /*
3499 3504 * Now we've figured out the masks that determine
3500 3505 * which bits we choose to believe, apply the masks
3501 3506 * to the feature words, then map the kernel's view
3502 3507 * of these feature words into its feature word.
3503 3508 */
3504 3509 cp->cp_edx &= mask_edx;
3505 3510 cp->cp_ecx &= mask_ecx;
3506 3511
3507 3512 /*
3508 3513 * apply any platform restrictions (we don't call this
3509 3514 * immediately after __cpuid_insn here, because we need the
3510 3515 * workarounds applied above first)
3511 3516 */
3512 3517 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3513 3518
3514 3519 /*
3515 3520 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3516 3521 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3517 3522 */
3518 3523 if (cpi->cpi_maxeax >= 7) {
3519 3524 struct cpuid_regs *ecp;
3520 3525 ecp = &cpi->cpi_std[7];
3521 3526 ecp->cp_eax = 7;
3522 3527 ecp->cp_ecx = 0;
3523 3528 (void) __cpuid_insn(ecp);
3524 3529
3525 3530 /*
3526 3531 * If XSAVE has been disabled, just ignore all of the
3527 3532 * extended-save-area dependent flags here.
3528 3533 */
3529 3534 if (xsave_force_disable) {
3530 3535 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3531 3536 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3532 3537 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3533 3538 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3534 3539 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3535 3540 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3536 3541 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3537 3542 }
3538 3543
3539 3544 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3540 3545 add_x86_feature(featureset, X86FSET_SMEP);
3541 3546
3542 3547 /*
3543 3548 * We check disable_smap here in addition to in startup_smap()
3544 3549 * to ensure CPUs that aren't the boot CPU don't accidentally
3545 3550 * include it in the feature set and thus generate a mismatched
3546 3551 * x86 feature set across CPUs.
3547 3552 */
3548 3553 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3549 3554 disable_smap == 0)
3550 3555 add_x86_feature(featureset, X86FSET_SMAP);
3551 3556
3552 3557 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3553 3558 add_x86_feature(featureset, X86FSET_RDSEED);
3554 3559
3555 3560 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3556 3561 add_x86_feature(featureset, X86FSET_ADX);
3557 3562
3558 3563 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3559 3564 add_x86_feature(featureset, X86FSET_FSGSBASE);
3560 3565
3561 3566 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3562 3567 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3563 3568
3564 3569 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3565 3570 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3566 3571 add_x86_feature(featureset, X86FSET_INVPCID);
3567 3572
3568 3573 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3569 3574 add_x86_feature(featureset, X86FSET_MPX);
3570 3575
3571 3576 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3572 3577 add_x86_feature(featureset, X86FSET_CLWB);
3573 3578 }
3574 3579 }
3575 3580
3576 3581 /*
3577 3582 * fold in overrides from the "eeprom" mechanism
3578 3583 */
3579 3584 cp->cp_edx |= cpuid_feature_edx_include;
3580 3585 cp->cp_edx &= ~cpuid_feature_edx_exclude;
3581 3586
3582 3587 cp->cp_ecx |= cpuid_feature_ecx_include;
3583 3588 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3584 3589
3585 3590 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3586 3591 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3587 3592 }
3588 3593 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3589 3594 add_x86_feature(featureset, X86FSET_TSC);
3590 3595 }
3591 3596 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3592 3597 add_x86_feature(featureset, X86FSET_MSR);
3593 3598 }
3594 3599 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3595 3600 add_x86_feature(featureset, X86FSET_MTRR);
3596 3601 }
3597 3602 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3598 3603 add_x86_feature(featureset, X86FSET_PGE);
3599 3604 }
3600 3605 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3601 3606 add_x86_feature(featureset, X86FSET_CMOV);
3602 3607 }
3603 3608 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3604 3609 add_x86_feature(featureset, X86FSET_MMX);
3605 3610 }
3606 3611 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3607 3612 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3608 3613 add_x86_feature(featureset, X86FSET_MCA);
3609 3614 }
3610 3615 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3611 3616 add_x86_feature(featureset, X86FSET_PAE);
3612 3617 }
3613 3618 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3614 3619 add_x86_feature(featureset, X86FSET_CX8);
3615 3620 }
3616 3621 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3617 3622 add_x86_feature(featureset, X86FSET_CX16);
3618 3623 }
3619 3624 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3620 3625 add_x86_feature(featureset, X86FSET_PAT);
3621 3626 }
3622 3627 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3623 3628 add_x86_feature(featureset, X86FSET_SEP);
3624 3629 }
3625 3630 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3626 3631 /*
3627 3632 * In our implementation, fxsave/fxrstor
3628 3633 * are prerequisites before we'll even
3629 3634  * try to do SSE things.
3630 3635 */
3631 3636 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3632 3637 add_x86_feature(featureset, X86FSET_SSE);
3633 3638 }
3634 3639 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3635 3640 add_x86_feature(featureset, X86FSET_SSE2);
3636 3641 }
3637 3642 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3638 3643 add_x86_feature(featureset, X86FSET_SSE3);
3639 3644 }
3640 3645 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3641 3646 add_x86_feature(featureset, X86FSET_SSSE3);
3642 3647 }
3643 3648 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3644 3649 add_x86_feature(featureset, X86FSET_SSE4_1);
3645 3650 }
3646 3651 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3647 3652 add_x86_feature(featureset, X86FSET_SSE4_2);
3648 3653 }
3649 3654 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3650 3655 add_x86_feature(featureset, X86FSET_AES);
3651 3656 }
3652 3657 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3653 3658 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3654 3659 }
3655 3660
3656 3661 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3657 3662 add_x86_feature(featureset, X86FSET_SHA);
3658 3663
3659 3664 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3660 3665 add_x86_feature(featureset, X86FSET_UMIP);
3661 3666 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3662 3667 add_x86_feature(featureset, X86FSET_PKU);
3663 3668 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3664 3669 add_x86_feature(featureset, X86FSET_OSPKE);
3665 3670
3666 3671 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3667 3672 add_x86_feature(featureset, X86FSET_XSAVE);
3668 3673
3669 3674 /* We only test AVX & AVX512 when there is XSAVE */
3670 3675
3671 3676 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3672 3677 add_x86_feature(featureset,
3673 3678 X86FSET_AVX);
3674 3679
3675 3680 /*
3676 3681 * Intel says we can't check these without also
3677 3682 * checking AVX.
3678 3683 */
3679 3684 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3680 3685 add_x86_feature(featureset,
3681 3686 X86FSET_F16C);
3682 3687
3683 3688 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3684 3689 add_x86_feature(featureset,
3685 3690 X86FSET_FMA);
3686 3691
3687 3692 if (cpi->cpi_std[7].cp_ebx &
3688 3693 CPUID_INTC_EBX_7_0_BMI1)
3689 3694 add_x86_feature(featureset,
3690 3695 X86FSET_BMI1);
3691 3696
3692 3697 if (cpi->cpi_std[7].cp_ebx &
3693 3698 CPUID_INTC_EBX_7_0_BMI2)
3694 3699 add_x86_feature(featureset,
3695 3700 X86FSET_BMI2);
3696 3701
3697 3702 if (cpi->cpi_std[7].cp_ebx &
3698 3703 CPUID_INTC_EBX_7_0_AVX2)
3699 3704 add_x86_feature(featureset,
3700 3705 X86FSET_AVX2);
3701 3706 }
3702 3707
3703 3708 if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3704 3709 (cpi->cpi_std[7].cp_ebx &
3705 3710 CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3706 3711 add_x86_feature(featureset, X86FSET_AVX512F);
3707 3712
3708 3713 if (cpi->cpi_std[7].cp_ebx &
3709 3714 CPUID_INTC_EBX_7_0_AVX512DQ)
3710 3715 add_x86_feature(featureset,
3711 3716 X86FSET_AVX512DQ);
3712 3717 if (cpi->cpi_std[7].cp_ebx &
3713 3718 CPUID_INTC_EBX_7_0_AVX512IFMA)
3714 3719 add_x86_feature(featureset,
3715 3720 X86FSET_AVX512FMA);
3716 3721 if (cpi->cpi_std[7].cp_ebx &
3717 3722 CPUID_INTC_EBX_7_0_AVX512PF)
3718 3723 add_x86_feature(featureset,
3719 3724 X86FSET_AVX512PF);
3720 3725 if (cpi->cpi_std[7].cp_ebx &
3721 3726 CPUID_INTC_EBX_7_0_AVX512ER)
3722 3727 add_x86_feature(featureset,
3723 3728 X86FSET_AVX512ER);
3724 3729 if (cpi->cpi_std[7].cp_ebx &
3725 3730 CPUID_INTC_EBX_7_0_AVX512CD)
3726 3731 add_x86_feature(featureset,
3727 3732 X86FSET_AVX512CD);
3728 3733 if (cpi->cpi_std[7].cp_ebx &
3729 3734 CPUID_INTC_EBX_7_0_AVX512BW)
3730 3735 add_x86_feature(featureset,
3731 3736 X86FSET_AVX512BW);
3732 3737 if (cpi->cpi_std[7].cp_ebx &
3733 3738 CPUID_INTC_EBX_7_0_AVX512VL)
3734 3739 add_x86_feature(featureset,
3735 3740 X86FSET_AVX512VL);
3736 3741
3737 3742 if (cpi->cpi_std[7].cp_ecx &
3738 3743 CPUID_INTC_ECX_7_0_AVX512VBMI)
3739 3744 add_x86_feature(featureset,
3740 3745 X86FSET_AVX512VBMI);
3741 3746 if (cpi->cpi_std[7].cp_ecx &
3742 3747 CPUID_INTC_ECX_7_0_AVX512VNNI)
3743 3748 add_x86_feature(featureset,
3744 3749 X86FSET_AVX512VNNI);
3745 3750 if (cpi->cpi_std[7].cp_ecx &
3746 3751 CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3747 3752 add_x86_feature(featureset,
3748 3753 X86FSET_AVX512VPOPCDQ);
3749 3754
3750 3755 if (cpi->cpi_std[7].cp_edx &
3751 3756 CPUID_INTC_EDX_7_0_AVX5124NNIW)
3752 3757 add_x86_feature(featureset,
3753 3758 X86FSET_AVX512NNIW);
3754 3759 if (cpi->cpi_std[7].cp_edx &
3755 3760 CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3756 3761 add_x86_feature(featureset,
3757 3762 X86FSET_AVX512FMAPS);
3758 3763 }
3759 3764 }
3760 3765 }
3761 3766
3762 3767 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3763 3768 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3764 3769 add_x86_feature(featureset, X86FSET_PCID);
3765 3770 }
3766 3771 }
3767 3772
3768 3773 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3769 3774 add_x86_feature(featureset, X86FSET_X2APIC);
3770 3775 }
3771 3776 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3772 3777 add_x86_feature(featureset, X86FSET_DE);
3773 3778 }
3774 3779 #if !defined(__xpv)
3775 3780 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3776 3781
3777 3782 /*
3778 3783 * We require the CLFLUSH instruction for erratum workaround
3779 3784 * to use MONITOR/MWAIT.
3780 3785 */
3781 3786 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3782 3787 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3783 3788 add_x86_feature(featureset, X86FSET_MWAIT);
3784 3789 } else {
3785 3790 extern int idle_cpu_assert_cflush_monitor;
3786 3791
3787 3792 /*
3788 3793 * All processors we are aware of which have
3789 3794 * MONITOR/MWAIT also have CLFLUSH.
3790 3795 */
3791 3796 if (idle_cpu_assert_cflush_monitor) {
3792 3797 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3793 3798 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3794 3799 }
3795 3800 }
3796 3801 }
3797 3802 #endif /* __xpv */
3798 3803
3799 3804 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3800 3805 add_x86_feature(featureset, X86FSET_VMX);
3801 3806 }
3802 3807
3803 3808 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3804 3809 add_x86_feature(featureset, X86FSET_RDRAND);
3805 3810
3806 3811 /*
3807 3812  * Only needed the first time; the rest of the CPUs follow suit.
3808 3813  * We only capture this for the boot CPU.
3809 3814 */
3810 3815 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3811 3816 add_x86_feature(featureset, X86FSET_CLFSH);
3812 3817 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3813 3818 }
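        /*
         * For example, a CLFLUSH line-size field of 8 in %ebx bits 15:8
         * corresponds to 8 * 8 = 64 bytes, the common case.
         */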
3814 3819 if (is_x86_feature(featureset, X86FSET_PAE))
3815 3820 cpi->cpi_pabits = 36;
3816 3821
3817 3822 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3818 3823 struct cpuid_regs r, *ecp;
3819 3824
3820 3825 ecp = &r;
3821 3826 ecp->cp_eax = 0xD;
3822 3827 ecp->cp_ecx = 1;
3823 3828 ecp->cp_edx = ecp->cp_ebx = 0;
3824 3829 (void) __cpuid_insn(ecp);
3825 3830
3826 3831 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3827 3832 add_x86_feature(featureset, X86FSET_XSAVEOPT);
3828 3833 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3829 3834 add_x86_feature(featureset, X86FSET_XSAVEC);
3830 3835 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3831 3836 add_x86_feature(featureset, X86FSET_XSAVES);
3832 3837 }
3833 3838
3834 3839 /*
3835 3840 * Work on the "extended" feature information, doing
3836 3841 * some basic initialization for cpuid_pass2()
3837 3842 */
3838 3843 xcpuid = 0;
3839 3844 switch (cpi->cpi_vendor) {
3840 3845 case X86_VENDOR_Intel:
3841 3846 /*
3842 3847 * On KVM we know we will have proper support for extended
3843 3848 * cpuid.
3844 3849 */
3845 3850 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3846 3851 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3847 3852 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3848 3853 xcpuid++;
3849 3854 break;
3850 3855 case X86_VENDOR_AMD:
3851 3856 if (cpi->cpi_family > 5 ||
3852 3857 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3853 3858 xcpuid++;
3854 3859 break;
3855 3860 case X86_VENDOR_Cyrix:
3856 3861 /*
3857 3862 * Only these Cyrix CPUs are -known- to support
3858 3863 * extended cpuid operations.
3859 3864 */
3860 3865 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3861 3866 x86_type == X86_TYPE_CYRIX_GXm)
3862 3867 xcpuid++;
3863 3868 break;
3864 3869 case X86_VENDOR_Centaur:
3865 3870 case X86_VENDOR_TM:
3866 3871 default:
3867 3872 xcpuid++;
3868 3873 break;
3869 3874 }
3870 3875
3871 3876 if (xcpuid) {
3872 3877 cp = &cpi->cpi_extd[0];
3873 3878 cp->cp_eax = CPUID_LEAF_EXT_0;
3874 3879 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3875 3880 }
3876 3881
3877 3882 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3878 3883
3879 3884 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3880 3885 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3881 3886
3882 3887 switch (cpi->cpi_vendor) {
3883 3888 case X86_VENDOR_Intel:
3884 3889 case X86_VENDOR_AMD:
3885 3890 if (cpi->cpi_xmaxeax < 0x80000001)
3886 3891 break;
3887 3892 cp = &cpi->cpi_extd[1];
3888 3893 cp->cp_eax = 0x80000001;
3889 3894 (void) __cpuid_insn(cp);
3890 3895
3891 3896 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3892 3897 cpi->cpi_family == 5 &&
3893 3898 cpi->cpi_model == 6 &&
3894 3899 cpi->cpi_step == 6) {
3895 3900 /*
3896 3901  * K6 model 6 uses bit 10 to indicate SYSC.
3897 3902 * Later models use bit 11. Fix it here.
3898 3903 */
3899 3904 if (cp->cp_edx & 0x400) {
3900 3905 cp->cp_edx &= ~0x400;
3901 3906 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3902 3907 }
3903 3908 }
3904 3909
3905 3910 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3906 3911
3907 3912 /*
3908 3913 * Compute the additions to the kernel's feature word.
3909 3914 */
3910 3915 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3911 3916 add_x86_feature(featureset, X86FSET_NX);
3912 3917 }
3913 3918
3914 3919 /*
3915 3920  * Regardless of whether or not we boot 64-bit,
3916 3921 * we should have a way to identify whether
3917 3922 * the CPU is capable of running 64-bit.
3918 3923 */
3919 3924 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3920 3925 add_x86_feature(featureset, X86FSET_64);
3921 3926 }
3922 3927
3923 3928 /* 1 GB large page - enable only for 64 bit kernel */
3924 3929 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3925 3930 add_x86_feature(featureset, X86FSET_1GPG);
3926 3931 }
3927 3932
3928 3933 if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3929 3934 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3930 3935 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3931 3936 add_x86_feature(featureset, X86FSET_SSE4A);
3932 3937 }
3933 3938
3934 3939 /*
3935 3940 * It's really tricky to support syscall/sysret in
3936 3941 * the i386 kernel; we rely on sysenter/sysexit
3937 3942 * instead. In the amd64 kernel, things are -way-
3938 3943 * better.
3939 3944 */
3940 3945 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3941 3946 add_x86_feature(featureset, X86FSET_ASYSC);
3942 3947 }
3943 3948
3944 3949 /*
3945 3950 * While we're thinking about system calls, note
3946 3951 * that AMD processors don't support sysenter
3947 3952 * in long mode at all, so don't try to program them.
3948 3953 */
3949 3954 if (x86_vendor == X86_VENDOR_AMD) {
3950 3955 remove_x86_feature(featureset, X86FSET_SEP);
3951 3956 }
3952 3957
3953 3958 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3954 3959 add_x86_feature(featureset, X86FSET_TSCP);
3955 3960 }
3956 3961
3957 3962 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3958 3963 add_x86_feature(featureset, X86FSET_SVM);
3959 3964 }
3960 3965
3961 3966 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3962 3967 add_x86_feature(featureset, X86FSET_TOPOEXT);
3963 3968 }
3964 3969
3965 3970 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3966 3971 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3967 3972 }
3968 3973
3969 3974 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3970 3975 add_x86_feature(featureset, X86FSET_XOP);
3971 3976 }
3972 3977
3973 3978 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3974 3979 add_x86_feature(featureset, X86FSET_FMA4);
3975 3980 }
3976 3981
3977 3982 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3978 3983 add_x86_feature(featureset, X86FSET_TBM);
3979 3984 }
3980 3985
3981 3986 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3982 3987 add_x86_feature(featureset, X86FSET_MONITORX);
3983 3988 }
3984 3989 break;
3985 3990 default:
3986 3991 break;
3987 3992 }
3988 3993
3989 3994 /*
3990 3995 * Get CPUID data about processor cores and hyperthreads.
3991 3996 */
3992 3997 switch (cpi->cpi_vendor) {
3993 3998 case X86_VENDOR_Intel:
3994 3999 if (cpi->cpi_maxeax >= 4) {
3995 4000 cp = &cpi->cpi_std[4];
3996 4001 cp->cp_eax = 4;
3997 4002 cp->cp_ecx = 0;
3998 4003 (void) __cpuid_insn(cp);
3999 4004 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4000 4005 }
4001 4006 /*FALLTHROUGH*/
4002 4007 case X86_VENDOR_AMD:
4003 4008 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4004 4009 break;
4005 4010 cp = &cpi->cpi_extd[8];
4006 4011 cp->cp_eax = CPUID_LEAF_EXT_8;
4007 4012 (void) __cpuid_insn(cp);
4008 4013 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4009 4014 cp);
4010 4015
4011 4016 /*
4012 4017 * AMD uses ebx for some extended functions.
4013 4018 */
4014 4019 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4015 4020 /*
4016 4021 * While we're here, check for the AMD "Error
4017 4022 * Pointer Zero/Restore" feature. This can be
4018 4023 * used to set up the FP save handlers
4019 4024 * appropriately.
4020 4025 */
4021 4026 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4022 4027 cpi->cpi_fp_amd_save = 0;
4023 4028 } else {
4024 4029 cpi->cpi_fp_amd_save = 1;
4025 4030 }
4026 4031
4027 4032 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4028 4033 add_x86_feature(featureset,
4029 4034 X86FSET_CLZERO);
4030 4035 }
4031 4036 }
4032 4037
4033 4038 /*
4034 4039 * Virtual and physical address limits from
4035 4040 * cpuid override previously guessed values.
4036 4041 */
4037 4042 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4038 4043 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
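/*
 * Editorial note (illustrative, with a hypothetical register value): for
 * example, an %eax of 0x3030 from leaf 0x80000008 would decode to 48
 * physical address bits (bits 7:0) and 48 virtual address bits (bits 15:8).
 */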
4039 4044 break;
4040 4045 default:
4041 4046 break;
4042 4047 }
4043 4048
4044 4049 /*
4045 4050 * Get CPUID data about TSC Invariance in Deep C-State.
4046 4051 */
4047 4052 switch (cpi->cpi_vendor) {
4048 4053 case X86_VENDOR_Intel:
4049 4054 case X86_VENDOR_AMD:
4050 4055 if (cpi->cpi_maxeax >= 7) {
4051 4056 cp = &cpi->cpi_extd[7];
4052 4057 cp->cp_eax = 0x80000007;
4053 4058 cp->cp_ecx = 0;
4054 4059 (void) __cpuid_insn(cp);
4055 4060 }
4056 4061 break;
4057 4062 default:
4058 4063 break;
4059 4064 }
4060 4065 }
4061 4066
4062 4067 cpuid_pass1_topology(cpu, featureset);
4063 4068 cpuid_pass1_thermal(cpu, featureset);
4064 4069
4065 4070 /*
4066 4071 * Synthesize chip "revision" and socket type
4067 4072 */
4068 4073 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4069 4074 cpi->cpi_model, cpi->cpi_step);
4070 4075 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4071 4076 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4072 4077 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4073 4078 cpi->cpi_model, cpi->cpi_step);
4074 4079
4075 4080 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4076 4081 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4077 4082 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4078 4083 /* Special handling for AMD FP not necessary. */
4079 4084 cpi->cpi_fp_amd_save = 0;
4080 4085 } else {
4081 4086 cpi->cpi_fp_amd_save = 1;
4082 4087 }
4083 4088 }
4084 4089
4085 4090 /*
4086 4091 * Check the processor leaves that are used for security features.
4087 4092 */
4088 4093 cpuid_scan_security(cpu, featureset);
4089 4094
4090 4095 pass1_done:
4091 4096 cpi->cpi_pass = 1;
4092 4097 }
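/*
 * Editorial sketch (not part of the original file): a minimal, standalone
 * illustration of the extended-leaf probe that cpuid_pass1() performs above.
 * It reuses struct cpuid_regs, __cpuid_insn() and CPUID_LEAF_EXT_0 exactly as
 * this file does; the helper name itself is hypothetical.
 */
static boolean_t
cpuid_has_ext_leaf_sketch(uint32_t leaf)
{
	struct cpuid_regs regs;

	regs.cp_eax = CPUID_LEAF_EXT_0;		/* 0x80000000 */
	regs.cp_ebx = regs.cp_ecx = regs.cp_edx = 0;
	(void) __cpuid_insn(&regs);

	/*
	 * %eax now holds the highest supported extended leaf; a value
	 * without bit 31 set means the processor has no extended leaves.
	 */
	if ((regs.cp_eax & CPUID_LEAF_EXT_0) == 0)
		return (B_FALSE);
	return (regs.cp_eax >= leaf);
}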
4093 4098
4094 4099 /*
4095 4100 * Make copies of the cpuid table entries we depend on, in
4096 4101 * part for ease of parsing now, in part so that we have only
4097 4102 * one place to correct any of it, in part for ease of
4098 4103 * later export to userland, and in part so we can look at
4099 4104 * this stuff in a crash dump.
4100 4105 */
4101 4106
4102 4107 /*ARGSUSED*/
4103 4108 void
4104 4109 cpuid_pass2(cpu_t *cpu)
4105 4110 {
4106 4111 uint_t n, nmax;
4107 4112 int i;
4108 4113 struct cpuid_regs *cp;
4109 4114 uint8_t *dp;
4110 4115 uint32_t *iptr;
4111 4116 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4112 4117
4113 4118 ASSERT(cpi->cpi_pass == 1);
4114 4119
4115 4120 if (cpi->cpi_maxeax < 1)
4116 4121 goto pass2_done;
4117 4122
4118 4123 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4119 4124 nmax = NMAX_CPI_STD;
4120 4125 /*
4121 4126 * (We already handled n == 0 and n == 1 in pass 1)
4122 4127 */
4123 4128 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4124 4129 /*
4125 4130 * leaves 6 and 7 were handled in pass 1
4126 4131 */
4127 4132 if (n == 6 || n == 7)
4128 4133 continue;
4129 4134
4130 4135 cp->cp_eax = n;
4131 4136
4132 4137 /*
4133 4138 * CPUID function 4 expects %ecx to be initialized
4134 4139 * with an index which indicates which cache to return
4135 4140 * information about. The OS is expected to call function 4
4136 4141 * with %ecx set to 0, 1, 2, ... until it returns with
4137 4142 * EAX[4:0] set to 0, which indicates there are no more
4138 4143 * caches.
4139 4144 *
4140 4145 * Here, populate cpi_std[4] with the information returned by
4141 4146 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
4142 4147 * when dynamic memory allocation becomes available.
4143 4148 *
4144 4149 * Note: we need to explicitly initialize %ecx here, since
4145 4150 * function 4 may have been previously invoked.
4146 4151 */
4147 4152 if (n == 4)
4148 4153 cp->cp_ecx = 0;
4149 4154
4150 4155 (void) __cpuid_insn(cp);
4151 4156 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4152 4157 switch (n) {
4153 4158 case 2:
4154 4159 /*
4155 4160 * "the lower 8 bits of the %eax register
4156 4161 * contain a value that identifies the number
4157 4162 * of times the cpuid [instruction] has to be
4158 4163 * executed to obtain a complete image of the
4159 4164 * processor's caching systems."
4160 4165 *
4161 4166 * How *do* they make this stuff up?
4162 4167 */
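/*
 * Editorial note (illustrative, with a hypothetical register value): if leaf 2
 * returned %eax == 0x665b5001, the low byte 0x01 is the iteration count that
 * gets skipped below, and the remaining descriptor bytes 0x50, 0x5b and 0x66
 * would be copied into cpi_cacheinfo -- but only because bit 31 of the
 * register is clear, which is what marks its bytes as valid.
 */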
4163 4168 cpi->cpi_ncache = sizeof (*cp) *
4164 4169 BITX(cp->cp_eax, 7, 0);
4165 4170 if (cpi->cpi_ncache == 0)
4166 4171 break;
4167 4172 cpi->cpi_ncache--; /* skip count byte */
4168 4173
4169 4174 /*
4170 4175 * Well, for now, rather than attempt to implement
4171 4176 * this slightly dubious algorithm, we just look
4172 4177 * at the first 15 ..
4173 4178 */
4174 4179 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4175 4180 cpi->cpi_ncache = sizeof (*cp) - 1;
4176 4181
4177 4182 dp = cpi->cpi_cacheinfo;
4178 4183 if (BITX(cp->cp_eax, 31, 31) == 0) {
4179 4184 uint8_t *p = (void *)&cp->cp_eax;
4180 4185 for (i = 1; i < 4; i++)
4181 4186 if (p[i] != 0)
4182 4187 *dp++ = p[i];
4183 4188 }
4184 4189 if (BITX(cp->cp_ebx, 31, 31) == 0) {
4185 4190 uint8_t *p = (void *)&cp->cp_ebx;
4186 4191 for (i = 0; i < 4; i++)
4187 4192 if (p[i] != 0)
4188 4193 *dp++ = p[i];
4189 4194 }
4190 4195 if (BITX(cp->cp_ecx, 31, 31) == 0) {
4191 4196 uint8_t *p = (void *)&cp->cp_ecx;
4192 4197 for (i = 0; i < 4; i++)
4193 4198 if (p[i] != 0)
4194 4199 *dp++ = p[i];
4195 4200 }
4196 4201 if (BITX(cp->cp_edx, 31, 31) == 0) {
4197 4202 uint8_t *p = (void *)&cp->cp_edx;
4198 4203 for (i = 0; i < 4; i++)
4199 4204 if (p[i] != 0)
4200 4205 *dp++ = p[i];
4201 4206 }
4202 4207 break;
4203 4208
4204 4209 case 3: /* Processor serial number, if PSN supported */
4205 4210 break;
4206 4211
4207 4212 case 4: /* Deterministic cache parameters */
4208 4213 break;
4209 4214
4210 4215 case 5: /* Monitor/Mwait parameters */
4211 4216 {
4212 4217 size_t mwait_size;
4213 4218
4214 4219 /*
4215 4220 * check cpi_mwait.support which was set in cpuid_pass1
4216 4221 */
4217 4222 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4218 4223 break;
4219 4224
4220 4225 /*
4221 4226 * Protect ourselves from an insane mwait line size.
4222 4227 * Workaround for incomplete hardware emulator(s).
4223 4228 */
4224 4229 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4225 4230 if (mwait_size < sizeof (uint32_t) ||
4226 4231 !ISP2(mwait_size)) {
4227 4232 #if DEBUG
4228 4233 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4229 4234 "size %ld", cpu->cpu_id, (long)mwait_size);
4230 4235 #endif
4231 4236 break;
4232 4237 }
4233 4238
4234 4239 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4235 4240 cpi->cpi_mwait.mon_max = mwait_size;
4236 4241 if (MWAIT_EXTENSION(cpi)) {
4237 4242 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4238 4243 if (MWAIT_INT_ENABLE(cpi))
4239 4244 cpi->cpi_mwait.support |=
4240 4245 MWAIT_ECX_INT_ENABLE;
4241 4246 }
4242 4247 break;
4243 4248 }
4244 4249 default:
4245 4250 break;
4246 4251 }
4247 4252 }
4248 4253
4249 4254 /*
4250 4255 * XSAVE enumeration
4251 4256 */
4252 4257 if (cpi->cpi_maxeax >= 0xD) {
4253 4258 struct cpuid_regs regs;
4254 4259 boolean_t cpuid_d_valid = B_TRUE;
4255 4260
4256 4261 cp = &regs;
4257 4262 cp->cp_eax = 0xD;
4258 4263 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4259 4264
4260 4265 (void) __cpuid_insn(cp);
4261 4266
4262 4267 /*
4263 4268 * Sanity checks for debug
4264 4269 */
4265 4270 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4266 4271 (cp->cp_eax & XFEATURE_SSE) == 0) {
4267 4272 cpuid_d_valid = B_FALSE;
4268 4273 }
4269 4274
4270 4275 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4271 4276 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4272 4277 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4273 4278
4274 4279 /*
4275 4280 * If the hw supports AVX, get the size and offset in the save
4276 4281 * area for the ymm state.
4277 4282 */
4278 4283 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4279 4284 cp->cp_eax = 0xD;
4280 4285 cp->cp_ecx = 2;
4281 4286 cp->cp_edx = cp->cp_ebx = 0;
4282 4287
4283 4288 (void) __cpuid_insn(cp);
4284 4289
4285 4290 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4286 4291 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4287 4292 cpuid_d_valid = B_FALSE;
4288 4293 }
4289 4294
4290 4295 cpi->cpi_xsave.ymm_size = cp->cp_eax;
4291 4296 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4292 4297 }
4293 4298
4294 4299 /*
4295 4300 * If the hw supports MPX, get the size and offset in the
4296 4301 * save area for BNDREGS and BNDCSR.
4297 4302 */
4298 4303 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4299 4304 cp->cp_eax = 0xD;
4300 4305 cp->cp_ecx = 3;
4301 4306 cp->cp_edx = cp->cp_ebx = 0;
4302 4307
4303 4308 (void) __cpuid_insn(cp);
4304 4309
4305 4310 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4306 4311 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4307 4312
4308 4313 cp->cp_eax = 0xD;
4309 4314 cp->cp_ecx = 4;
4310 4315 cp->cp_edx = cp->cp_ebx = 0;
4311 4316
4312 4317 (void) __cpuid_insn(cp);
4313 4318
4314 4319 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4315 4320 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4316 4321 }
4317 4322
4318 4323 /*
4319 4324 * If the hw supports AVX512, get the size and offset in the
4320 4325 * save area for the opmask registers and zmm state.
4321 4326 */
4322 4327 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4323 4328 cp->cp_eax = 0xD;
4324 4329 cp->cp_ecx = 5;
4325 4330 cp->cp_edx = cp->cp_ebx = 0;
4326 4331
4327 4332 (void) __cpuid_insn(cp);
4328 4333
4329 4334 cpi->cpi_xsave.opmask_size = cp->cp_eax;
4330 4335 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4331 4336
4332 4337 cp->cp_eax = 0xD;
4333 4338 cp->cp_ecx = 6;
4334 4339 cp->cp_edx = cp->cp_ebx = 0;
4335 4340
4336 4341 (void) __cpuid_insn(cp);
4337 4342
4338 4343 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4339 4344 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4340 4345
4341 4346 cp->cp_eax = 0xD;
4342 4347 cp->cp_ecx = 7;
4343 4348 cp->cp_edx = cp->cp_ebx = 0;
4344 4349
4345 4350 (void) __cpuid_insn(cp);
4346 4351
4347 4352 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4348 4353 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4349 4354 }
4350 4355
4351 4356 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4352 4357 xsave_state_size = 0;
4353 4358 } else if (cpuid_d_valid) {
4354 4359 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4355 4360 } else {
4356 4361 /* Broken CPUID 0xD, probably in HVM */
4357 4362 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4358 4363 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4359 4364 ", ymm_size = %d, ymm_offset = %d\n",
4360 4365 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4361 4366 cpi->cpi_xsave.xsav_hw_features_high,
4362 4367 (int)cpi->cpi_xsave.xsav_max_size,
4363 4368 (int)cpi->cpi_xsave.ymm_size,
4364 4369 (int)cpi->cpi_xsave.ymm_offset);
4365 4370
4366 4371 if (xsave_state_size != 0) {
4367 4372 /*
4368 4373 * This must be a non-boot CPU. We cannot
4369 4374 * continue, because boot cpu has already
4370 4375 * enabled XSAVE.
4371 4376 */
4372 4377 ASSERT(cpu->cpu_id != 0);
4373 4378 cmn_err(CE_PANIC, "cpu%d: we have already "
4374 4379 "enabled XSAVE on boot cpu, cannot "
4375 4380 "continue.", cpu->cpu_id);
4376 4381 } else {
4377 4382 /*
4378 4383 * If we reached here on the boot CPU, it's also
4379 4384 * almost certain that we'll reach here on the
4380 4385 * non-boot CPUs. When we're here on a boot CPU
4381 4386 * we should disable the feature; on a non-boot
4382 4387 * CPU we need to confirm that we have.
4383 4388 */
4384 4389 if (cpu->cpu_id == 0) {
4385 4390 remove_x86_feature(x86_featureset,
4386 4391 X86FSET_XSAVE);
4387 4392 remove_x86_feature(x86_featureset,
4388 4393 X86FSET_AVX);
4389 4394 remove_x86_feature(x86_featureset,
4390 4395 X86FSET_F16C);
4391 4396 remove_x86_feature(x86_featureset,
4392 4397 X86FSET_BMI1);
4393 4398 remove_x86_feature(x86_featureset,
4394 4399 X86FSET_BMI2);
4395 4400 remove_x86_feature(x86_featureset,
4396 4401 X86FSET_FMA);
4397 4402 remove_x86_feature(x86_featureset,
4398 4403 X86FSET_AVX2);
4399 4404 remove_x86_feature(x86_featureset,
4400 4405 X86FSET_MPX);
4401 4406 remove_x86_feature(x86_featureset,
4402 4407 X86FSET_AVX512F);
4403 4408 remove_x86_feature(x86_featureset,
4404 4409 X86FSET_AVX512DQ);
4405 4410 remove_x86_feature(x86_featureset,
4406 4411 X86FSET_AVX512PF);
4407 4412 remove_x86_feature(x86_featureset,
4408 4413 X86FSET_AVX512ER);
4409 4414 remove_x86_feature(x86_featureset,
4410 4415 X86FSET_AVX512CD);
4411 4416 remove_x86_feature(x86_featureset,
4412 4417 X86FSET_AVX512BW);
4413 4418 remove_x86_feature(x86_featureset,
4414 4419 X86FSET_AVX512VL);
4415 4420 remove_x86_feature(x86_featureset,
4416 4421 X86FSET_AVX512FMA);
4417 4422 remove_x86_feature(x86_featureset,
4418 4423 X86FSET_AVX512VBMI);
4419 4424 remove_x86_feature(x86_featureset,
4420 4425 X86FSET_AVX512VNNI);
4421 4426 remove_x86_feature(x86_featureset,
4422 4427 X86FSET_AVX512VPOPCDQ);
4423 4428 remove_x86_feature(x86_featureset,
4424 4429 X86FSET_AVX512NNIW);
4425 4430 remove_x86_feature(x86_featureset,
4426 4431 X86FSET_AVX512FMAPS);
4427 4432
4428 4433 CPI_FEATURES_ECX(cpi) &=
4429 4434 ~CPUID_INTC_ECX_XSAVE;
4430 4435 CPI_FEATURES_ECX(cpi) &=
4431 4436 ~CPUID_INTC_ECX_AVX;
4432 4437 CPI_FEATURES_ECX(cpi) &=
4433 4438 ~CPUID_INTC_ECX_F16C;
4434 4439 CPI_FEATURES_ECX(cpi) &=
4435 4440 ~CPUID_INTC_ECX_FMA;
4436 4441 CPI_FEATURES_7_0_EBX(cpi) &=
4437 4442 ~CPUID_INTC_EBX_7_0_BMI1;
4438 4443 CPI_FEATURES_7_0_EBX(cpi) &=
4439 4444 ~CPUID_INTC_EBX_7_0_BMI2;
4440 4445 CPI_FEATURES_7_0_EBX(cpi) &=
4441 4446 ~CPUID_INTC_EBX_7_0_AVX2;
4442 4447 CPI_FEATURES_7_0_EBX(cpi) &=
4443 4448 ~CPUID_INTC_EBX_7_0_MPX;
4444 4449 CPI_FEATURES_7_0_EBX(cpi) &=
4445 4450 ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4446 4451
4447 4452 CPI_FEATURES_7_0_ECX(cpi) &=
4448 4453 ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4449 4454
4450 4455 CPI_FEATURES_7_0_EDX(cpi) &=
4451 4456 ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4452 4457
4453 4458 xsave_force_disable = B_TRUE;
4454 4459 } else {
4455 4460 VERIFY(is_x86_feature(x86_featureset,
4456 4461 X86FSET_XSAVE) == B_FALSE);
4457 4462 }
4458 4463 }
4459 4464 }
4460 4465 }
4461 4466
4462 4467
4463 4468 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4464 4469 goto pass2_done;
4465 4470
4466 4471 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4467 4472 nmax = NMAX_CPI_EXTD;
4468 4473 /*
4469 4474 * Copy the extended properties, fixing them as we go.
4470 4475 * (We already handled n == 0 and n == 1 in pass 1)
4471 4476 */
4472 4477 iptr = (void *)cpi->cpi_brandstr;
4473 4478 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4474 4479 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4475 4480 (void) __cpuid_insn(cp);
4476 4481 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4477 4482 cp);
4478 4483 switch (n) {
4479 4484 case 2:
4480 4485 case 3:
4481 4486 case 4:
4482 4487 /*
4483 4488 * Extract the brand string
4484 4489 */
4485 4490 *iptr++ = cp->cp_eax;
4486 4491 *iptr++ = cp->cp_ebx;
4487 4492 *iptr++ = cp->cp_ecx;
4488 4493 *iptr++ = cp->cp_edx;
4489 4494 break;
4490 4495 case 5:
4491 4496 switch (cpi->cpi_vendor) {
4492 4497 case X86_VENDOR_AMD:
4493 4498 /*
4494 4499 * The Athlon and Duron were the first
4495 4500 * parts to report the sizes of the
4496 4501 * TLB for large pages. Before then,
4497 4502 * we don't trust the data.
4498 4503 */
4499 4504 if (cpi->cpi_family < 6 ||
4500 4505 (cpi->cpi_family == 6 &&
4501 4506 cpi->cpi_model < 1))
4502 4507 cp->cp_eax = 0;
4503 4508 break;
4504 4509 default:
4505 4510 break;
4506 4511 }
4507 4512 break;
4508 4513 case 6:
4509 4514 switch (cpi->cpi_vendor) {
4510 4515 case X86_VENDOR_AMD:
4511 4516 /*
4512 4517 * The Athlon and Duron were the first
4513 4518 * AMD parts with L2 TLB's.
4514 4519 * Before then, don't trust the data.
4515 4520 */
4516 4521 if (cpi->cpi_family < 6 ||
4517 4522 (cpi->cpi_family == 6 &&
4518 4523 cpi->cpi_model < 1))
4519 4524 cp->cp_eax = cp->cp_ebx = 0;
4520 4525 /*
4521 4526 * AMD Duron rev A0 reports L2
4522 4527 * cache size incorrectly as 1K
4523 4528 * when it is really 64K
4524 4529 */
4525 4530 if (cpi->cpi_family == 6 &&
4526 4531 cpi->cpi_model == 3 &&
4527 4532 cpi->cpi_step == 0) {
4528 4533 cp->cp_ecx &= 0xffff;
4529 4534 cp->cp_ecx |= 0x400000;
4530 4535 }
4531 4536 break;
4532 4537 case X86_VENDOR_Cyrix: /* VIA C3 */
4533 4538 /*
4534 4539 * VIA C3 processors are a bit messed
4535 4540 * up w.r.t. encoding cache sizes in %ecx
4536 4541 */
4537 4542 if (cpi->cpi_family != 6)
4538 4543 break;
4539 4544 /*
4540 4545 * model 7 and 8 were incorrectly encoded
4541 4546 *
4542 4547 * xxx is model 8 really broken?
4543 4548 */
4544 4549 if (cpi->cpi_model == 7 ||
4545 4550 cpi->cpi_model == 8)
4546 4551 cp->cp_ecx =
4547 4552 BITX(cp->cp_ecx, 31, 24) << 16 |
4548 4553 BITX(cp->cp_ecx, 23, 16) << 12 |
4549 4554 BITX(cp->cp_ecx, 15, 8) << 8 |
4550 4555 BITX(cp->cp_ecx, 7, 0);
4551 4556 /*
4552 4557 * model 9 stepping 1 has wrong associativity
4553 4558 */
4554 4559 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4555 4560 cp->cp_ecx |= 8 << 12;
4556 4561 break;
4557 4562 case X86_VENDOR_Intel:
4558 4563 /*
4559 4564 * Extended L2 Cache features function.
4560 4565 * First appeared on Prescott.
4561 4566 */
4562 4567 default:
4563 4568 break;
4564 4569 }
4565 4570 break;
4566 4571 default:
4567 4572 break;
4568 4573 }
4569 4574 }
4570 4575
4571 4576 pass2_done:
4572 4577 cpi->cpi_pass = 2;
4573 4578 }
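/*
 * Editorial sketch (not part of the original file): the leaf 0xD sub-leaf
 * pattern cpuid_pass2() uses above for the ymm (%ecx == 2), MPX (%ecx == 3, 4)
 * and AVX-512 (%ecx == 5, 6, 7) components, pulled out in isolation. For a
 * given state-component index in %ecx, %eax reports the component's size and
 * %ebx its offset within the XSAVE area. The helper name and parameters are
 * hypothetical; struct cpuid_regs and __cpuid_insn() are the ones this file
 * already uses.
 */
static void
cpuid_xsave_component_sketch(uint32_t comp, uint32_t *sizep, uint32_t *offp)
{
	struct cpuid_regs regs;

	regs.cp_eax = 0xD;
	regs.cp_ecx = comp;
	regs.cp_ebx = regs.cp_edx = 0;
	(void) __cpuid_insn(&regs);

	*sizep = regs.cp_eax;	/* size in bytes of this component's state */
	*offp = regs.cp_ebx;	/* offset of the component in the XSAVE area */
}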
4574 4579
4575 4580 static const char *
4576 4581 intel_cpubrand(const struct cpuid_info *cpi)
4577 4582 {
4578 4583 int i;
4579 4584
4580 4585 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4581 4586 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4582 4587 return ("i486");
4583 4588
4584 4589 switch (cpi->cpi_family) {
4585 4590 case 5:
4586 4591 return ("Intel Pentium(r)");
4587 4592 case 6:
4588 4593 switch (cpi->cpi_model) {
4589 4594 uint_t celeron, xeon;
4590 4595 const struct cpuid_regs *cp;
4591 4596 case 0:
4592 4597 case 1:
4593 4598 case 2:
4594 4599 return ("Intel Pentium(r) Pro");
4595 4600 case 3:
4596 4601 case 4:
4597 4602 return ("Intel Pentium(r) II");
4598 4603 case 6:
4599 4604 return ("Intel Celeron(r)");
4600 4605 case 5:
4601 4606 case 7:
4602 4607 celeron = xeon = 0;
4603 4608 cp = &cpi->cpi_std[2]; /* cache info */
4604 4609
4605 4610 for (i = 1; i < 4; i++) {
4606 4611 uint_t tmp;
4607 4612
4608 4613 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4609 4614 if (tmp == 0x40)
4610 4615 celeron++;
4611 4616 if (tmp >= 0x44 && tmp <= 0x45)
4612 4617 xeon++;
4613 4618 }
4614 4619
4615 4620 for (i = 0; i < 2; i++) {
4616 4621 uint_t tmp;
4617 4622
4618 4623 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4619 4624 if (tmp == 0x40)
4620 4625 celeron++;
4621 4626 else if (tmp >= 0x44 && tmp <= 0x45)
4622 4627 xeon++;
4623 4628 }
4624 4629
4625 4630 for (i = 0; i < 4; i++) {
4626 4631 uint_t tmp;
4627 4632
4628 4633 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4629 4634 if (tmp == 0x40)
4630 4635 celeron++;
4631 4636 else if (tmp >= 0x44 && tmp <= 0x45)
4632 4637 xeon++;
4633 4638 }
4634 4639
4635 4640 for (i = 0; i < 4; i++) {
4636 4641 uint_t tmp;
4637 4642
4638 4643 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4639 4644 if (tmp == 0x40)
4640 4645 celeron++;
4641 4646 else if (tmp >= 0x44 && tmp <= 0x45)
4642 4647 xeon++;
4643 4648 }
4644 4649
4645 4650 if (celeron)
4646 4651 return ("Intel Celeron(r)");
4647 4652 if (xeon)
4648 4653 return (cpi->cpi_model == 5 ?
4649 4654 "Intel Pentium(r) II Xeon(tm)" :
4650 4655 "Intel Pentium(r) III Xeon(tm)");
4651 4656 return (cpi->cpi_model == 5 ?
4652 4657 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4653 4658 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4654 4659 default:
4655 4660 break;
4656 4661 }
4657 4662 default:
4658 4663 break;
4659 4664 }
4660 4665
4661 4666 /* BrandID is present if the field is nonzero */
4662 4667 if (cpi->cpi_brandid != 0) {
4663 4668 static const struct {
4664 4669 uint_t bt_bid;
4665 4670 const char *bt_str;
4666 4671 } brand_tbl[] = {
4667 4672 { 0x1, "Intel(r) Celeron(r)" },
4668 4673 { 0x2, "Intel(r) Pentium(r) III" },
4669 4674 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
4670 4675 { 0x4, "Intel(r) Pentium(r) III" },
4671 4676 { 0x6, "Mobile Intel(r) Pentium(r) III" },
4672 4677 { 0x7, "Mobile Intel(r) Celeron(r)" },
4673 4678 { 0x8, "Intel(r) Pentium(r) 4" },
4674 4679 { 0x9, "Intel(r) Pentium(r) 4" },
4675 4680 { 0xa, "Intel(r) Celeron(r)" },
4676 4681 { 0xb, "Intel(r) Xeon(tm)" },
4677 4682 { 0xc, "Intel(r) Xeon(tm) MP" },
4678 4683 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
4679 4684 { 0xf, "Mobile Intel(r) Celeron(r)" },
4680 4685 { 0x11, "Mobile Genuine Intel(r)" },
4681 4686 { 0x12, "Intel(r) Celeron(r) M" },
4682 4687 { 0x13, "Mobile Intel(r) Celeron(r)" },
4683 4688 { 0x14, "Intel(r) Celeron(r)" },
4684 4689 { 0x15, "Mobile Genuine Intel(r)" },
4685 4690 { 0x16, "Intel(r) Pentium(r) M" },
4686 4691 { 0x17, "Mobile Intel(r) Celeron(r)" }
4687 4692 };
4688 4693 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4689 4694 uint_t sgn;
4690 4695
4691 4696 sgn = (cpi->cpi_family << 8) |
4692 4697 (cpi->cpi_model << 4) | cpi->cpi_step;
4693 4698
4694 4699 for (i = 0; i < btblmax; i++)
4695 4700 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4696 4701 break;
4697 4702 if (i < btblmax) {
4698 4703 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4699 4704 return ("Intel(r) Celeron(r)");
4700 4705 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4701 4706 return ("Intel(r) Xeon(tm) MP");
4702 4707 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4703 4708 return ("Intel(r) Xeon(tm)");
4704 4709 return (brand_tbl[i].bt_str);
4705 4710 }
4706 4711 }
4707 4712
4708 4713 return (NULL);
4709 4714 }
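/*
 * Editorial sketch (not part of the original file): the signature used for
 * the BrandID special cases in intel_cpubrand() above packs family, model and
 * stepping as (family << 8) | (model << 4) | step. For example, family 6,
 * model 0xb, stepping 1 yields 0x6b1, the value tested for the Celeron
 * override. The helper name is hypothetical.
 */
static uint_t
cpuid_brand_sgn_sketch(uint_t family, uint_t model, uint_t step)
{
	return ((family << 8) | (model << 4) | step);
}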
4710 4715
4711 4716 static const char *
4712 4717 amd_cpubrand(const struct cpuid_info *cpi)
4713 4718 {
4714 4719 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4715 4720 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4716 4721 return ("i486 compatible");
4717 4722
4718 4723 switch (cpi->cpi_family) {
4719 4724 case 5:
4720 4725 switch (cpi->cpi_model) {
4721 4726 case 0:
4722 4727 case 1:
4723 4728 case 2:
4724 4729 case 3:
4725 4730 case 4:
4726 4731 case 5:
4727 4732 return ("AMD-K5(r)");
4728 4733 case 6:
4729 4734 case 7:
4730 4735 return ("AMD-K6(r)");
4731 4736 case 8:
4732 4737 return ("AMD-K6(r)-2");
4733 4738 case 9:
4734 4739 return ("AMD-K6(r)-III");
4735 4740 default:
4736 4741 return ("AMD (family 5)");
4737 4742 }
4738 4743 case 6:
4739 4744 switch (cpi->cpi_model) {
4740 4745 case 1:
4741 4746 return ("AMD-K7(tm)");
4742 4747 case 0:
4743 4748 case 2:
4744 4749 case 4:
4745 4750 return ("AMD Athlon(tm)");
4746 4751 case 3:
4747 4752 case 7:
4748 4753 return ("AMD Duron(tm)");
4749 4754 case 6:
4750 4755 case 8:
4751 4756 case 10:
4752 4757 /*
4753 4758 * Use the L2 cache size to distinguish
4754 4759 */
4755 4760 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4756 4761 "AMD Athlon(tm)" : "AMD Duron(tm)");
4757 4762 default:
4758 4763 return ("AMD (family 6)");
4759 4764 }
4760 4765 default:
4761 4766 break;
4762 4767 }
4763 4768
4764 4769 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4765 4770 cpi->cpi_brandid != 0) {
4766 4771 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4767 4772 case 3:
4768 4773 return ("AMD Opteron(tm) UP 1xx");
4769 4774 case 4:
4770 4775 return ("AMD Opteron(tm) DP 2xx");
4771 4776 case 5:
4772 4777 return ("AMD Opteron(tm) MP 8xx");
4773 4778 default:
4774 4779 return ("AMD Opteron(tm)");
4775 4780 }
4776 4781 }
4777 4782
4778 4783 return (NULL);
4779 4784 }
4780 4785
4781 4786 static const char *
4782 4787 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4783 4788 {
4784 4789 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4785 4790 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4786 4791 type == X86_TYPE_CYRIX_486)
4787 4792 return ("i486 compatible");
4788 4793
4789 4794 switch (type) {
4790 4795 case X86_TYPE_CYRIX_6x86:
4791 4796 return ("Cyrix 6x86");
4792 4797 case X86_TYPE_CYRIX_6x86L:
4793 4798 return ("Cyrix 6x86L");
4794 4799 case X86_TYPE_CYRIX_6x86MX:
4795 4800 return ("Cyrix 6x86MX");
4796 4801 case X86_TYPE_CYRIX_GXm:
4797 4802 return ("Cyrix GXm");
4798 4803 case X86_TYPE_CYRIX_MediaGX:
4799 4804 return ("Cyrix MediaGX");
4800 4805 case X86_TYPE_CYRIX_MII:
4801 4806 return ("Cyrix M2");
4802 4807 case X86_TYPE_VIA_CYRIX_III:
4803 4808 return ("VIA Cyrix M3");
4804 4809 default:
4805 4810 /*
4806 4811 * Have another wild guess ..
4807 4812 */
4808 4813 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4809 4814 return ("Cyrix 5x86");
4810 4815 else if (cpi->cpi_family == 5) {
4811 4816 switch (cpi->cpi_model) {
4812 4817 case 2:
4813 4818 return ("Cyrix 6x86"); /* Cyrix M1 */
4814 4819 case 4:
4815 4820 return ("Cyrix MediaGX");
4816 4821 default:
4817 4822 break;
4818 4823 }
4819 4824 } else if (cpi->cpi_family == 6) {
4820 4825 switch (cpi->cpi_model) {
4821 4826 case 0:
4822 4827 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4823 4828 case 5:
4824 4829 case 6:
4825 4830 case 7:
4826 4831 case 8:
4827 4832 case 9:
4828 4833 return ("VIA C3");
4829 4834 default:
4830 4835 break;
4831 4836 }
4832 4837 }
4833 4838 break;
4834 4839 }
4835 4840 return (NULL);
4836 4841 }
4837 4842
4838 4843 /*
4839 4844 * This only gets called in the case that the CPU extended
4840 4845 * feature brand string (0x80000002, 0x80000003, 0x80000004)
4841 4846 * aren't available, or contain null bytes for some reason.
4842 4847 */
4843 4848 static void
4844 4849 fabricate_brandstr(struct cpuid_info *cpi)
4845 4850 {
4846 4851 const char *brand = NULL;
4847 4852
4848 4853 switch (cpi->cpi_vendor) {
4849 4854 case X86_VENDOR_Intel:
4850 4855 brand = intel_cpubrand(cpi);
4851 4856 break;
4852 4857 case X86_VENDOR_AMD:
4853 4858 brand = amd_cpubrand(cpi);
4854 4859 break;
4855 4860 case X86_VENDOR_Cyrix:
4856 4861 brand = cyrix_cpubrand(cpi, x86_type);
4857 4862 break;
4858 4863 case X86_VENDOR_NexGen:
4859 4864 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4860 4865 brand = "NexGen Nx586";
4861 4866 break;
4862 4867 case X86_VENDOR_Centaur:
4863 4868 if (cpi->cpi_family == 5)
4864 4869 switch (cpi->cpi_model) {
4865 4870 case 4:
4866 4871 brand = "Centaur C6";
4867 4872 break;
4868 4873 case 8:
4869 4874 brand = "Centaur C2";
4870 4875 break;
4871 4876 case 9:
4872 4877 brand = "Centaur C3";
4873 4878 break;
4874 4879 default:
4875 4880 break;
4876 4881 }
4877 4882 break;
4878 4883 case X86_VENDOR_Rise:
4879 4884 if (cpi->cpi_family == 5 &&
4880 4885 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4881 4886 brand = "Rise mP6";
4882 4887 break;
4883 4888 case X86_VENDOR_SiS:
4884 4889 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4885 4890 brand = "SiS 55x";
4886 4891 break;
4887 4892 case X86_VENDOR_TM:
4888 4893 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4889 4894 brand = "Transmeta Crusoe TM3x00 or TM5x00";
4890 4895 break;
4891 4896 case X86_VENDOR_NSC:
4892 4897 case X86_VENDOR_UMC:
4893 4898 default:
4894 4899 break;
4895 4900 }
4896 4901 if (brand) {
4897 4902 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4898 4903 return;
4899 4904 }
4900 4905
4901 4906 /*
4902 4907 * If all else fails ...
4903 4908 */
4904 4909 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4905 4910 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4906 4911 cpi->cpi_model, cpi->cpi_step);
4907 4912 }
4908 4913
4909 4914 /*
4910 4915 * This routine is called just after kernel memory allocation
4911 4916 * becomes available on cpu0, and as part of mp_startup() on
4912 4917 * the other cpus.
4913 4918 *
4914 4919 * Fixup the brand string, and collect any information from cpuid
4915 4920 * that requires dynamically allocated storage to represent.
4916 4921 */
4917 4922 /*ARGSUSED*/
4918 4923 void
4919 4924 cpuid_pass3(cpu_t *cpu)
4920 4925 {
4921 4926 int i, max, shft, level, size;
4922 4927 struct cpuid_regs regs;
4923 4928 struct cpuid_regs *cp;
4924 4929 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4925 4930
4926 4931 ASSERT(cpi->cpi_pass == 2);
4927 4932
4928 4933 /*
4929 4934 * Deterministic cache parameters
4930 4935 *
4931 4936 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4932 4937 * values that are present are currently defined to be the same. This
4933 4938 * means we can use the same logic to parse it as long as we use the
4934 4939 * appropriate leaf to get the data. If you're updating this, make sure
4935 4940 * you're careful about which vendor supports which aspect.
4936 4941 *
4937 4942 * Take this opportunity to detect the number of threads sharing the
4938 4943 * last level cache, and construct a corresponding cache id. The
4939 4944 * respective cpuid_info members are initialized to the default case of
4940 4945 * "no last level cache sharing".
4941 4946 */
4942 4947 cpi->cpi_ncpu_shr_last_cache = 1;
4943 4948 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4944 4949
4945 4950 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4946 4951 (cpi->cpi_vendor == X86_VENDOR_AMD &&
4947 4952 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4948 4953 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4949 4954 uint32_t leaf;
4950 4955
4951 4956 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4952 4957 leaf = 4;
4953 4958 } else {
4954 4959 leaf = CPUID_LEAF_EXT_1d;
4955 4960 }
4956 4961
4957 4962 /*
4958 4963 * Find the # of elements (size) returned by the leaf and along
4959 4964 * the way detect last level cache sharing details.
4960 4965 */
4961 4966 bzero(&regs, sizeof (regs));
4962 4967 cp = &regs;
4963 4968 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4964 4969 cp->cp_eax = leaf;
4965 4970 cp->cp_ecx = i;
4966 4971
4967 4972 (void) __cpuid_insn(cp);
4968 4973
4969 4974 if (CPI_CACHE_TYPE(cp) == 0)
4970 4975 break;
4971 4976 level = CPI_CACHE_LVL(cp);
4972 4977 if (level > max) {
4973 4978 max = level;
4974 4979 cpi->cpi_ncpu_shr_last_cache =
4975 4980 CPI_NTHR_SHR_CACHE(cp) + 1;
4976 4981 }
4977 4982 }
4978 4983 cpi->cpi_cache_leaf_size = size = i;
4979 4984
4980 4985 /*
4981 4986 * Allocate the cpi_cache_leaves array. The first element
4982 4987 * references the regs for the corresponding leaf with %ecx set
4983 4988 * to 0. This was gathered in cpuid_pass2().
4984 4989 */
4985 4990 if (size > 0) {
4986 4991 cpi->cpi_cache_leaves =
4987 4992 kmem_alloc(size * sizeof (cp), KM_SLEEP);
4988 4993 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4989 4994 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4990 4995 } else {
4991 4996 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4992 4997 }
4993 4998
4994 4999 /*
4995 5000 * Allocate storage to hold the additional regs
4996 5001 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4997 5002 *
4998 5003 * The regs for the leaf, %ecx == 0 has already
4999 5004 * been allocated as indicated above.
5000 5005 */
5001 5006 for (i = 1; i < size; i++) {
5002 5007 cp = cpi->cpi_cache_leaves[i] =
5003 5008 kmem_zalloc(sizeof (regs), KM_SLEEP);
5004 5009 cp->cp_eax = leaf;
5005 5010 cp->cp_ecx = i;
5006 5011
5007 5012 (void) __cpuid_insn(cp);
5008 5013 }
5009 5014 }
5010 5015 /*
5011 5016 * Determine the number of bits needed to represent
5012 5017 * the number of CPUs sharing the last level cache.
5013 5018 *
5014 5019 * Shift off that number of bits from the APIC id to
5015 5020 * derive the cache id.
5016 5021 */
5017 5022 shft = 0;
5018 5023 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5019 5024 shft++;
5020 5025 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5021 5026 }
5022 5027
5023 5028 /*
5024 5029 * Now fixup the brand string
5025 5030 */
5026 5031 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5027 5032 fabricate_brandstr(cpi);
5028 5033 } else {
5029 5034
5030 5035 /*
5031 5036 * If we successfully extracted a brand string from the cpuid
5032 5037 * instruction, clean it up by removing leading spaces and
5033 5038 * similar junk.
5034 5039 */
5035 5040 if (cpi->cpi_brandstr[0]) {
5036 5041 size_t maxlen = sizeof (cpi->cpi_brandstr);
5037 5042 char *src, *dst;
5038 5043
5039 5044 dst = src = (char *)cpi->cpi_brandstr;
5040 5045 src[maxlen - 1] = '\0';
5041 5046 /*
5042 5047 * strip leading spaces
5043 5048 */
5044 5049 while (*src == ' ')
5045 5050 src++;
5046 5051 /*
5047 5052 * Remove any 'Genuine' or "Authentic" prefixes
5048 5053 */
5049 5054 if (strncmp(src, "Genuine ", 8) == 0)
5050 5055 src += 8;
5051 5056 if (strncmp(src, "Authentic ", 10) == 0)
5052 5057 src += 10;
5053 5058
5054 5059 /*
5055 5060 * Now do an in-place copy.
5056 5061 * Map (R) to (r) and (TM) to (tm).
5057 5062 * The era of teletypes is long gone, and there's
5058 5063 * -really- no need to shout.
5059 5064 */
5060 5065 while (*src != '\0') {
5061 5066 if (src[0] == '(') {
5062 5067 if (strncmp(src + 1, "R)", 2) == 0) {
5063 5068 (void) strncpy(dst, "(r)", 3);
5064 5069 src += 3;
5065 5070 dst += 3;
5066 5071 continue;
5067 5072 }
5068 5073 if (strncmp(src + 1, "TM)", 3) == 0) {
5069 5074 (void) strncpy(dst, "(tm)", 4);
5070 5075 src += 4;
5071 5076 dst += 4;
5072 5077 continue;
5073 5078 }
5074 5079 }
5075 5080 *dst++ = *src++;
5076 5081 }
5077 5082 *dst = '\0';
5078 5083
5079 5084 /*
5080 5085 * Finally, remove any trailing spaces
5081 5086 */
5082 5087 while (--dst > cpi->cpi_brandstr)
5083 5088 if (*dst == ' ')
5084 5089 *dst = '\0';
5085 5090 else
5086 5091 break;
5087 5092 } else
5088 5093 fabricate_brandstr(cpi);
5089 5094 }
5090 5095 cpi->cpi_pass = 3;
5091 5096 }
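/*
 * Editorial sketch (not part of the original file): the last-level cache id
 * derivation from cpuid_pass3() above, in isolation. If N hardware threads
 * share the last-level cache, only the low ceil(log2(N)) bits of the APIC id
 * distinguish the sharers, so shifting them off yields an id common to the
 * whole sharing group. With 8 sharers, shft becomes 3 and APIC ids 0x10
 * through 0x17 all map to cache id 0x2. The helper name is hypothetical.
 */
static uint32_t
cpuid_llc_cacheid_sketch(uint32_t apicid, uint_t ncpu_shr_last_cache)
{
	uint_t i, shft = 0;

	for (i = 1; i < ncpu_shr_last_cache; i <<= 1)
		shft++;
	return (apicid >> shft);
}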
5092 5097
5093 5098 /*
5094 5099 * This routine is called out of bind_hwcap() much later in the life
5095 5100 * of the kernel (post_startup()). The job of this routine is to resolve
5096 5101 * the hardware feature support and kernel support for those features into
5097 5102 * what we're actually going to tell applications via the aux vector.
5098 5103 */
5099 5104 void
5100 5105 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
5101 5106 {
5102 5107 struct cpuid_info *cpi;
5103 5108 uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
5104 5109
5105 5110 if (cpu == NULL)
5106 5111 cpu = CPU;
5107 5112 cpi = cpu->cpu_m.mcpu_cpi;
5108 5113
5109 5114 ASSERT(cpi->cpi_pass == 3);
5110 5115
5111 5116 if (cpi->cpi_maxeax >= 1) {
5112 5117 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5113 5118 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5114 5119 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
5115 5120
5116 5121 *edx = CPI_FEATURES_EDX(cpi);
5117 5122 *ecx = CPI_FEATURES_ECX(cpi);
5118 5123 *ebx = CPI_FEATURES_7_0_EBX(cpi);
5119 5124
5120 5125 /*
5121 5126 * [these require explicit kernel support]
5122 5127 */
5123 5128 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
5124 5129 *edx &= ~CPUID_INTC_EDX_SEP;
5125 5130
5126 5131 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
5127 5132 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
5128 5133 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
5129 5134 *edx &= ~CPUID_INTC_EDX_SSE2;
5130 5135
5131 5136 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
5132 5137 *edx &= ~CPUID_INTC_EDX_HTT;
5133 5138
5134 5139 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
5135 5140 *ecx &= ~CPUID_INTC_ECX_SSE3;
5136 5141
5137 5142 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
5138 5143 *ecx &= ~CPUID_INTC_ECX_SSSE3;
5139 5144 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
5140 5145 *ecx &= ~CPUID_INTC_ECX_SSE4_1;
5141 5146 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
5142 5147 *ecx &= ~CPUID_INTC_ECX_SSE4_2;
5143 5148 if (!is_x86_feature(x86_featureset, X86FSET_AES))
5144 5149 *ecx &= ~CPUID_INTC_ECX_AES;
5145 5150 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
5146 5151 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
5147 5152 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
5148 5153 *ecx &= ~(CPUID_INTC_ECX_XSAVE |
5149 5154 CPUID_INTC_ECX_OSXSAVE);
5150 5155 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
5151 5156 *ecx &= ~CPUID_INTC_ECX_AVX;
5152 5157 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
5153 5158 *ecx &= ~CPUID_INTC_ECX_F16C;
5154 5159 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
5155 5160 *ecx &= ~CPUID_INTC_ECX_FMA;
5156 5161 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
5157 5162 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
5158 5163 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
5159 5164 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
5160 5165 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
5161 5166 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
5162 5167 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
5163 5168 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
5164 5169 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
5165 5170 *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
5166 5171
5167 5172 /*
5168 5173 * [no explicit support required beyond x87 fp context]
5169 5174 */
5170 5175 if (!fpu_exists)
5171 5176 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5172 5177
5173 5178 /*
5174 5179 * Now map the supported feature vector to things that we
5175 5180 * think userland will care about.
5176 5181 */
5177 5182 if (*edx & CPUID_INTC_EDX_SEP)
5178 5183 hwcap_flags |= AV_386_SEP;
5179 5184 if (*edx & CPUID_INTC_EDX_SSE)
5180 5185 hwcap_flags |= AV_386_FXSR | AV_386_SSE;
5181 5186 if (*edx & CPUID_INTC_EDX_SSE2)
5182 5187 hwcap_flags |= AV_386_SSE2;
5183 5188 if (*ecx & CPUID_INTC_ECX_SSE3)
5184 5189 hwcap_flags |= AV_386_SSE3;
5185 5190 if (*ecx & CPUID_INTC_ECX_SSSE3)
5186 5191 hwcap_flags |= AV_386_SSSE3;
5187 5192 if (*ecx & CPUID_INTC_ECX_SSE4_1)
5188 5193 hwcap_flags |= AV_386_SSE4_1;
5189 5194 if (*ecx & CPUID_INTC_ECX_SSE4_2)
5190 5195 hwcap_flags |= AV_386_SSE4_2;
5191 5196 if (*ecx & CPUID_INTC_ECX_MOVBE)
5192 5197 hwcap_flags |= AV_386_MOVBE;
5193 5198 if (*ecx & CPUID_INTC_ECX_AES)
5194 5199 hwcap_flags |= AV_386_AES;
5195 5200 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5196 5201 hwcap_flags |= AV_386_PCLMULQDQ;
5197 5202 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5198 5203 (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5199 5204 hwcap_flags |= AV_386_XSAVE;
5200 5205
5201 5206 if (*ecx & CPUID_INTC_ECX_AVX) {
5202 5207 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5203 5208 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5204 5209
5205 5210 hwcap_flags |= AV_386_AVX;
5206 5211 if (*ecx & CPUID_INTC_ECX_F16C)
5207 5212 hwcap_flags_2 |= AV_386_2_F16C;
5208 5213 if (*ecx & CPUID_INTC_ECX_FMA)
5209 5214 hwcap_flags_2 |= AV_386_2_FMA;
5210 5215
5211 5216 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5212 5217 hwcap_flags_2 |= AV_386_2_BMI1;
5213 5218 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5214 5219 hwcap_flags_2 |= AV_386_2_BMI2;
5215 5220 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5216 5221 hwcap_flags_2 |= AV_386_2_AVX2;
5217 5222 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5218 5223 hwcap_flags_2 |= AV_386_2_AVX512F;
5219 5224 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5220 5225 hwcap_flags_2 |= AV_386_2_AVX512DQ;
5221 5226 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5222 5227 hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5223 5228 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5224 5229 hwcap_flags_2 |= AV_386_2_AVX512PF;
5225 5230 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5226 5231 hwcap_flags_2 |= AV_386_2_AVX512ER;
5227 5232 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5228 5233 hwcap_flags_2 |= AV_386_2_AVX512CD;
5229 5234 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5230 5235 hwcap_flags_2 |= AV_386_2_AVX512BW;
5231 5236 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5232 5237 hwcap_flags_2 |= AV_386_2_AVX512VL;
5233 5238
5234 5239 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5235 5240 hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5236 5241 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5237 5242 hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5238 5243 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5239 5244 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5240 5245
5241 5246 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5242 5247 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5243 5248 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5244 5249 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5245 5250 }
5246 5251 }
5247 5252 if (*ecx & CPUID_INTC_ECX_VMX)
5248 5253 hwcap_flags |= AV_386_VMX;
5249 5254 if (*ecx & CPUID_INTC_ECX_POPCNT)
5250 5255 hwcap_flags |= AV_386_POPCNT;
5251 5256 if (*edx & CPUID_INTC_EDX_FPU)
5252 5257 hwcap_flags |= AV_386_FPU;
5253 5258 if (*edx & CPUID_INTC_EDX_MMX)
5254 5259 hwcap_flags |= AV_386_MMX;
5255 5260
5256 5261 if (*edx & CPUID_INTC_EDX_TSC)
5257 5262 hwcap_flags |= AV_386_TSC;
5258 5263 if (*edx & CPUID_INTC_EDX_CX8)
5259 5264 hwcap_flags |= AV_386_CX8;
5260 5265 if (*edx & CPUID_INTC_EDX_CMOV)
5261 5266 hwcap_flags |= AV_386_CMOV;
5262 5267 if (*ecx & CPUID_INTC_ECX_CX16)
5263 5268 hwcap_flags |= AV_386_CX16;
5264 5269
5265 5270 if (*ecx & CPUID_INTC_ECX_RDRAND)
5266 5271 hwcap_flags_2 |= AV_386_2_RDRAND;
5267 5272 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5268 5273 hwcap_flags_2 |= AV_386_2_ADX;
5269 5274 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5270 5275 hwcap_flags_2 |= AV_386_2_RDSEED;
5271 5276 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5272 5277 hwcap_flags_2 |= AV_386_2_SHA;
5273 5278 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5274 5279 hwcap_flags_2 |= AV_386_2_FSGSBASE;
5275 5280 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5276 5281 hwcap_flags_2 |= AV_386_2_CLWB;
5277 5282 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5278 5283 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5279 5284
5280 5285 }
5281 5286 /*
5282 5287 * Check a few miscellaneous features.
5283 5288 */
5284 5289 if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5285 5290 hwcap_flags_2 |= AV_386_2_CLZERO;
5286 5291
5287 5292 if (cpi->cpi_xmaxeax < 0x80000001)
5288 5293 goto pass4_done;
5289 5294
5290 5295 switch (cpi->cpi_vendor) {
5291 5296 struct cpuid_regs cp;
5292 5297 uint32_t *edx, *ecx;
5293 5298
5294 5299 case X86_VENDOR_Intel:
5295 5300 /*
5296 5301 * Seems like Intel duplicated what was necessary
5297 5302 * here to make the initial crop of 64-bit OSes work.
5298 5303 * Hopefully, those are the only "extended" bits
5299 5304 * they'll add.
5300 5305 */
5301 5306 /*FALLTHROUGH*/
5302 5307
5303 5308 case X86_VENDOR_AMD:
5304 5309 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5305 5310 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5306 5311
5307 5312 *edx = CPI_FEATURES_XTD_EDX(cpi);
5308 5313 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5309 5314
5310 5315 /*
5311 5316 * [these features require explicit kernel support]
5312 5317 */
5313 5318 switch (cpi->cpi_vendor) {
5314 5319 case X86_VENDOR_Intel:
5315 5320 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5316 5321 *edx &= ~CPUID_AMD_EDX_TSCP;
5317 5322 break;
5318 5323
5319 5324 case X86_VENDOR_AMD:
5320 5325 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5321 5326 *edx &= ~CPUID_AMD_EDX_TSCP;
5322 5327 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5323 5328 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5324 5329 break;
5325 5330
5326 5331 default:
5327 5332 break;
5328 5333 }
5329 5334
5330 5335 /*
5331 5336 * [no explicit support required beyond
5332 5337 * x87 fp context and exception handlers]
5333 5338 */
5334 5339 if (!fpu_exists)
5335 5340 *edx &= ~(CPUID_AMD_EDX_MMXamd |
5336 5341 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5337 5342
5338 5343 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5339 5344 *edx &= ~CPUID_AMD_EDX_NX;
5340 5345 #if !defined(__amd64)
5341 5346 *edx &= ~CPUID_AMD_EDX_LM;
5342 5347 #endif
5343 5348 /*
5344 5349 * Now map the supported feature vector to
5345 5350 * things that we think userland will care about.
5346 5351 */
5347 5352 #if defined(__amd64)
5348 5353 if (*edx & CPUID_AMD_EDX_SYSC)
5349 5354 hwcap_flags |= AV_386_AMD_SYSC;
5350 5355 #endif
5351 5356 if (*edx & CPUID_AMD_EDX_MMXamd)
5352 5357 hwcap_flags |= AV_386_AMD_MMX;
5353 5358 if (*edx & CPUID_AMD_EDX_3DNow)
5354 5359 hwcap_flags |= AV_386_AMD_3DNow;
5355 5360 if (*edx & CPUID_AMD_EDX_3DNowx)
5356 5361 hwcap_flags |= AV_386_AMD_3DNowx;
5357 5362 if (*ecx & CPUID_AMD_ECX_SVM)
5358 5363 hwcap_flags |= AV_386_AMD_SVM;
5359 5364
5360 5365 switch (cpi->cpi_vendor) {
5361 5366 case X86_VENDOR_AMD:
5362 5367 if (*edx & CPUID_AMD_EDX_TSCP)
5363 5368 hwcap_flags |= AV_386_TSCP;
5364 5369 if (*ecx & CPUID_AMD_ECX_AHF64)
5365 5370 hwcap_flags |= AV_386_AHF;
5366 5371 if (*ecx & CPUID_AMD_ECX_SSE4A)
5367 5372 hwcap_flags |= AV_386_AMD_SSE4A;
5368 5373 if (*ecx & CPUID_AMD_ECX_LZCNT)
5369 5374 hwcap_flags |= AV_386_AMD_LZCNT;
5370 5375 if (*ecx & CPUID_AMD_ECX_MONITORX)
5371 5376 hwcap_flags_2 |= AV_386_2_MONITORX;
5372 5377 break;
5373 5378
5374 5379 case X86_VENDOR_Intel:
5375 5380 if (*edx & CPUID_AMD_EDX_TSCP)
5376 5381 hwcap_flags |= AV_386_TSCP;
5377 5382 if (*ecx & CPUID_AMD_ECX_LZCNT)
5378 5383 hwcap_flags |= AV_386_AMD_LZCNT;
5379 5384 /*
5380 5385 * Aarrgh.
5381 5386 * Intel uses a different bit in the same word.
5382 5387 */
5383 5388 if (*ecx & CPUID_INTC_ECX_AHF64)
5384 5389 hwcap_flags |= AV_386_AHF;
5385 5390 break;
5386 5391
5387 5392 default:
5388 5393 break;
5389 5394 }
5390 5395 break;
5391 5396
5392 5397 case X86_VENDOR_TM:
5393 5398 cp.cp_eax = 0x80860001;
5394 5399 (void) __cpuid_insn(&cp);
5395 5400 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5396 5401 break;
5397 5402
5398 5403 default:
5399 5404 break;
5400 5405 }
5401 5406
5402 5407 pass4_done:
5403 5408 cpi->cpi_pass = 4;
5404 5409 if (hwcap_out != NULL) {
5405 5410 hwcap_out[0] = hwcap_flags;
5406 5411 hwcap_out[1] = hwcap_flags_2;
5407 5412 }
5408 5413 }
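/*
 * Editorial sketch (not part of the original file): one way userland might
 * consume the two hwcap words assembled by cpuid_pass4() above, which reach
 * applications through the aux vector. This assumes the getisax(2) interface
 * and the AV_386_* definitions from <sys/auxv.h>; the prototype and the
 * example function are illustrative assumptions only, so the fragment is
 * fenced off from the kernel build.
 */
#if 0	/* userland illustration only */
#include <sys/auxv.h>

static int
have_aes_insns(void)
{
	uint32_t hw[2] = { 0, 0 };

	(void) getisax(hw, 2);		/* hw[0]: AV_386_*, hw[1]: AV_386_2_* */
	return ((hw[0] & AV_386_AES) != 0);
}
#endif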
5409 5414
5410 5415
5411 5416 /*
5412 5417 * Simulate the cpuid instruction using the data we previously
5413 5418 * captured about this CPU. We try our best to return the truth
5414 5419 * about the hardware, independently of kernel support.
5415 5420 */
5416 5421 uint32_t
5417 5422 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5418 5423 {
5419 5424 struct cpuid_info *cpi;
5420 5425 struct cpuid_regs *xcp;
5421 5426
5422 5427 if (cpu == NULL)
5423 5428 cpu = CPU;
5424 5429 cpi = cpu->cpu_m.mcpu_cpi;
5425 5430
5426 5431 ASSERT(cpuid_checkpass(cpu, 3));
5427 5432
5428 5433 /*
5429 5434 * CPUID data is cached in two separate places: cpi_std for standard
5430 5435 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5431 5436 */
5432 5437 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5433 5438 xcp = &cpi->cpi_std[cp->cp_eax];
5434 5439 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5435 5440 cp->cp_eax <= cpi->cpi_xmaxeax &&
5436 5441 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5437 5442 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5438 5443 } else {
5439 5444 /*
5440 5445 * The caller is asking for data from an input parameter which
5441 5446 * the kernel has not cached. In this case we go fetch from
5442 5447 * the hardware and return the data directly to the user.
5443 5448 */
5444 5449 return (__cpuid_insn(cp));
5445 5450 }
5446 5451
5447 5452 cp->cp_eax = xcp->cp_eax;
5448 5453 cp->cp_ebx = xcp->cp_ebx;
5449 5454 cp->cp_ecx = xcp->cp_ecx;
5450 5455 cp->cp_edx = xcp->cp_edx;
5451 5456 return (cp->cp_eax);
5452 5457 }
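/*
 * Editorial sketch (not part of the original file): a minimal caller of
 * cpuid_insn() above. A NULL cpu means the current CPU, cached leaves are
 * answered from cpi_std/cpi_extd, and anything else falls through to the
 * hardware. Valid only once cpuid_pass3() has run, per the ASSERT above.
 * The helper name is hypothetical.
 */
static uint32_t
cpuid_insn_usage_sketch(void)
{
	struct cpuid_regs regs;

	regs.cp_eax = 1;	/* standard leaf 1: signature and feature bits */
	regs.cp_ebx = regs.cp_ecx = regs.cp_edx = 0;
	return (cpuid_insn(NULL, &regs));	/* returns %eax, fills regs */
}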
5453 5458
5454 5459 int
5455 5460 cpuid_checkpass(cpu_t *cpu, int pass)
5456 5461 {
5457 5462 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5458 5463 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5459 5464 }
5460 5465
5461 5466 int
5462 5467 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5463 5468 {
5464 5469 ASSERT(cpuid_checkpass(cpu, 3));
5465 5470
5466 5471 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5467 5472 }
5468 5473
5469 5474 int
5470 5475 cpuid_is_cmt(cpu_t *cpu)
5471 5476 {
5472 5477 if (cpu == NULL)
5473 5478 cpu = CPU;
5474 5479
5475 5480 ASSERT(cpuid_checkpass(cpu, 1));
5476 5481
5477 5482 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5478 5483 }
5479 5484
5480 5485 /*
5481 5486 * AMD and Intel both implement the 64-bit variant of the syscall
5482 5487 * instruction (syscallq), so if there's -any- support for syscall,
5483 5488 * cpuid currently says "yes, we support this".
5484 5489 *
5485 5490 * However, Intel decided to -not- implement the 32-bit variant of the
5486 5491 * syscall instruction, so we provide a predicate to allow our caller
5487 5492 * to test that subtlety here.
5488 5493 *
5489 5494 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5490 5495 * even in the case where the hardware would in fact support it.
5491 5496 */
5492 5497 /*ARGSUSED*/
5493 5498 int
5494 5499 cpuid_syscall32_insn(cpu_t *cpu)
5495 5500 {
5496 5501 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5497 5502
5498 5503 #if !defined(__xpv)
5499 5504 if (cpu == NULL)
5500 5505 cpu = CPU;
5501 5506
5502 5507 /*CSTYLED*/
5503 5508 {
5504 5509 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5505 5510
5506 5511 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5507 5512 cpi->cpi_xmaxeax >= 0x80000001 &&
5508 5513 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5509 5514 return (1);
5510 5515 }
5511 5516 #endif
5512 5517 return (0);
5513 5518 }
5514 5519
5515 5520 int
5516 5521 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5517 5522 {
5518 5523 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5519 5524
5520 5525 static const char fmt[] =
5521 5526 "x86 (%s %X family %d model %d step %d clock %d MHz)";
5522 5527 static const char fmt_ht[] =
5523 5528 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5524 5529
5525 5530 ASSERT(cpuid_checkpass(cpu, 1));
5526 5531
5527 5532 if (cpuid_is_cmt(cpu))
5528 5533 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5529 5534 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5530 5535 cpi->cpi_family, cpi->cpi_model,
5531 5536 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5532 5537 return (snprintf(s, n, fmt,
5533 5538 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5534 5539 cpi->cpi_family, cpi->cpi_model,
5535 5540 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5536 5541 }
5537 5542
5538 5543 const char *
5539 5544 cpuid_getvendorstr(cpu_t *cpu)
5540 5545 {
5541 5546 ASSERT(cpuid_checkpass(cpu, 1));
5542 5547 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5543 5548 }
5544 5549
5545 5550 uint_t
5546 5551 cpuid_getvendor(cpu_t *cpu)
5547 5552 {
5548 5553 ASSERT(cpuid_checkpass(cpu, 1));
5549 5554 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5550 5555 }
5551 5556
5552 5557 uint_t
5553 5558 cpuid_getfamily(cpu_t *cpu)
5554 5559 {
5555 5560 ASSERT(cpuid_checkpass(cpu, 1));
5556 5561 return (cpu->cpu_m.mcpu_cpi->cpi_family);
5557 5562 }
5558 5563
5559 5564 uint_t
5560 5565 cpuid_getmodel(cpu_t *cpu)
5561 5566 {
5562 5567 ASSERT(cpuid_checkpass(cpu, 1));
5563 5568 return (cpu->cpu_m.mcpu_cpi->cpi_model);
5564 5569 }
5565 5570
5566 5571 uint_t
5567 5572 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5568 5573 {
5569 5574 ASSERT(cpuid_checkpass(cpu, 1));
5570 5575 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5571 5576 }
5572 5577
5573 5578 uint_t
5574 5579 cpuid_get_ncore_per_chip(cpu_t *cpu)
5575 5580 {
5576 5581 ASSERT(cpuid_checkpass(cpu, 1));
5577 5582 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5578 5583 }
5579 5584
5580 5585 uint_t
5581 5586 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5582 5587 {
5583 5588 ASSERT(cpuid_checkpass(cpu, 2));
5584 5589 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5585 5590 }
5586 5591
5587 5592 id_t
5588 5593 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5589 5594 {
5590 5595 ASSERT(cpuid_checkpass(cpu, 2));
5591 5596 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5592 5597 }
5593 5598
5594 5599 uint_t
5595 5600 cpuid_getstep(cpu_t *cpu)
5596 5601 {
5597 5602 ASSERT(cpuid_checkpass(cpu, 1));
5598 5603 return (cpu->cpu_m.mcpu_cpi->cpi_step);
5599 5604 }
5600 5605
5601 5606 uint_t
5602 5607 cpuid_getsig(struct cpu *cpu)
5603 5608 {
5604 5609 ASSERT(cpuid_checkpass(cpu, 1));
5605 5610 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5606 5611 }
5607 5612
5608 5613 uint32_t
5609 5614 cpuid_getchiprev(struct cpu *cpu)
5610 5615 {
5611 5616 ASSERT(cpuid_checkpass(cpu, 1));
5612 5617 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5613 5618 }
5614 5619
5615 5620 const char *
5616 5621 cpuid_getchiprevstr(struct cpu *cpu)
5617 5622 {
5618 5623 ASSERT(cpuid_checkpass(cpu, 1));
5619 5624 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5620 5625 }
5621 5626
5622 5627 uint32_t
5623 5628 cpuid_getsockettype(struct cpu *cpu)
5624 5629 {
5625 5630 ASSERT(cpuid_checkpass(cpu, 1));
5626 5631 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5627 5632 }
5628 5633
5629 5634 const char *
5630 5635 cpuid_getsocketstr(cpu_t *cpu)
5631 5636 {
5632 5637 static const char *socketstr = NULL;
5633 5638 struct cpuid_info *cpi;
5634 5639
5635 5640 ASSERT(cpuid_checkpass(cpu, 1));
5636 5641 cpi = cpu->cpu_m.mcpu_cpi;
5637 5642
5638 5643 /* Assume that socket types are the same across the system */
5639 5644 if (socketstr == NULL)
5640 5645 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5641 5646 cpi->cpi_model, cpi->cpi_step);
5642 5647
5643 5648
5644 5649 return (socketstr);
5645 5650 }
5646 5651
5647 5652 int
5648 5653 cpuid_get_chipid(cpu_t *cpu)
5649 5654 {
5650 5655 ASSERT(cpuid_checkpass(cpu, 1));
5651 5656
5652 5657 if (cpuid_is_cmt(cpu))
5653 5658 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5654 5659 return (cpu->cpu_id);
5655 5660 }
5656 5661
5657 5662 id_t
5658 5663 cpuid_get_coreid(cpu_t *cpu)
5659 5664 {
5660 5665 ASSERT(cpuid_checkpass(cpu, 1));
5661 5666 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5662 5667 }
5663 5668
5664 5669 int
5665 5670 cpuid_get_pkgcoreid(cpu_t *cpu)
5666 5671 {
5667 5672 ASSERT(cpuid_checkpass(cpu, 1));
5668 5673 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5669 5674 }
5670 5675
5671 5676 int
5672 5677 cpuid_get_clogid(cpu_t *cpu)
5673 5678 {
5674 5679 ASSERT(cpuid_checkpass(cpu, 1));
5675 5680 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5676 5681 }
5677 5682
5678 5683 int
5679 5684 cpuid_get_cacheid(cpu_t *cpu)
5680 5685 {
5681 5686 ASSERT(cpuid_checkpass(cpu, 1));
5682 5687 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5683 5688 }
5684 5689
5685 5690 uint_t
5686 5691 cpuid_get_procnodeid(cpu_t *cpu)
5687 5692 {
5688 5693 ASSERT(cpuid_checkpass(cpu, 1));
5689 5694 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5690 5695 }
5691 5696
5692 5697 uint_t
5693 5698 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5694 5699 {
5695 5700 ASSERT(cpuid_checkpass(cpu, 1));
5696 5701 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5697 5702 }
5698 5703
5699 5704 uint_t
5700 5705 cpuid_get_compunitid(cpu_t *cpu)
5701 5706 {
5702 5707 ASSERT(cpuid_checkpass(cpu, 1));
5703 5708 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5704 5709 }
5705 5710
5706 5711 uint_t
5707 5712 cpuid_get_cores_per_compunit(cpu_t *cpu)
5708 5713 {
5709 5714 ASSERT(cpuid_checkpass(cpu, 1));
5710 5715 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5711 5716 }
5712 5717
5713 5718 /*ARGSUSED*/
5714 5719 int
5715 5720 cpuid_have_cr8access(cpu_t *cpu)
5716 5721 {
5717 5722 #if defined(__amd64)
5718 5723 return (1);
5719 5724 #else
5720 5725 struct cpuid_info *cpi;
5721 5726
5722 5727 ASSERT(cpu != NULL);
5723 5728 cpi = cpu->cpu_m.mcpu_cpi;
5724 5729 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5725 5730 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5726 5731 return (1);
5727 5732 return (0);
5728 5733 #endif
5729 5734 }
5730 5735
5731 5736 uint32_t
5732 5737 cpuid_get_apicid(cpu_t *cpu)
5733 5738 {
5734 5739 ASSERT(cpuid_checkpass(cpu, 1));
5735 5740 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5736 5741 return (UINT32_MAX);
5737 5742 } else {
5738 5743 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5739 5744 }
5740 5745 }
5741 5746
5742 5747 void
5743 5748 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5744 5749 {
5745 5750 struct cpuid_info *cpi;
5746 5751
5747 5752 if (cpu == NULL)
5748 5753 cpu = CPU;
5749 5754 cpi = cpu->cpu_m.mcpu_cpi;
5750 5755
5751 5756 ASSERT(cpuid_checkpass(cpu, 1));
5752 5757
5753 5758 if (pabits)
5754 5759 *pabits = cpi->cpi_pabits;
5755 5760 if (vabits)
5756 5761 *vabits = cpi->cpi_vabits;
5757 5762 }
5758 5763
5759 5764 size_t
5760 5765 cpuid_get_xsave_size()
5761 5766 {
5762 5767 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5763 5768 sizeof (struct xsave_state)));
5764 5769 }
5765 5770
5766 5771 /*
5767 5772 * Return true if the CPUs on this system require 'pointer clearing' for the
5768 5773 * floating point error pointer exception handling. In the past, this has been
5769 5774 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5770 5775 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5771 5776 * feature bit and is reflected in the cpi_fp_amd_save member.
5772 5777 */
5773 5778 boolean_t
5774 5779 cpuid_need_fp_excp_handling()
5775 5780 {
5776 5781 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5777 5782 cpuid_info0.cpi_fp_amd_save != 0);
5778 5783 }
5779 5784
5780 5785 /*
5781 5786 * Returns the number of data TLB entries for a corresponding
5782 5787 * pagesize. If it can't be computed, or isn't known, the
5783 5788 * routine returns zero. If you ask about an architecturally
5784 5789 * impossible pagesize, the routine will panic (so that the
5785 5790 * hat implementor knows that things are inconsistent.)
5786 5791 	 * hat implementor knows that things are inconsistent).
5787 5792 uint_t
5788 5793 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5789 5794 {
5790 5795 struct cpuid_info *cpi;
5791 5796 uint_t dtlb_nent = 0;
5792 5797
5793 5798 if (cpu == NULL)
5794 5799 cpu = CPU;
5795 5800 cpi = cpu->cpu_m.mcpu_cpi;
5796 5801
5797 5802 ASSERT(cpuid_checkpass(cpu, 1));
5798 5803
5799 5804 /*
5800 5805 * Check the L2 TLB info
5801 5806 */
5802 5807 if (cpi->cpi_xmaxeax >= 0x80000006) {
5803 5808 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5804 5809
5805 5810 switch (pagesize) {
5806 5811
5807 5812 case 4 * 1024:
5808 5813 /*
5809 5814 * All zero in the top 16 bits of the register
5810 5815 * indicates a unified TLB. Size is in low 16 bits.
5811 5816 */
5812 5817 if ((cp->cp_ebx & 0xffff0000) == 0)
5813 5818 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5814 5819 else
5815 5820 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5816 5821 break;
5817 5822
5818 5823 case 2 * 1024 * 1024:
5819 5824 if ((cp->cp_eax & 0xffff0000) == 0)
5820 5825 dtlb_nent = cp->cp_eax & 0x0000ffff;
5821 5826 else
5822 5827 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5823 5828 break;
5824 5829
5825 5830 default:
5826 5831 panic("unknown L2 pagesize");
5827 5832 /*NOTREACHED*/
5828 5833 }
5829 5834 }
5830 5835
5831 5836 if (dtlb_nent != 0)
5832 5837 return (dtlb_nent);
5833 5838
5834 5839 /*
5835 5840 * No L2 TLB support for this size, try L1.
5836 5841 */
5837 5842 if (cpi->cpi_xmaxeax >= 0x80000005) {
5838 5843 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5839 5844
5840 5845 switch (pagesize) {
5841 5846 case 4 * 1024:
5842 5847 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5843 5848 break;
5844 5849 case 2 * 1024 * 1024:
5845 5850 dtlb_nent = BITX(cp->cp_eax, 23, 16);
5846 5851 break;
5847 5852 default:
5848 5853 panic("unknown L1 d-TLB pagesize");
5849 5854 /*NOTREACHED*/
5850 5855 }
5851 5856 }
5852 5857
5853 5858 return (dtlb_nent);
5854 5859 }
5855 5860
5856 5861 /*
5857 5862 * Return 0 if the erratum is not present or not applicable, positive
5858 5863 * if it is, and negative if the status of the erratum is unknown.
5859 5864 *
5860 5865 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5861 5866 * Processors" #25759, Rev 3.57, August 2005
5862 5867 */
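/*
 * Illustrative use only (the handler names below are placeholders, not
 * actual callers): consumers of this routine typically treat a positive
 * return as "apply the workaround" and a negative return as "status
 * unknown, assume affected", e.g.
 *
 *	if (cpuid_opteron_erratum(CPU, 95) > 0)
 *		apply_erratum_95_workaround();
 *	else if (cpuid_opteron_erratum(CPU, 95) < 0)
 *		assume_affected();
 */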
5863 5868 int
5864 5869 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5865 5870 {
5866 5871 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5867 5872 uint_t eax;
5868 5873
5869 5874 /*
5870 5875 * Bail out if this CPU isn't an AMD CPU, or if it's
5871 5876 * a legacy (32-bit) AMD CPU.
5872 5877 */
5873 5878 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5874 5879 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5875 5880 cpi->cpi_family == 6) {
5876 5881 return (0);
5877 5882 }
5878 5883
5879 5884 eax = cpi->cpi_std[1].cp_eax;
5880 5885
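	/*
	 * The revision macros below compare the raw leaf 1 %eax value, which
	 * encodes the stepping in bits [3:0], the model in [7:4], the family
	 * in [11:8], the extended model in [19:16] and the extended family
	 * in [27:20]. For example, 0xf40 is family 0xf, model 4, stepping 0,
	 * while 0x20f12 is family 0xf, extended model 2 (i.e. model 0x21),
	 * stepping 2.
	 */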
5881 5886 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
5882 5887 #define SH_B3(eax) (eax == 0xf51)
5883 5888 #define B(eax) (SH_B0(eax) || SH_B3(eax))
5884 5889
5885 5890 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
5886 5891
5887 5892 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5888 5893 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5889 5894 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
5890 5895 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5891 5896
5892 5897 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5893 5898 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
5894 5899 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
5895 5900 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5896 5901
5897 5902 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5898 5903 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
5899 5904 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
5900 5905 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
5901 5906 #define BH_E4(eax) (eax == 0x20fb1)
5902 5907 #define SH_E5(eax) (eax == 0x20f42)
5903 5908 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
5904 5909 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
5905 5910 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5906 5911 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5907 5912 DH_E6(eax) || JH_E6(eax))
5908 5913
5909 5914 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5910 5915 #define DR_B0(eax) (eax == 0x100f20)
5911 5916 #define DR_B1(eax) (eax == 0x100f21)
5912 5917 #define DR_BA(eax) (eax == 0x100f2a)
5913 5918 #define DR_B2(eax) (eax == 0x100f22)
5914 5919 #define DR_B3(eax) (eax == 0x100f23)
5915 5920 #define RB_C0(eax) (eax == 0x100f40)
5916 5921
5917 5922 switch (erratum) {
5918 5923 case 1:
5919 5924 return (cpi->cpi_family < 0x10);
5920 5925 case 51: /* what does the asterisk mean? */
5921 5926 return (B(eax) || SH_C0(eax) || CG(eax));
5922 5927 case 52:
5923 5928 return (B(eax));
5924 5929 case 57:
5925 5930 return (cpi->cpi_family <= 0x11);
5926 5931 case 58:
5927 5932 return (B(eax));
5928 5933 case 60:
5929 5934 return (cpi->cpi_family <= 0x11);
5930 5935 case 61:
5931 5936 case 62:
5932 5937 case 63:
5933 5938 case 64:
5934 5939 case 65:
5935 5940 case 66:
5936 5941 case 68:
5937 5942 case 69:
5938 5943 case 70:
5939 5944 case 71:
5940 5945 return (B(eax));
5941 5946 case 72:
5942 5947 return (SH_B0(eax));
5943 5948 case 74:
5944 5949 return (B(eax));
5945 5950 case 75:
5946 5951 return (cpi->cpi_family < 0x10);
5947 5952 case 76:
5948 5953 return (B(eax));
5949 5954 case 77:
5950 5955 return (cpi->cpi_family <= 0x11);
5951 5956 case 78:
5952 5957 return (B(eax) || SH_C0(eax));
5953 5958 case 79:
5954 5959 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5955 5960 case 80:
5956 5961 case 81:
5957 5962 case 82:
5958 5963 return (B(eax));
5959 5964 case 83:
5960 5965 return (B(eax) || SH_C0(eax) || CG(eax));
5961 5966 case 85:
5962 5967 return (cpi->cpi_family < 0x10);
5963 5968 case 86:
5964 5969 return (SH_C0(eax) || CG(eax));
5965 5970 case 88:
5966 5971 #if !defined(__amd64)
5967 5972 return (0);
5968 5973 #else
5969 5974 return (B(eax) || SH_C0(eax));
5970 5975 #endif
5971 5976 case 89:
5972 5977 return (cpi->cpi_family < 0x10);
5973 5978 case 90:
5974 5979 return (B(eax) || SH_C0(eax) || CG(eax));
5975 5980 case 91:
5976 5981 case 92:
5977 5982 return (B(eax) || SH_C0(eax));
5978 5983 case 93:
5979 5984 return (SH_C0(eax));
5980 5985 case 94:
5981 5986 return (B(eax) || SH_C0(eax) || CG(eax));
5982 5987 case 95:
5983 5988 #if !defined(__amd64)
5984 5989 return (0);
5985 5990 #else
5986 5991 return (B(eax) || SH_C0(eax));
5987 5992 #endif
5988 5993 case 96:
5989 5994 return (B(eax) || SH_C0(eax) || CG(eax));
5990 5995 case 97:
5991 5996 case 98:
5992 5997 return (SH_C0(eax) || CG(eax));
5993 5998 case 99:
5994 5999 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5995 6000 case 100:
5996 6001 return (B(eax) || SH_C0(eax));
5997 6002 case 101:
5998 6003 case 103:
5999 6004 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6000 6005 case 104:
6001 6006 return (SH_C0(eax) || CG(eax) || D0(eax));
6002 6007 case 105:
6003 6008 case 106:
6004 6009 case 107:
6005 6010 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6006 6011 case 108:
6007 6012 return (DH_CG(eax));
6008 6013 case 109:
6009 6014 return (SH_C0(eax) || CG(eax) || D0(eax));
6010 6015 case 110:
6011 6016 return (D0(eax) || EX(eax));
6012 6017 case 111:
6013 6018 return (CG(eax));
6014 6019 case 112:
6015 6020 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6016 6021 case 113:
6017 6022 return (eax == 0x20fc0);
6018 6023 case 114:
6019 6024 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6020 6025 case 115:
6021 6026 return (SH_E0(eax) || JH_E1(eax));
6022 6027 case 116:
6023 6028 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6024 6029 case 117:
6025 6030 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6026 6031 case 118:
6027 6032 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6028 6033 JH_E6(eax));
6029 6034 case 121:
6030 6035 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6031 6036 case 122:
6032 6037 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6033 6038 case 123:
6034 6039 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6035 6040 case 131:
6036 6041 return (cpi->cpi_family < 0x10);
6037 6042 case 6336786:
6038 6043
6039 6044 /*
6040 6045 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6041 6046 * if this is a K8 family or newer processor. We're testing for
6042 6047 * this 'erratum' to determine whether or not we have a constant
6043 6048 * TSC.
6044 6049 *
6045 6050 * Our current fix for this is to disable the C1-Clock ramping.
6046 6051 * However, this doesn't work on newer processor families nor
6047 6052 * does it work when virtualized as those devices don't exist.
6048 6053 */
6049 6054 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6050 6055 return (0);
6051 6056 }
6052 6057
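		/*
		 * Leaf 0x80000007 %edx bit 8 (0x100) is AMD's TscInvariant
		 * bit; a clear bit means the TSC is not P-state invariant,
		 * i.e. the erratum applies.
		 */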
6053 6058 if (CPI_FAMILY(cpi) == 0xf) {
6054 6059 struct cpuid_regs regs;
6055 6060 regs.cp_eax = 0x80000007;
6056 6061 			(void) __cpuid_insn(&regs);
6057 6062 return (!(regs.cp_edx & 0x100));
6058 6063 }
6059 6064 return (0);
6060 6065 case 6323525:
6061 6066 /*
6062 6067 * This erratum (K8 #147) is not present on family 10 and newer.
6063 6068 */
6064 6069 if (cpi->cpi_family >= 0x10) {
6065 6070 return (0);
6066 6071 }
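		/*
		 * The expression below rebuilds a family/model ordering value
		 * from leaf 1 %eax: the (base + extended) family lands in
		 * bits [15:8] and the extended and base model in bits [7:0],
		 * with the stepping dropped. Revisions sorting below 0xf40
		 * (family 0xf, model 4) are reported as affected.
		 */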
6067 6072 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6068 6073 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6069 6074
6070 6075 case 6671130:
6071 6076 /*
6072 6077 * check for processors (pre-Shanghai) that do not provide
6073 6078 		 * Check for processors (pre-Shanghai) that do not provide
6074 6079 		 * optimal management of 1GB PTEs in their TLB.
6075 6080 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6076 6081
6077 6082 case 298:
6078 6083 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6079 6084 DR_B2(eax) || RB_C0(eax));
6080 6085
6081 6086 case 721:
6082 6087 #if defined(__amd64)
6083 6088 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6084 6089 #else
6085 6090 return (0);
6086 6091 #endif
6087 6092
6088 6093 default:
6089 6094 return (-1);
6090 6095
6091 6096 }
6092 6097 }
6093 6098
6094 6099 /*
6095 6100 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6096 6101 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6097 6102 */
6098 6103 int
6099 6104 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6100 6105 {
6101 6106 struct cpuid_info *cpi;
6102 6107 uint_t osvwid;
6103 6108 static int osvwfeature = -1;
6104 6109 uint64_t osvwlength;
6105 6110
6106 6111
6107 6112 cpi = cpu->cpu_m.mcpu_cpi;
6108 6113
6109 6114 /* confirm OSVW supported */
6110 6115 if (osvwfeature == -1) {
6111 6116 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6112 6117 } else {
6113 6118 /* assert that osvw feature setting is consistent on all cpus */
6114 6119 ASSERT(osvwfeature ==
6115 6120 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6116 6121 }
6117 6122 if (!osvwfeature)
6118 6123 return (-1);
6119 6124
6120 6125 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6121 6126
6122 6127 switch (erratum) {
6123 6128 case 298: /* osvwid is 0 */
6124 6129 osvwid = 0;
6125 6130 if (osvwlength <= (uint64_t)osvwid) {
6126 6131 /* osvwid 0 is unknown */
6127 6132 return (-1);
6128 6133 }
6129 6134
6130 6135 /*
6131 6136 * Check the OSVW STATUS MSR to determine the state
6132 6137 * of the erratum where:
6133 6138 * 0 - fixed by HW
6134 6139 * 1 - BIOS has applied the workaround when BIOS
6135 6140 * workaround is available. (Or for other errata,
6136 6141 * OS workaround is required.)
6137 6142 * For a value of 1, caller will confirm that the
6138 6143 * erratum 298 workaround has indeed been applied by BIOS.
6139 6144 *
6140 6145 * A 1 may be set in cpus that have a HW fix
6141 6146 * in a mixed cpu system. Regarding erratum 298:
6142 6147 * In a multiprocessor platform, the workaround above
6143 6148 * should be applied to all processors regardless of
6144 6149 * silicon revision when an affected processor is
6145 6150 * present.
6146 6151 */
6147 6152
6148 6153 return (rdmsr(MSR_AMD_OSVW_STATUS +
6149 6154 (osvwid / OSVW_ID_CNT_PER_MSR)) &
6150 6155 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6151 6156
6152 6157 default:
6153 6158 return (-1);
6154 6159 }
6155 6160 }
6156 6161
6157 6162 static const char assoc_str[] = "associativity";
6158 6163 static const char line_str[] = "line-size";
6159 6164 static const char size_str[] = "size";
6160 6165
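/*
 * Compose a property name of the form "<label>-<type>" (for example, label
 * "l2-cache" with type "size" yields "l2-cache-size") and attach it as an
 * integer property of the given devinfo node.
 */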
6161 6166 static void
6162 6167 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6163 6168 uint32_t val)
6164 6169 {
6165 6170 char buf[128];
6166 6171
6167 6172 /*
6168 6173 * ndi_prop_update_int() is used because it is desirable for
6169 6174 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6170 6175 */
6171 6176 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6172 6177 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6173 6178 }
6174 6179
6175 6180 /*
6176 6181 * Intel-style cache/tlb description
6177 6182 *
6178 6183 * Standard cpuid level 2 gives a randomly ordered
6179 6184 * selection of tags that index into a table that describes
6180 6185 * cache and tlb properties.
6181 6186 */
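/*
 * For example, a descriptor byte of 0x2c returned by leaf 2 matches the
 * table entry { 0x2c, 8, 64, 32*1024, l1_dcache_str } below, i.e. a 32KB,
 * 8-way set associative L1 data cache with 64-byte lines.
 */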
6182 6187
6183 6188 static const char l1_icache_str[] = "l1-icache";
6184 6189 static const char l1_dcache_str[] = "l1-dcache";
6185 6190 static const char l2_cache_str[] = "l2-cache";
6186 6191 static const char l3_cache_str[] = "l3-cache";
6187 6192 static const char itlb4k_str[] = "itlb-4K";
6188 6193 static const char dtlb4k_str[] = "dtlb-4K";
6189 6194 static const char itlb2M_str[] = "itlb-2M";
6190 6195 static const char itlb4M_str[] = "itlb-4M";
6191 6196 static const char dtlb4M_str[] = "dtlb-4M";
6192 6197 static const char dtlb24_str[] = "dtlb0-2M-4M";
6193 6198 static const char itlb424_str[] = "itlb-4K-2M-4M";
6194 6199 static const char itlb24_str[] = "itlb-2M-4M";
6195 6200 static const char dtlb44_str[] = "dtlb-4K-4M";
6196 6201 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6197 6202 static const char sl2_cache_str[] = "sectored-l2-cache";
6198 6203 static const char itrace_str[] = "itrace-cache";
6199 6204 static const char sl3_cache_str[] = "sectored-l3-cache";
6200 6205 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6201 6206
6202 6207 static const struct cachetab {
6203 6208 uint8_t ct_code;
6204 6209 uint8_t ct_assoc;
6205 6210 uint16_t ct_line_size;
6206 6211 size_t ct_size;
6207 6212 const char *ct_label;
6208 6213 } intel_ctab[] = {
6209 6214 /*
6210 6215 * maintain descending order!
6211 6216 *
6212 6217 * Codes ignored - Reason
6213 6218 * ----------------------
6214 6219 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6215 6220 * f0H/f1H - Currently we do not interpret prefetch size by design
6216 6221 */
6217 6222 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6218 6223 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6219 6224 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6220 6225 { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6221 6226 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6222 6227 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6223 6228 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6224 6229 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6225 6230 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6226 6231 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6227 6232 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6228 6233 { 0xd0, 4, 64, 512*1024, l3_cache_str},
6229 6234 { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6230 6235 { 0xc0, 4, 0, 8, dtlb44_str },
6231 6236 { 0xba, 4, 0, 64, dtlb4k_str },
6232 6237 { 0xb4, 4, 0, 256, dtlb4k_str },
6233 6238 { 0xb3, 4, 0, 128, dtlb4k_str },
6234 6239 { 0xb2, 4, 0, 64, itlb4k_str },
6235 6240 { 0xb0, 4, 0, 128, itlb4k_str },
6236 6241 { 0x87, 8, 64, 1024*1024, l2_cache_str},
6237 6242 { 0x86, 4, 64, 512*1024, l2_cache_str},
6238 6243 { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6239 6244 { 0x84, 8, 32, 1024*1024, l2_cache_str},
6240 6245 { 0x83, 8, 32, 512*1024, l2_cache_str},
6241 6246 { 0x82, 8, 32, 256*1024, l2_cache_str},
6242 6247 { 0x80, 8, 64, 512*1024, l2_cache_str},
6243 6248 { 0x7f, 2, 64, 512*1024, l2_cache_str},
6244 6249 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6245 6250 { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6246 6251 { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6247 6252 { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6248 6253 { 0x79, 8, 64, 128*1024, sl2_cache_str},
6249 6254 { 0x78, 8, 64, 1024*1024, l2_cache_str},
6250 6255 { 0x73, 8, 0, 64*1024, itrace_str},
6251 6256 { 0x72, 8, 0, 32*1024, itrace_str},
6252 6257 { 0x71, 8, 0, 16*1024, itrace_str},
6253 6258 { 0x70, 8, 0, 12*1024, itrace_str},
6254 6259 { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6255 6260 { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6256 6261 { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6257 6262 { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6258 6263 { 0x5d, 0, 0, 256, dtlb44_str},
6259 6264 { 0x5c, 0, 0, 128, dtlb44_str},
6260 6265 { 0x5b, 0, 0, 64, dtlb44_str},
6261 6266 { 0x5a, 4, 0, 32, dtlb24_str},
6262 6267 { 0x59, 0, 0, 16, dtlb4k_str},
6263 6268 { 0x57, 4, 0, 16, dtlb4k_str},
6264 6269 { 0x56, 4, 0, 16, dtlb4M_str},
6265 6270 { 0x55, 0, 0, 7, itlb24_str},
6266 6271 { 0x52, 0, 0, 256, itlb424_str},
6267 6272 { 0x51, 0, 0, 128, itlb424_str},
6268 6273 { 0x50, 0, 0, 64, itlb424_str},
6269 6274 { 0x4f, 0, 0, 32, itlb4k_str},
6270 6275 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6271 6276 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6272 6277 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6273 6278 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6274 6279 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6275 6280 { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6276 6281 { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6277 6282 { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6278 6283 { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6279 6284 { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6280 6285 { 0x44, 4, 32, 1024*1024, l2_cache_str},
6281 6286 { 0x43, 4, 32, 512*1024, l2_cache_str},
6282 6287 { 0x42, 4, 32, 256*1024, l2_cache_str},
6283 6288 { 0x41, 4, 32, 128*1024, l2_cache_str},
6284 6289 { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6285 6290 { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6286 6291 { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6287 6292 { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6288 6293 { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6289 6294 { 0x39, 4, 64, 128*1024, sl2_cache_str},
6290 6295 { 0x30, 8, 64, 32*1024, l1_icache_str},
6291 6296 { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6292 6297 { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6293 6298 { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6294 6299 { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6295 6300 { 0x22, 4, 64, 512*1024, sl3_cache_str},
6296 6301 { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6297 6302 { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6298 6303 { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6299 6304 { 0x0b, 4, 0, 4, itlb4M_str},
6300 6305 { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6301 6306 { 0x08, 4, 32, 16*1024, l1_icache_str},
6302 6307 { 0x06, 4, 32, 8*1024, l1_icache_str},
6303 6308 { 0x05, 4, 0, 32, dtlb4M_str},
6304 6309 { 0x04, 4, 0, 8, dtlb4M_str},
6305 6310 { 0x03, 4, 0, 64, dtlb4k_str},
6306 6311 { 0x02, 4, 0, 2, itlb4M_str},
6307 6312 { 0x01, 4, 0, 32, itlb4k_str},
6308 6313 { 0 }
6309 6314 };
6310 6315
6311 6316 static const struct cachetab cyrix_ctab[] = {
6312 6317 { 0x70, 4, 0, 32, "tlb-4K" },
6313 6318 { 0x80, 4, 16, 16*1024, "l1-cache" },
6314 6319 { 0 }
6315 6320 };
6316 6321
6317 6322 /*
6318 6323 * Search a cache table for a matching entry
6319 6324 */
6320 6325 static const struct cachetab *
6321 6326 find_cacheent(const struct cachetab *ct, uint_t code)
6322 6327 {
6323 6328 if (code != 0) {
6324 6329 for (; ct->ct_code != 0; ct++)
6325 6330 if (ct->ct_code <= code)
6326 6331 break;
6327 6332 if (ct->ct_code == code)
6328 6333 return (ct);
6329 6334 }
6330 6335 return (NULL);
6331 6336 }
6332 6337
6333 6338 /*
6334 6339 * Populate cachetab entry with L2 or L3 cache-information using
6335 6340 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6336 6341 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6337 6342 * information is found.
6338 6343 */
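/*
 * Note on the arithmetic: for a leaf 4 subleaf the cache capacity is
 * (ways + 1) * (partitions + 1) * (line size + 1) * (sets + 1), with %ecx
 * holding the number of sets minus one; that product is exactly what is
 * stored in ct_size below.
 */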
6339 6344 static int
6340 6345 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6341 6346 {
6342 6347 uint32_t level, i;
6343 6348 int ret = 0;
6344 6349
6345 6350 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6346 6351 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6347 6352
6348 6353 if (level == 2 || level == 3) {
6349 6354 ct->ct_assoc =
6350 6355 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6351 6356 ct->ct_line_size =
6352 6357 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6353 6358 ct->ct_size = ct->ct_assoc *
6354 6359 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6355 6360 ct->ct_line_size *
6356 6361 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6357 6362
6358 6363 if (level == 2) {
6359 6364 ct->ct_label = l2_cache_str;
6360 6365 } else if (level == 3) {
6361 6366 ct->ct_label = l3_cache_str;
6362 6367 }
6363 6368 ret = 1;
6364 6369 }
6365 6370 }
6366 6371
6367 6372 return (ret);
6368 6373 }
6369 6374
6370 6375 /*
6371 6376 	 * Walk the cacheinfo descriptor, applying 'func' to every valid element.
6372 6377 * The walk is terminated if the walker returns non-zero.
6373 6378 */
6374 6379 static void
6375 6380 intel_walk_cacheinfo(struct cpuid_info *cpi,
6376 6381 void *arg, int (*func)(void *, const struct cachetab *))
6377 6382 {
6378 6383 const struct cachetab *ct;
6379 6384 struct cachetab des_49_ct, des_b1_ct;
6380 6385 uint8_t *dp;
6381 6386 int i;
6382 6387
6383 6388 if ((dp = cpi->cpi_cacheinfo) == NULL)
6384 6389 return;
6385 6390 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6386 6391 /*
6387 6392 * For overloaded descriptor 0x49 we use cpuid function 4
6388 6393 * if supported by the current processor, to create
6389 6394 * cache information.
6390 6395 * For overloaded descriptor 0xb1 we use X86_PAE flag
6391 6396 * to disambiguate the cache information.
6392 6397 */
6393 6398 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6394 6399 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6395 6400 ct = &des_49_ct;
6396 6401 } else if (*dp == 0xb1) {
6397 6402 des_b1_ct.ct_code = 0xb1;
6398 6403 des_b1_ct.ct_assoc = 4;
6399 6404 des_b1_ct.ct_line_size = 0;
6400 6405 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6401 6406 des_b1_ct.ct_size = 8;
6402 6407 des_b1_ct.ct_label = itlb2M_str;
6403 6408 } else {
6404 6409 des_b1_ct.ct_size = 4;
6405 6410 des_b1_ct.ct_label = itlb4M_str;
6406 6411 }
6407 6412 ct = &des_b1_ct;
6408 6413 } else {
6409 6414 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6410 6415 continue;
6411 6416 }
6412 6417 }
6413 6418
6414 6419 if (func(arg, ct) != 0) {
6415 6420 break;
6416 6421 }
6417 6422 }
6418 6423 }
6419 6424
6420 6425 /*
6421 6426 * (Like the Intel one, except for Cyrix CPUs)
6422 6427 */
6423 6428 static void
6424 6429 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6425 6430 void *arg, int (*func)(void *, const struct cachetab *))
6426 6431 {
6427 6432 const struct cachetab *ct;
6428 6433 uint8_t *dp;
6429 6434 int i;
6430 6435
6431 6436 if ((dp = cpi->cpi_cacheinfo) == NULL)
6432 6437 return;
6433 6438 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6434 6439 /*
6435 6440 * Search Cyrix-specific descriptor table first ..
6436 6441 */
6437 6442 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6438 6443 if (func(arg, ct) != 0)
6439 6444 break;
6440 6445 continue;
6441 6446 }
6442 6447 /*
6443 6448 * .. else fall back to the Intel one
6444 6449 */
6445 6450 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6446 6451 if (func(arg, ct) != 0)
6447 6452 break;
6448 6453 continue;
6449 6454 }
6450 6455 }
6451 6456 }
6452 6457
6453 6458 /*
6454 6459 * A cacheinfo walker that adds associativity, line-size, and size properties
6455 6460 * to the devinfo node it is passed as an argument.
6456 6461 */
6457 6462 static int
6458 6463 add_cacheent_props(void *arg, const struct cachetab *ct)
6459 6464 {
6460 6465 dev_info_t *devi = arg;
6461 6466
6462 6467 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6463 6468 if (ct->ct_line_size != 0)
6464 6469 add_cache_prop(devi, ct->ct_label, line_str,
6465 6470 ct->ct_line_size);
6466 6471 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6467 6472 return (0);
6468 6473 }
6469 6474
6470 6475
6471 6476 static const char fully_assoc[] = "fully-associative?";
6472 6477
6473 6478 /*
6474 6479 * AMD style cache/tlb description
6475 6480 *
6476 6481 * Extended functions 5 and 6 directly describe properties of
6477 6482 * tlbs and various cache levels.
6478 6483 */
6479 6484 static void
6480 6485 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6481 6486 {
6482 6487 switch (assoc) {
6483 6488 case 0: /* reserved; ignore */
6484 6489 break;
6485 6490 default:
6486 6491 add_cache_prop(devi, label, assoc_str, assoc);
6487 6492 break;
6488 6493 case 0xff:
6489 6494 add_cache_prop(devi, label, fully_assoc, 1);
6490 6495 break;
6491 6496 }
6492 6497 }
6493 6498
6494 6499 static void
6495 6500 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6496 6501 {
6497 6502 if (size == 0)
6498 6503 return;
6499 6504 add_cache_prop(devi, label, size_str, size);
6500 6505 add_amd_assoc(devi, label, assoc);
6501 6506 }
6502 6507
6503 6508 static void
6504 6509 add_amd_cache(dev_info_t *devi, const char *label,
6505 6510 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6506 6511 {
6507 6512 if (size == 0 || line_size == 0)
6508 6513 return;
6509 6514 add_amd_assoc(devi, label, assoc);
6510 6515 /*
6511 6516 * Most AMD parts have a sectored cache. Multiple cache lines are
6512 6517 * associated with each tag. A sector consists of all cache lines
6513 6518 * associated with a tag. For example, the AMD K6-III has a sector
6514 6519 * size of 2 cache lines per tag.
6515 6520 */
6516 6521 if (lines_per_tag != 0)
6517 6522 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6518 6523 add_cache_prop(devi, label, line_str, line_size);
6519 6524 add_cache_prop(devi, label, size_str, size * 1024);
6520 6525 }
6521 6526
6522 6527 static void
6523 6528 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6524 6529 {
6525 6530 switch (assoc) {
6526 6531 case 0: /* off */
6527 6532 break;
6528 6533 case 1:
6529 6534 case 2:
6530 6535 case 4:
6531 6536 add_cache_prop(devi, label, assoc_str, assoc);
6532 6537 break;
6533 6538 case 6:
6534 6539 add_cache_prop(devi, label, assoc_str, 8);
6535 6540 break;
6536 6541 case 8:
6537 6542 add_cache_prop(devi, label, assoc_str, 16);
6538 6543 break;
6539 6544 case 0xf:
6540 6545 add_cache_prop(devi, label, fully_assoc, 1);
6541 6546 break;
6542 6547 default: /* reserved; ignore */
6543 6548 break;
6544 6549 }
6545 6550 }
6546 6551
6547 6552 static void
6548 6553 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6549 6554 {
6550 6555 if (size == 0 || assoc == 0)
6551 6556 return;
6552 6557 add_amd_l2_assoc(devi, label, assoc);
6553 6558 add_cache_prop(devi, label, size_str, size);
6554 6559 }
6555 6560
6556 6561 static void
6557 6562 add_amd_l2_cache(dev_info_t *devi, const char *label,
6558 6563 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6559 6564 {
6560 6565 if (size == 0 || assoc == 0 || line_size == 0)
6561 6566 return;
6562 6567 add_amd_l2_assoc(devi, label, assoc);
6563 6568 if (lines_per_tag != 0)
6564 6569 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6565 6570 add_cache_prop(devi, label, line_str, line_size);
6566 6571 add_cache_prop(devi, label, size_str, size * 1024);
6567 6572 }
6568 6573
6569 6574 static void
6570 6575 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6571 6576 {
6572 6577 struct cpuid_regs *cp;
6573 6578
6574 6579 if (cpi->cpi_xmaxeax < 0x80000005)
6575 6580 return;
6576 6581 cp = &cpi->cpi_extd[5];
6577 6582
6578 6583 /*
6579 6584 * 4M/2M L1 TLB configuration
6580 6585 *
6581 6586 * We report the size for 2M pages because AMD uses two
6582 6587 * TLB entries for one 4M page.
6583 6588 */
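	/*
	 * In extended leaf 0x80000005 %eax, bits [31:24] and [23:16] hold the
	 * 2M/4M d-TLB associativity and entry count, and bits [15:8] and
	 * [7:0] the corresponding i-TLB values; those are the fields
	 * extracted below.
	 */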
6584 6589 add_amd_tlb(devi, "dtlb-2M",
6585 6590 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6586 6591 add_amd_tlb(devi, "itlb-2M",
6587 6592 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6588 6593
6589 6594 /*
6590 6595 * 4K L1 TLB configuration
6591 6596 */
6592 6597
6593 6598 switch (cpi->cpi_vendor) {
6594 6599 uint_t nentries;
6595 6600 case X86_VENDOR_TM:
6596 6601 if (cpi->cpi_family >= 5) {
6597 6602 /*
6598 6603 * Crusoe processors have 256 TLB entries, but
6599 6604 * cpuid data format constrains them to only
6600 6605 * reporting 255 of them.
6601 6606 */
6602 6607 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6603 6608 nentries = 256;
6604 6609 /*
6605 6610 * Crusoe processors also have a unified TLB
6606 6611 */
6607 6612 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6608 6613 nentries);
6609 6614 break;
6610 6615 }
6611 6616 /*FALLTHROUGH*/
6612 6617 default:
6613 6618 add_amd_tlb(devi, itlb4k_str,
6614 6619 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6615 6620 add_amd_tlb(devi, dtlb4k_str,
6616 6621 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6617 6622 break;
6618 6623 }
6619 6624
6620 6625 /*
6621 6626 * data L1 cache configuration
6622 6627 */
6623 6628
6624 6629 add_amd_cache(devi, l1_dcache_str,
6625 6630 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6626 6631 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6627 6632
6628 6633 /*
6629 6634 * code L1 cache configuration
6630 6635 */
6631 6636
6632 6637 add_amd_cache(devi, l1_icache_str,
6633 6638 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6634 6639 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6635 6640
6636 6641 if (cpi->cpi_xmaxeax < 0x80000006)
6637 6642 return;
6638 6643 cp = &cpi->cpi_extd[6];
6639 6644
6640 6645 /* Check for a unified L2 TLB for large pages */
6641 6646
6642 6647 if (BITX(cp->cp_eax, 31, 16) == 0)
6643 6648 add_amd_l2_tlb(devi, "l2-tlb-2M",
6644 6649 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6645 6650 else {
6646 6651 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6647 6652 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6648 6653 add_amd_l2_tlb(devi, "l2-itlb-2M",
6649 6654 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6650 6655 }
6651 6656
6652 6657 /* Check for a unified L2 TLB for 4K pages */
6653 6658
6654 6659 if (BITX(cp->cp_ebx, 31, 16) == 0) {
6655 6660 add_amd_l2_tlb(devi, "l2-tlb-4K",
6656 6661 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6657 6662 } else {
6658 6663 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6659 6664 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6660 6665 add_amd_l2_tlb(devi, "l2-itlb-4K",
6661 6666 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6662 6667 }
6663 6668
6664 6669 add_amd_l2_cache(devi, l2_cache_str,
6665 6670 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6666 6671 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6667 6672 }
6668 6673
6669 6674 /*
6670 6675  * There are two basic ways that the x86 world describes its cache
6671 6676 * and tlb architecture - Intel's way and AMD's way.
6672 6677 *
6673 6678 * Return which flavor of cache architecture we should use
6674 6679 */
6675 6680 static int
6676 6681 x86_which_cacheinfo(struct cpuid_info *cpi)
6677 6682 {
6678 6683 switch (cpi->cpi_vendor) {
6679 6684 case X86_VENDOR_Intel:
6680 6685 if (cpi->cpi_maxeax >= 2)
6681 6686 return (X86_VENDOR_Intel);
6682 6687 break;
6683 6688 case X86_VENDOR_AMD:
6684 6689 /*
6685 6690 * The K5 model 1 was the first part from AMD that reported
6686 6691 * cache sizes via extended cpuid functions.
6687 6692 */
6688 6693 if (cpi->cpi_family > 5 ||
6689 6694 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6690 6695 return (X86_VENDOR_AMD);
6691 6696 break;
6692 6697 case X86_VENDOR_TM:
6693 6698 if (cpi->cpi_family >= 5)
6694 6699 return (X86_VENDOR_AMD);
6695 6700 /*FALLTHROUGH*/
6696 6701 default:
6697 6702 /*
6698 6703 * If they have extended CPU data for 0x80000005
6699 6704 * then we assume they have AMD-format cache
6700 6705 * information.
6701 6706 *
6702 6707 * If not, and the vendor happens to be Cyrix,
6703 6708 	 * then try our Cyrix-specific handler.
6704 6709 *
6705 6710 * If we're not Cyrix, then assume we're using Intel's
6706 6711 * table-driven format instead.
6707 6712 */
6708 6713 if (cpi->cpi_xmaxeax >= 0x80000005)
6709 6714 return (X86_VENDOR_AMD);
6710 6715 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6711 6716 return (X86_VENDOR_Cyrix);
6712 6717 else if (cpi->cpi_maxeax >= 2)
6713 6718 return (X86_VENDOR_Intel);
6714 6719 break;
6715 6720 }
6716 6721 return (-1);
6717 6722 }
6718 6723
6719 6724 void
6720 6725 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6721 6726 struct cpuid_info *cpi)
6722 6727 {
6723 6728 dev_info_t *cpu_devi;
6724 6729 int create;
6725 6730
6726 6731 cpu_devi = (dev_info_t *)dip;
6727 6732
6728 6733 /* device_type */
6729 6734 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6730 6735 "device_type", "cpu");
6731 6736
6732 6737 /* reg */
6733 6738 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6734 6739 "reg", cpu_id);
6735 6740
6736 6741 /* cpu-mhz, and clock-frequency */
6737 6742 if (cpu_freq > 0) {
6738 6743 long long mul;
6739 6744
6740 6745 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6741 6746 "cpu-mhz", cpu_freq);
6742 6747 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6743 6748 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6744 6749 "clock-frequency", (int)mul);
6745 6750 }
6746 6751
6747 6752 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6748 6753 return;
6749 6754 }
6750 6755
6751 6756 /* vendor-id */
6752 6757 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6753 6758 "vendor-id", cpi->cpi_vendorstr);
6754 6759
6755 6760 if (cpi->cpi_maxeax == 0) {
6756 6761 return;
6757 6762 }
6758 6763
6759 6764 /*
6760 6765 * family, model, and step
6761 6766 */
6762 6767 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6763 6768 "family", CPI_FAMILY(cpi));
6764 6769 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6765 6770 "cpu-model", CPI_MODEL(cpi));
6766 6771 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6767 6772 "stepping-id", CPI_STEP(cpi));
6768 6773
6769 6774 /* type */
6770 6775 switch (cpi->cpi_vendor) {
6771 6776 case X86_VENDOR_Intel:
6772 6777 create = 1;
6773 6778 break;
6774 6779 default:
6775 6780 create = 0;
6776 6781 break;
6777 6782 }
6778 6783 if (create)
6779 6784 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6780 6785 "type", CPI_TYPE(cpi));
6781 6786
6782 6787 /* ext-family */
6783 6788 switch (cpi->cpi_vendor) {
6784 6789 case X86_VENDOR_Intel:
6785 6790 case X86_VENDOR_AMD:
6786 6791 create = cpi->cpi_family >= 0xf;
6787 6792 break;
6788 6793 default:
6789 6794 create = 0;
6790 6795 break;
6791 6796 }
6792 6797 if (create)
6793 6798 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6794 6799 "ext-family", CPI_FAMILY_XTD(cpi));
6795 6800
6796 6801 /* ext-model */
6797 6802 switch (cpi->cpi_vendor) {
6798 6803 case X86_VENDOR_Intel:
6799 6804 create = IS_EXTENDED_MODEL_INTEL(cpi);
6800 6805 break;
6801 6806 case X86_VENDOR_AMD:
6802 6807 create = CPI_FAMILY(cpi) == 0xf;
6803 6808 break;
6804 6809 default:
6805 6810 create = 0;
6806 6811 break;
6807 6812 }
6808 6813 if (create)
6809 6814 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6810 6815 "ext-model", CPI_MODEL_XTD(cpi));
6811 6816
6812 6817 /* generation */
6813 6818 switch (cpi->cpi_vendor) {
6814 6819 case X86_VENDOR_AMD:
6815 6820 /*
6816 6821 * AMD K5 model 1 was the first part to support this
6817 6822 */
6818 6823 create = cpi->cpi_xmaxeax >= 0x80000001;
6819 6824 break;
6820 6825 default:
6821 6826 create = 0;
6822 6827 break;
6823 6828 }
6824 6829 if (create)
6825 6830 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6826 6831 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6827 6832
6828 6833 /* brand-id */
6829 6834 switch (cpi->cpi_vendor) {
6830 6835 case X86_VENDOR_Intel:
6831 6836 /*
6832 6837 * brand id first appeared on Pentium III Xeon model 8,
6833 6838 * and Celeron model 8 processors and Opteron
6834 6839 */
6835 6840 create = cpi->cpi_family > 6 ||
6836 6841 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6837 6842 break;
6838 6843 case X86_VENDOR_AMD:
6839 6844 create = cpi->cpi_family >= 0xf;
6840 6845 break;
6841 6846 default:
6842 6847 create = 0;
6843 6848 break;
6844 6849 }
6845 6850 if (create && cpi->cpi_brandid != 0) {
6846 6851 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6847 6852 "brand-id", cpi->cpi_brandid);
6848 6853 }
6849 6854
6850 6855 /* chunks, and apic-id */
6851 6856 switch (cpi->cpi_vendor) {
6852 6857 /*
6853 6858 * first available on Pentium IV and Opteron (K8)
6854 6859 */
6855 6860 case X86_VENDOR_Intel:
6856 6861 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6857 6862 break;
6858 6863 case X86_VENDOR_AMD:
6859 6864 create = cpi->cpi_family >= 0xf;
6860 6865 break;
6861 6866 default:
6862 6867 create = 0;
6863 6868 break;
6864 6869 }
6865 6870 if (create) {
6866 6871 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6867 6872 "chunks", CPI_CHUNKS(cpi));
6868 6873 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6869 6874 "apic-id", cpi->cpi_apicid);
6870 6875 if (cpi->cpi_chipid >= 0) {
6871 6876 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6872 6877 "chip#", cpi->cpi_chipid);
6873 6878 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6874 6879 "clog#", cpi->cpi_clogid);
6875 6880 }
6876 6881 }
6877 6882
6878 6883 /* cpuid-features */
6879 6884 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6880 6885 "cpuid-features", CPI_FEATURES_EDX(cpi));
6881 6886
6882 6887
6883 6888 /* cpuid-features-ecx */
6884 6889 switch (cpi->cpi_vendor) {
6885 6890 case X86_VENDOR_Intel:
6886 6891 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6887 6892 break;
6888 6893 case X86_VENDOR_AMD:
6889 6894 create = cpi->cpi_family >= 0xf;
6890 6895 break;
6891 6896 default:
6892 6897 create = 0;
6893 6898 break;
6894 6899 }
6895 6900 if (create)
6896 6901 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6897 6902 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6898 6903
6899 6904 /* ext-cpuid-features */
6900 6905 switch (cpi->cpi_vendor) {
6901 6906 case X86_VENDOR_Intel:
6902 6907 case X86_VENDOR_AMD:
6903 6908 case X86_VENDOR_Cyrix:
6904 6909 case X86_VENDOR_TM:
6905 6910 case X86_VENDOR_Centaur:
6906 6911 create = cpi->cpi_xmaxeax >= 0x80000001;
6907 6912 break;
6908 6913 default:
6909 6914 create = 0;
6910 6915 break;
6911 6916 }
6912 6917 if (create) {
6913 6918 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6914 6919 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6915 6920 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6916 6921 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6917 6922 }
6918 6923
6919 6924 /*
6920 6925 * Brand String first appeared in Intel Pentium IV, AMD K5
6921 6926 * model 1, and Cyrix GXm. On earlier models we try and
6922 6927 	 * model 1, and Cyrix GXm. On earlier models we try to
6923 6928 	 * simulate something similar, so this string should always
6924 6929 	 * say -something- about the processor, however lame.
6925 6930 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6926 6931 "brand-string", cpi->cpi_brandstr);
6927 6932
6928 6933 /*
6929 6934 * Finally, cache and tlb information
6930 6935 */
6931 6936 switch (x86_which_cacheinfo(cpi)) {
6932 6937 case X86_VENDOR_Intel:
6933 6938 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6934 6939 break;
6935 6940 case X86_VENDOR_Cyrix:
6936 6941 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6937 6942 break;
6938 6943 case X86_VENDOR_AMD:
6939 6944 amd_cache_info(cpi, cpu_devi);
6940 6945 break;
6941 6946 default:
6942 6947 break;
6943 6948 }
6944 6949 }
6945 6950
6946 6951 struct l2info {
6947 6952 int *l2i_csz;
6948 6953 int *l2i_lsz;
6949 6954 int *l2i_assoc;
6950 6955 int l2i_ret;
6951 6956 };
6952 6957
6953 6958 /*
6954 6959 * A cacheinfo walker that fetches the size, line-size and associativity
6955 6960 * of the L2 cache
6956 6961 */
6957 6962 static int
6958 6963 intel_l2cinfo(void *arg, const struct cachetab *ct)
6959 6964 {
6960 6965 struct l2info *l2i = arg;
6961 6966 int *ip;
6962 6967
6963 6968 if (ct->ct_label != l2_cache_str &&
6964 6969 ct->ct_label != sl2_cache_str)
6965 6970 return (0); /* not an L2 -- keep walking */
6966 6971
6967 6972 if ((ip = l2i->l2i_csz) != NULL)
6968 6973 *ip = ct->ct_size;
6969 6974 if ((ip = l2i->l2i_lsz) != NULL)
6970 6975 *ip = ct->ct_line_size;
6971 6976 if ((ip = l2i->l2i_assoc) != NULL)
6972 6977 *ip = ct->ct_assoc;
6973 6978 l2i->l2i_ret = ct->ct_size;
6974 6979 return (1); /* was an L2 -- terminate walk */
6975 6980 }
6976 6981
6977 6982 /*
6978 6983 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6979 6984 *
6980 6985 * Unlike the associativity for the L1 cache and tlb where the 8 bit
6981 6986 * value is the associativity, the associativity for the L2 cache and
6982 6987 * tlb is encoded in the following table. The 4 bit L2 value serves as
6983 6988 * an index into the amd_afd[] array to determine the associativity.
6984 6989 * -1 is undefined. 0 is fully associative.
6985 6990 */
6986 6991
6987 6992 static int amd_afd[] =
6988 6993 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
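/*
 * For example, an encoded associativity field of 6 indexes amd_afd[6] and so
 * reports an 8-way cache or TLB, while 0xf maps to 0, i.e. fully associative.
 */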
6989 6994
6990 6995 static void
6991 6996 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6992 6997 {
6993 6998 struct cpuid_regs *cp;
6994 6999 uint_t size, assoc;
6995 7000 int i;
6996 7001 int *ip;
6997 7002
6998 7003 if (cpi->cpi_xmaxeax < 0x80000006)
6999 7004 return;
7000 7005 cp = &cpi->cpi_extd[6];
7001 7006
7002 7007 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7003 7008 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7004 7009 uint_t cachesz = size * 1024;
7005 7010 assoc = amd_afd[i];
7006 7011
7007 7012 ASSERT(assoc != -1);
7008 7013
7009 7014 if ((ip = l2i->l2i_csz) != NULL)
7010 7015 *ip = cachesz;
7011 7016 if ((ip = l2i->l2i_lsz) != NULL)
7012 7017 *ip = BITX(cp->cp_ecx, 7, 0);
7013 7018 if ((ip = l2i->l2i_assoc) != NULL)
7014 7019 *ip = assoc;
7015 7020 l2i->l2i_ret = cachesz;
7016 7021 }
7017 7022 }
7018 7023
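/*
 * Illustrative use only (hypothetical caller): any of the out-parameters may
 * be passed as NULL when the caller does not need that value, e.g.
 *
 *	int l2_line_size;
 *	(void) getl2cacheinfo(CPU, NULL, &l2_line_size, NULL);
 */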
7019 7024 int
7020 7025 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7021 7026 {
7022 7027 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7023 7028 struct l2info __l2info, *l2i = &__l2info;
7024 7029
7025 7030 l2i->l2i_csz = csz;
7026 7031 l2i->l2i_lsz = lsz;
7027 7032 l2i->l2i_assoc = assoc;
7028 7033 l2i->l2i_ret = -1;
7029 7034
7030 7035 switch (x86_which_cacheinfo(cpi)) {
7031 7036 case X86_VENDOR_Intel:
7032 7037 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7033 7038 break;
7034 7039 case X86_VENDOR_Cyrix:
7035 7040 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7036 7041 break;
7037 7042 case X86_VENDOR_AMD:
7038 7043 amd_l2cacheinfo(cpi, l2i);
7039 7044 break;
7040 7045 default:
7041 7046 break;
7042 7047 }
7043 7048 return (l2i->l2i_ret);
7044 7049 }
7045 7050
7046 7051 #if !defined(__xpv)
7047 7052
7048 7053 uint32_t *
7049 7054 cpuid_mwait_alloc(cpu_t *cpu)
7050 7055 {
7051 7056 uint32_t *ret;
7052 7057 size_t mwait_size;
7053 7058
7054 7059 ASSERT(cpuid_checkpass(CPU, 2));
7055 7060
7056 7061 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7057 7062 if (mwait_size == 0)
7058 7063 return (NULL);
7059 7064
7060 7065 /*
7061 7066 * kmem_alloc() returns cache line size aligned data for mwait_size
7062 7067 	 * allocations. mwait_size is currently cache line sized. Neither
7063 7068 	 * of these implementation details is guaranteed to be true in the
7064 7069 	 * future.
7065 7070 *
7066 7071 * First try allocating mwait_size as kmem_alloc() currently returns
7067 7072 * correctly aligned memory. If kmem_alloc() does not return
7068 7073 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7069 7074 *
7070 7075 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7071 7076 * decide to free this memory.
7072 7077 */
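	/*
	 * As an example of the logic below (assuming, say, an mwait_size of
	 * 64): a buffer whose address is already a multiple of 64 is used
	 * as-is; otherwise a 2 * mwait_size buffer is allocated so that
	 * P2ROUNDUP() is guaranteed to find an aligned mwait_size region
	 * inside it.
	 */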
7073 7078 ret = kmem_zalloc(mwait_size, KM_SLEEP);
7074 7079 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7075 7080 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7076 7081 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7077 7082 *ret = MWAIT_RUNNING;
7078 7083 return (ret);
7079 7084 } else {
7080 7085 kmem_free(ret, mwait_size);
7081 7086 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7082 7087 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7083 7088 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7084 7089 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7085 7090 *ret = MWAIT_RUNNING;
7086 7091 return (ret);
7087 7092 }
7088 7093 }
7089 7094
7090 7095 void
7091 7096 cpuid_mwait_free(cpu_t *cpu)
7092 7097 {
7093 7098 if (cpu->cpu_m.mcpu_cpi == NULL) {
7094 7099 return;
7095 7100 }
7096 7101
7097 7102 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7098 7103 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7099 7104 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7100 7105 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7101 7106 }
7102 7107
7103 7108 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7104 7109 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7105 7110 }
7106 7111
7107 7112 void
7108 7113 patch_tsc_read(int flag)
7109 7114 {
7110 7115 size_t cnt;
7111 7116
7112 7117 switch (flag) {
7113 7118 case TSC_NONE:
7114 7119 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7115 7120 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7116 7121 break;
7117 7122 case TSC_RDTSC_MFENCE:
7118 7123 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
7119 7124 (void) memcpy((void *)tsc_read,
7120 7125 (void *)&_tsc_mfence_start, cnt);
7121 7126 break;
7122 7127 case TSC_RDTSC_LFENCE:
7123 7128 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7124 7129 (void) memcpy((void *)tsc_read,
7125 7130 (void *)&_tsc_lfence_start, cnt);
7126 7131 break;
7127 7132 case TSC_TSCP:
7128 7133 cnt = &_tscp_end - &_tscp_start;
7129 7134 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7130 7135 break;
7131 7136 default:
7132 7137 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7133 7138 		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7134 7139 break;
7135 7140 }
7136 7141 tsc_type = flag;
7137 7142 }
7138 7143
7139 7144 int
7140 7145 cpuid_deep_cstates_supported(void)
7141 7146 {
7142 7147 struct cpuid_info *cpi;
7143 7148 struct cpuid_regs regs;
7144 7149
7145 7150 ASSERT(cpuid_checkpass(CPU, 1));
7146 7151
7147 7152 cpi = CPU->cpu_m.mcpu_cpi;
7148 7153
7149 7154 if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
7150 7155 return (0);
7151 7156
7152 7157 switch (cpi->cpi_vendor) {
7153 7158 case X86_VENDOR_Intel:
7154 7159 if (cpi->cpi_xmaxeax < 0x80000007)
7155 7160 return (0);
7156 7161
7157 7162 /*
7158 7163 		 * Does the TSC run at a constant rate in all ACPI C-states?
7159 7164 */
7160 7165 regs.cp_eax = 0x80000007;
7161 7166 		(void) __cpuid_insn(&regs);
7162 7167 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7163 7168
7164 7169 default:
7165 7170 return (0);
7166 7171 }
7167 7172 }
7168 7173
7169 7174 #endif /* !__xpv */
7170 7175
7171 7176 void
7172 7177 post_startup_cpu_fixups(void)
7173 7178 {
7174 7179 #ifndef __xpv
7175 7180 /*
7176 7181 * Some AMD processors support C1E state. Entering this state will
7177 7182 * cause the local APIC timer to stop, which we can't deal with at
7178 7183 * this time.
7179 7184 */
7180 7185 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7181 7186 on_trap_data_t otd;
7182 7187 uint64_t reg;
7183 7188
7184 7189 if (!on_trap(&otd, OT_DATA_ACCESS)) {
7185 7190 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7186 7191 /* Disable C1E state if it is enabled by BIOS */
7187 7192 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7188 7193 AMD_ACTONCMPHALT_MASK) {
7189 7194 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7190 7195 AMD_ACTONCMPHALT_SHIFT);
7191 7196 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7192 7197 }
7193 7198 }
7194 7199 no_trap();
7195 7200 }
7196 7201 #endif /* !__xpv */
7197 7202 }
7198 7203
7199 7204 void
7200 7205 enable_pcid(void)
7201 7206 {
7202 7207 if (x86_use_pcid == -1)
7203 7208 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7204 7209
7205 7210 if (x86_use_invpcid == -1) {
7206 7211 x86_use_invpcid = is_x86_feature(x86_featureset,
7207 7212 X86FSET_INVPCID);
7208 7213 }
7209 7214
7210 7215 if (!x86_use_pcid)
7211 7216 return;
7212 7217
7213 7218 /*
7214 7219 	 * Intel says that on setting PCIDE, it immediately starts using the PCID
7215 7220 * bits; better make sure there's nothing there.
7216 7221 */
7217 7222 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7218 7223
7219 7224 setcr4(getcr4() | CR4_PCIDE);
7220 7225 }
7221 7226
7222 7227 /*
7223 7228 * Setup necessary registers to enable XSAVE feature on this processor.
7224 7229 * This function needs to be called early enough, so that no xsave/xrstor
7225 7230 * ops will execute on the processor before the MSRs are properly set up.
7226 7231 *
7227 7232 * Current implementation has the following assumption:
7228 7233 * - cpuid_pass1() is done, so that X86 features are known.
7229 7234 * - fpu_probe() is done, so that fp_save_mech is chosen.
7230 7235 */
7231 7236 void
7232 7237 xsave_setup_msr(cpu_t *cpu)
7233 7238 {
7234 7239 ASSERT(fp_save_mech == FP_XSAVE);
7235 7240 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7236 7241
7237 7242 /* Enable OSXSAVE in CR4. */
7238 7243 setcr4(getcr4() | CR4_OSXSAVE);
7239 7244 /*
7240 7245 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7241 7246 * correct value.
7242 7247 */
7243 7248 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7244 7249 setup_xfem();
7245 7250 }
7246 7251
7247 7252 /*
7248 7253 * Starting with the Westmere processor the local
7249 7254 * APIC timer will continue running in all C-states,
7250 7255 * including the deepest C-states.
7251 7256 */
7252 7257 int
7253 7258 cpuid_arat_supported(void)
7254 7259 {
7255 7260 struct cpuid_info *cpi;
7256 7261 struct cpuid_regs regs;
7257 7262
7258 7263 ASSERT(cpuid_checkpass(CPU, 1));
7259 7264 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7260 7265
7261 7266 cpi = CPU->cpu_m.mcpu_cpi;
7262 7267
7263 7268 switch (cpi->cpi_vendor) {
7264 7269 case X86_VENDOR_Intel:
7265 7270 /*
7266 7271 * Always-running Local APIC Timer is
7267 7272 * indicated by CPUID.6.EAX[2].
7268 7273 */
7269 7274 if (cpi->cpi_maxeax >= 6) {
7270 7275 regs.cp_eax = 6;
7271 7276 			(void) cpuid_insn(NULL, &regs);
7272 7277 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7273 7278 } else {
7274 7279 return (0);
7275 7280 }
7276 7281 default:
7277 7282 return (0);
7278 7283 }
7279 7284 }
7280 7285
7281 7286 /*
7282 7287 * Check support for Intel ENERGY_PERF_BIAS feature
7283 7288 */
7284 7289 int
7285 7290 cpuid_iepb_supported(struct cpu *cp)
7286 7291 {
7287 7292 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7288 7293 struct cpuid_regs regs;
7289 7294
7290 7295 ASSERT(cpuid_checkpass(cp, 1));
7291 7296
7292 7297 if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7293 7298 !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7294 7299 return (0);
7295 7300 }
7296 7301
7297 7302 /*
7298 7303 * Intel ENERGY_PERF_BIAS MSR is indicated by
7299 7304 * capability bit CPUID.6.ECX.3
7300 7305 */
7301 7306 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7302 7307 return (0);
7303 7308
7304 7309 regs.cp_eax = 0x6;
7305 7310 	(void) cpuid_insn(NULL, &regs);
7306 7311 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7307 7312 }
7308 7313
7309 7314 /*
7310 7315 * Check support for TSC deadline timer
7311 7316 *
7312 7317 * TSC deadline timer provides a superior software programming
7313 7318 * model over local APIC timer that eliminates "time drifts".
7314 7319 * Instead of specifying a relative time, software specifies an
7315 7320 * absolute time as the target at which the processor should
7316 7321 * generate a timer event.
7317 7322 */
7318 7323 int
7319 7324 cpuid_deadline_tsc_supported(void)
7320 7325 {
7321 7326 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7322 7327 struct cpuid_regs regs;
7323 7328
7324 7329 ASSERT(cpuid_checkpass(CPU, 1));
7325 7330 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7326 7331
7327 7332 switch (cpi->cpi_vendor) {
7328 7333 case X86_VENDOR_Intel:
7329 7334 if (cpi->cpi_maxeax >= 1) {
7330 7335 regs.cp_eax = 1;
7331 7336 			(void) cpuid_insn(NULL, &regs);
7332 7337 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7333 7338 } else {
7334 7339 return (0);
7335 7340 }
7336 7341 default:
7337 7342 return (0);
7338 7343 }
7339 7344 }
7340 7345
7341 7346 #if defined(__amd64) && !defined(__xpv)
7342 7347 /*
7343 7348 * Patch in versions of bcopy for high performance Intel Nhm processors
7344 7349 * and later...
7345 7350 */
7346 7351 void
7347 7352 patch_memops(uint_t vendor)
7348 7353 {
7349 7354 size_t cnt, i;
7350 7355 caddr_t to, from;
7351 7356
7352 7357 if ((vendor == X86_VENDOR_Intel) &&
7353 7358 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7354 7359 cnt = &bcopy_patch_end - &bcopy_patch_start;
7355 7360 to = &bcopy_ck_size;
7356 7361 from = &bcopy_patch_start;
7357 7362 for (i = 0; i < cnt; i++) {
7358 7363 *to++ = *from++;
7359 7364 }
7360 7365 }
7361 7366 }
7362 7367 #endif /* __amd64 && !__xpv */
7363 7368
7364 7369 /*
7365 7370 * We're being asked to tell the system how many bits are required to represent
7366 7371 * the various thread and strand IDs. While it's tempting to derive this based
7367 7372 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7368 7373 * correct. Instead, this needs to be based on the number of bits that the APIC
7369 7374 * allows for these different configurations. We only update these to a larger
7370 7375 * value if we find one.
7371 7376 */
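/*
 * For example, a CPU reporting cpi_ncore_bits of 3 and cpi_nthread_bits of 1
 * can represent up to 8 cores per package and 2 strands per core in its APIC
 * ID; the callers' counters are only ever widened, never narrowed.
 */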
7372 7377 void
7373 7378 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7374 7379 {
7375 7380 struct cpuid_info *cpi;
7376 7381
7377 7382 VERIFY(cpuid_checkpass(CPU, 1));
7378 7383 cpi = cpu->cpu_m.mcpu_cpi;
7379 7384
7380 7385 if (cpi->cpi_ncore_bits > *core_nbits) {
7381 7386 *core_nbits = cpi->cpi_ncore_bits;
7382 7387 }
7383 7388
7384 7389 if (cpi->cpi_nthread_bits > *strand_nbits) {
7385 7390 *strand_nbits = cpi->cpi_nthread_bits;
7386 7391 }
7387 7392 }
7388 7393
7389 7394 void
7390 7395 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7391 7396 {
7392 7397 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7393 7398 struct cpuid_regs cp;
7394 7399
7395 7400 /*
7396 7401 * Reread the CPUID portions that we need for various security
7397 7402 * information.
7398 7403 */
7399 7404 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7400 7405 /*
7401 7406 * Check if we now have leaf 7 available to us.
7402 7407 */
7403 7408 if (cpi->cpi_maxeax < 7) {
7404 7409 bzero(&cp, sizeof (cp));
7405 7410 cp.cp_eax = 0;
7406 7411 cpi->cpi_maxeax = __cpuid_insn(&cp);
7407 7412 if (cpi->cpi_maxeax < 7)
7408 7413 return;
7409 7414 }
7410 7415
7411 7416 bzero(&cp, sizeof (cp));
7412 7417 cp.cp_eax = 7;
7413 7418 cp.cp_ecx = 0;
7414 7419 (void) __cpuid_insn(&cp);
7415 7420 cpi->cpi_std[7] = cp;
7416 7421 } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7417 7422 /* No xcpuid support */
7418 7423 if (cpi->cpi_family < 5 ||
7419 7424 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7420 7425 return;
7421 7426
7422 7427 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7423 7428 bzero(&cp, sizeof (cp));
7424 7429 cp.cp_eax = CPUID_LEAF_EXT_0;
7425 7430 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7426 7431 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7427 7432 return;
7428 7433 }
7429 7434 }
7430 7435
7431 7436 bzero(&cp, sizeof (cp));
7432 7437 cp.cp_eax = CPUID_LEAF_EXT_8;
7433 7438 (void) __cpuid_insn(&cp);
7434 7439 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7435 7440 cpi->cpi_extd[8] = cp;
7436 7441 } else {
7437 7442 /*
7438 7443 * Nothing to do here. Return an empty set which has already
7439 7444 * been zeroed for us.
7440 7445 */
7441 7446 return;
7442 7447 }
7443 7448 cpuid_scan_security(cpu, fset);
7444 7449 }
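
/*
 * Illustrative sketch (not part of the original file): the re-read pattern
 * used above, shown in isolation. A basic leaf is only consulted after
 * confirming that it lies within the range advertised by leaf 0, and the
 * sub-leaf index goes in %ecx. EXAMPLE_LEAF is a stand-in constant, not
 * part of the real code.
 */
#define	EXAMPLE_LEAF	7

static uint32_t
example_read_leaf_edx(struct cpuid_info *cpi)
{
	struct cpuid_regs cp;

	if (cpi->cpi_maxeax < EXAMPLE_LEAF)
		return (0);

	bzero(&cp, sizeof (cp));
	cp.cp_eax = EXAMPLE_LEAF;
	cp.cp_ecx = 0;			/* sub-leaf 0 */
	(void) __cpuid_insn(&cp);
	return (cp.cp_edx);
}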
7445 7450
7446 7451 /* ARGSUSED */
7447 7452 static int
7448 7453 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7449 7454 {
7450 7455 uchar_t *fset;
7451 7456 boolean_t first_pass = (boolean_t)arg1;
7452 7457
7453 7458 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7454 7459 if (first_pass && CPU->cpu_id != 0)
7455 7460 return (0);
7456 7461 if (!first_pass && CPU->cpu_id == 0)
7457 7462 return (0);
7458 7463 cpuid_pass_ucode(CPU, fset);
7459 7464
7460 7465 return (0);
7461 7466 }
7462 7467
7463 7468 /*
7464 7469  * After a microcode update where the version has changed, we need to rescan
7465 7470  * CPUID. To do this we check every CPU to make sure that it has the same
7466 7471  * microcode revision. Then we perform a cross call to all such CPUs. It's the
7467 7472 * caller's job to make sure that no one else can end up doing an update while
7468 7473 * this is going on.
7469 7474 *
7470 7475 * We assume that the system is microcode capable if we're called.
7471 7476 */
7472 7477 void
7473 7478 cpuid_post_ucodeadm(void)
7474 7479 {
7475 7480 uint32_t rev;
7476 7481 int i;
7477 7482 struct cpu *cpu;
7478 7483 cpuset_t cpuset;
7479 7484 void *argdata;
7480 7485 uchar_t *f0;
7481 7486
7482 7487 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7483 7488
7484 7489 mutex_enter(&cpu_lock);
7485 7490 cpu = cpu_get(0);
7486 7491 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7487 7492 CPUSET_ONLY(cpuset, 0);
7488 7493 for (i = 1; i < max_ncpus; i++) {
7489 7494 if ((cpu = cpu_get(i)) == NULL)
7490 7495 continue;
7491 7496
7492 7497 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7493 7498 panic("post microcode update CPU %d has differing "
7494 7499 "microcode revision (%u) from CPU 0 (%u)",
7495 7500 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7496 7501 }
7497 7502 CPUSET_ADD(cpuset, i);
7498 7503 }
7499 7504
7500 7505 /*
7501 7506 * We do the cross calls in two passes. The first pass is only for the
7502 7507 * boot CPU. The second pass is for all of the other CPUs. This allows
7503 7508  * the boot CPU to go through and change behavior related to patching, or to
7504 7509  * whether or not Enhanced IBRS needs to be enabled, and then allows all of
7505 7510  * the other CPUs to follow suit.
7506 7511 */
7507 7512 kpreempt_disable();
7508 7513 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7509 7514 cpuid_post_ucodeadm_xc);
7510 7515 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7511 7516 cpuid_post_ucodeadm_xc);
7512 7517 kpreempt_enable();
7513 7518
7514 7519 /*
7515 7520 * OK, now look at each CPU and see if their feature sets are equal.
7516 7521 */
7517 7522 f0 = argdata;
7518 7523 for (i = 1; i < max_ncpus; i++) {
7519 7524 uchar_t *fset;
7520 7525 if (!CPU_IN_SET(cpuset, i))
7521 7526 continue;
7522 7527
7523 7528 fset = (uchar_t *)((uintptr_t)argdata +
7524 7529 sizeof (x86_featureset) * i);
7525 7530
7526 7531 if (!compare_x86_featureset(f0, fset)) {
7527 7532 panic("Post microcode update CPU %d has "
7528 7533 			    "differing security feature set (%p) from CPU 0 "
7529 7534 "(%p), not appending to feature set", i,
7530 7535 (void *)fset, (void *)f0);
7531 7536 }
7532 7537 }
7533 7538
7534 7539 mutex_exit(&cpu_lock);
7535 7540
7536 7541 for (i = 0; i < NUM_X86_FEATURES; i++) {
7537 7542 		if (is_x86_feature(f0, i)) {
7538 7543 			cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7539 7544 			    x86_feature_names[i]);
7540 7545 			add_x86_feature(x86_featureset, i);
7541 7546 		}
7542 7547 }
7543 7548 kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7544 7549 }
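
/*
 * Illustrative sketch (not part of the original file): the layout of the
 * argdata buffer that the cross calls above fill in. Each CPU writes its
 * feature set into its own slot, indexed by CPU id, so the handler needs
 * no locking and the results can be compared against CPU 0 afterwards.
 * example_featureset_slot() is hypothetical.
 */
static uchar_t *
example_featureset_slot(void *argdata, processorid_t id)
{
	return ((uchar_t *)((uintptr_t)argdata +
	    sizeof (x86_featureset) * id));
}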
↓ open down ↓ |
4482 lines elided |
↑ open up ↑ |