1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net> 26 * Copyright 2020 Joyent, Inc. 27 */ 28 /* 29 * Copyright (c) 2010, Intel Corporation. 30 * All rights reserved. 31 */ 32 /* 33 * Portions Copyright 2009 Advanced Micro Devices, Inc. 34 */ 35 36 /* 37 * CPU Identification logic 38 * 39 * The purpose of this file and its companion, cpuid_subr.c, is to help deal 40 * with the identification of CPUs, their features, and their topologies. More 41 * specifically, this file helps drive the following: 42 * 43 * 1. Enumeration of features of the processor which are used by the kernel to 44 * determine what features to enable or disable. These may be instruction set 45 * enhancements or features that we use. 46 * 47 * 2. Enumeration of instruction set architecture (ISA) additions that userland 48 * will be told about through the auxiliary vector. 49 * 50 * 3. Understanding the physical topology of the CPU such as the number of 51 * caches, how many cores it has, whether or not it supports symmetric 52 * multi-processing (SMT), etc. 53 * 54 * ------------------------ 55 * CPUID History and Basics 56 * ------------------------ 57 * 58 * The cpuid instruction was added by Intel roughly around the time that the 59 * original Pentium was introduced. The purpose of cpuid was to tell in a 60 * programmatic fashion information about the CPU that previously was guessed 61 * at. For example, an important part of cpuid is that we can know what 62 * extensions to the ISA exist. If you use an invalid opcode you would get a 63 * #UD, so this method allows a program (whether a user program or the kernel) 64 * to determine what exists without crashing or getting a SIGILL. Of course, 65 * this was also during the era of the clones and the AMD Am5x86. The vendor 66 * name shows up first in cpuid for a reason. 67 * 68 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts 69 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has 70 * its own meaning. The different leaves are broken down into different regions: 71 * 72 * [ 0, 7fffffff ] This region is called the 'basic' 73 * region. This region is generally defined 74 * by Intel, though some of the original 75 * portions have different meanings based 76 * on the manufacturer. These days, Intel 77 * adds most new features to this region. 78 * AMD adds non-Intel compatible 79 * information in the third, extended 80 * region. 
Intel uses this for everything 81 * including ISA extensions, CPU 82 * features, cache information, topology, 83 * and more. 84 * 85 * There is a hole carved out of this 86 * region which is reserved for 87 * hypervisors. 88 * 89 * [ 40000000, 4fffffff ] This region, which is found in the 90 * middle of the previous region, is 91 * explicitly promised to never be used by 92 * CPUs. Instead, it is used by hypervisors 93 * to communicate information about 94 * themselves to the operating system. The 95 * values and details are unique for each 96 * hypervisor. 97 * 98 * [ 80000000, ffffffff ] This region is called the 'extended' 99 * region. Some of the low leaves mirror 100 * parts of the basic leaves. This region 101 * has generally been used by AMD for 102 * various extensions. For example, AMD- 103 * specific information about caches, 104 * features, and topology are found in this 105 * region. 106 * 107 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx, 108 * and %edx, and then issue the cpuid instruction. At the first leaf in each of 109 * the ranges, one of the primary things returned is the maximum valid leaf in 110 * that range. This allows for discovery of what range of CPUID is valid. 111 * 112 * The CPUs have potentially surprising behavior when using an invalid leaf or 113 * unimplemented leaf. If the requested leaf is within the valid basic or 114 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be 115 * set to zero. However, if you specify a leaf that is outside of a valid range, 116 * then instead it will be filled with the last valid _basic_ leaf. For example, 117 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or 118 * an invalid extended leaf will return the information for leaf 3. 119 * 120 * Some leaves are broken down into sub-leaves. This means that the value 121 * depends on both the leaf asked for in %eax and a secondary register. For 122 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get 123 * additional information. Or when getting topology information in leaf 0xb, the 124 * initial value in %ecx changes which level of the topology that you are 125 * getting information about. 126 * 127 * cpuid values are always kept to 32 bits regardless of whether or not the 128 * program is in 64-bit mode. When executing in 64-bit mode, the upper 129 * 32 bits of the register are always set to zero so that way the values are the 130 * same regardless of execution mode. 131 * 132 * ---------------------- 133 * Identifying Processors 134 * ---------------------- 135 * 136 * We can identify a processor in two steps. The first step looks at cpuid leaf 137 * 0. Leaf 0 contains the processor's vendor information. This is done by 138 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is 139 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'. 140 * 141 * From there, a processor is identified by a combination of three different 142 * values: 143 * 144 * 1. Family 145 * 2. Model 146 * 3. Stepping 147 * 148 * Each vendor uses the family and model to uniquely identify a processor. The 149 * way that family and model are changed depends on the vendor. For example, 150 * Intel has been using family 0x6 for almost all of their processor since the 151 * Pentium Pro/Pentium II era, often called the P6. The model is used to 152 * identify the exact processor. Different models are often used for the client 153 * (consumer) and server parts. 
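 *
 * As a brief aside, here is a minimal, hedged sketch of the mechanics
 * described above, using GCC-style inline assembly from C. This is purely
 * illustrative and is not how the kernel's own cpuid plumbing is written;
 * the helper name cpuid_leaf0 is made up for this example. It queries leaf 0
 * for the maximum basic leaf and the vendor string:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	cpuid_leaf0(uint32_t *max_basic, char vendor[13])
 *	{
 *		uint32_t eax, ebx, ecx, edx;
 *
 *		__asm__ __volatile__("cpuid"
 *		    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 *		    : "a" (0), "c" (0));
 *		*max_basic = eax;
 *		(void) memcpy(vendor + 0, &ebx, sizeof (ebx));
 *		(void) memcpy(vendor + 4, &edx, sizeof (edx));
 *		(void) memcpy(vendor + 8, &ecx, sizeof (ecx));
 *		vendor[12] = '\0';
 *	}
 *
 * Note that the 12 byte vendor string is assembled from %ebx, %edx, and then
 * %ecx, in that order, yielding e.g. 'GenuineIntel' or 'AuthenticAMD'.
 *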
 * Even though different models within the same family often have major
 * architectural differences, Intel still considers them the same family.
 *
 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within each
 * family, the model number is used to help identify specific processors.
 *
 * The stepping is used to refer to a revision of a specific microprocessor. The
 * term comes from equipment used to produce masks that are used to create
 * integrated circuits.
 *
 * The information is present in leaf 1, %eax. In technical documentation you
 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. If the base family field is
 * 0xf, then one must also consult the extended family and extended model
 * fields, which occupy previously reserved bits: the extended family is added
 * to the base value of 0xf, and the extended model is shifted left by four bits
 * and added to the base model, allowing for a much larger number of families
 * and models. (Intel additionally uses the extended model when the base family
 * is 0x6.)
 *
 * When we process this information, we store the full family, model, and
 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
 * cpi_step, respectively. Whenever you are performing comparisons with the
 * family, model, and stepping, you should use these members and not the raw
 * values from cpuid. If you must use the raw values from cpuid directly, you
 * must make sure that you add the extended model and family to the base model
 * and family.
 *
 * In general, we do not use information about the family, model, and stepping
 * to determine whether or not a feature is present; that is generally driven by
 * specific leaves. However, when something we care about on the processor is
 * not considered 'architectural', meaning that it is specific to a set of
 * processors and not promised in the architecture model to be consistent from
 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata
 * in the processor, when we are dealing with processor-specific features such
 * as CPU performance counters, or when we want to provide additional
 * information for things such as fault management.
 *
 * While processors also have a brand string, which is the name that people are
 * familiar with when buying the processor, it is not meant for programmatic
 * consumption. That is what the family, model, and stepping are for.
 *
 * ------------
 * CPUID Passes
 * ------------
 *
 * As part of performing feature detection, we break this into several different
 * passes. The passes are as follows:
 *
 * Pass 0     This is a primordial pass done in locore.s to deal with
 *            Cyrix CPUs that don't support cpuid. The reality is that
 *            we likely don't run on them any more, but there is still
 *            logic for handling them.
 *
 * Pass 1     This is the primary pass and is responsible for doing a
 *            large number of different things:
 *
 *            1. Determining which vendor manufactured the CPU and the
 *            family, model, and stepping information.
 *
 *            2. Gathering a large number of feature flags to determine
 *            which features the CPU supports and which require
 *            additional work in the OS to enable. Features detected
 *            this way are added to the x86_featureset, which can be
 *            queried to determine what we should do.
This includes processing 220 * all of the basic and extended CPU features that we care 221 * about. 222 * 223 * 3. Determining the CPU's topology. This includes 224 * information about how many cores and threads are present 225 * in the package. It also is responsible for figuring out 226 * which logical CPUs are potentially part of the same core 227 * and what other resources they might share. For more 228 * information see the 'Topology' section. 229 * 230 * 4. Determining the set of CPU security-specific features 231 * that we need to worry about and determine the 232 * appropriate set of workarounds. 233 * 234 * Pass 1 on the boot CPU occurs before KMDB is started. 235 * 236 * Pass 2 The second pass is done after startup(). Here, we check 237 * other miscellaneous features. Most of this is gathering 238 * additional basic and extended features that we'll use in 239 * later passes or for debugging support. 240 * 241 * Pass 3 The third pass occurs after the kernel memory allocator 242 * has been fully initialized. This gathers information 243 * where we might need dynamic memory available for our 244 * uses. This includes several varying width leaves that 245 * have cache information and the processor's brand string. 246 * 247 * Pass 4 The fourth and final normal pass is performed after the 248 * kernel has brought most everything online. This is 249 * invoked from post_startup(). In this pass, we go through 250 * the set of features that we have enabled and turn that 251 * into the hardware auxiliary vector features that 252 * userland receives. This is used by userland, primarily 253 * by the run-time link-editor (RTLD), though userland 254 * software could also refer to it directly. 255 * 256 * Microcode After a microcode update, we do a selective rescan of 257 * the cpuid leaves to determine what features have 258 * changed. Microcode updates can provide more details 259 * about security related features to deal with issues like 260 * Spectre and L1TF. On occasion, vendors have violated 261 * their contract and removed bits. However, we don't try 262 * to detect that because that puts us in a situation that 263 * we really can't deal with. As such, the only thing we 264 * rescan are security related features today. See 265 * cpuid_pass_ucode(). 266 * 267 * All of the passes (except pass 0) are run on all CPUs. However, for the most 268 * part we only care about what the boot CPU says about this information and use 269 * the other CPUs as a rough guide to sanity check that we have the same feature 270 * set. 271 * 272 * We do not support running multiple logical CPUs with disjoint, let alone 273 * different, feature sets. 274 * 275 * ------------------ 276 * Processor Topology 277 * ------------------ 278 * 279 * One of the important things that we need to do is to understand the topology 280 * of the underlying processor. When we say topology in this case, we're trying 281 * to understand the relationship between the logical CPUs that the operating 282 * system sees and the underlying physical layout. Different logical CPUs may 283 * share different resources which can have important consequences for the 284 * performance of the system. For example, they may share caches, execution 285 * units, and more. 286 * 287 * The topology of the processor changes from generation to generation and 288 * vendor to vendor. Along with that, different vendors use different 289 * terminology, and the operating system itself uses occasionally overlapping 290 * terminology. 
 * It's important to understand what this topology looks like so one can
 * understand the different things that we try to calculate and determine.
 *
 * To get started, let's talk about a little bit of terminology that we've used
 * so far, is used throughout this file, and is fairly generic across multiple
 * vendors:
 *
 * CPU
 *    A central processing unit (CPU) refers to a logical and/or virtual
 *    entity that the operating system can execute instructions on. The
 *    underlying resources for this CPU may be shared between multiple
 *    entities; however, to the operating system it is a discrete unit.
 *
 * PROCESSOR and PACKAGE
 *
 *    Generally, when we use the term 'processor' on its own, we are referring
 *    to the physical entity that one buys and plugs into a board. However,
 *    because processor has been overloaded and one might see it used to mean
 *    multiple different levels, we will instead use the term 'package' for
 *    the rest of this file. The term package comes from the electrical
 *    engineering side and refers to the physical entity that encloses the
 *    electronics inside. Strictly speaking, the package can contain more than
 *    just the CPU; for example, on many processors it may also have what's
 *    called an 'integrated graphics processing unit (GPU)'. Because the
 *    package can encapsulate multiple units, it is the largest physical unit
 *    that we refer to.
 *
 * SOCKET
 *
 *    A socket refers to a unit on a system board (generally the motherboard)
 *    that can receive a package. A single package, or processor, is plugged
 *    into a single socket. A system may have multiple sockets. Oftentimes,
 *    the term socket is used interchangeably with package and refers to the
 *    electrical component that has been plugged in, and not the receptacle
 *    itself.
 *
 * CORE
 *
 *    A core refers to the physical instantiation of a CPU, generally, with a
 *    full set of hardware resources available to it. A package may contain
 *    multiple cores inside of it or it may just have a single one. A
 *    processor with more than one core is often referred to as 'multi-core'.
 *    In illumos, we will use the feature X86FSET_CMP to refer to a system
 *    that has 'multi-core' processors.
 *
 *    A core may expose a single logical CPU to the operating system, or it
 *    may expose multiple CPUs, which we call threads, defined below.
 *
 *    Some resources may still be shared by cores in the same package. For
 *    example, many processors will share the level 3 cache between cores.
 *    Some AMD generations share hardware resources between cores. For more
 *    information on that see the section 'AMD Topology'.
 *
 * THREAD and STRAND
 *
 *    In this file, a thread generally refers to a hardware resource and not
 *    the operating system's logical abstraction. A thread is always exposed
 *    as an independent logical CPU to the operating system. A thread belongs
 *    to a specific core. A core may have more than one thread. When that is
 *    the case, the threads that are part of the same core are often referred
 *    to as 'siblings'.
 *
 *    When multiple threads exist, this is generally referred to as
 *    simultaneous multi-threading (SMT). When Intel introduced this in their
 *    processors they called it hyper-threading (HT). When multiple threads
 *    are active in a core, they split the resources of the core.
For example, 356 * two threads may share the same set of hardware execution units. 357 * 358 * The operating system often uses the term 'strand' to refer to a thread. 359 * This helps disambiguate it from the software concept. 360 * 361 * CHIP 362 * 363 * Unfortunately, the term 'chip' is dramatically overloaded. At its most 364 * base meaning, it is used to refer to a single integrated circuit, which 365 * may or may not be the only thing in the package. In illumos, when you 366 * see the term 'chip' it is almost always referring to the same thing as 367 * the 'package'. However, many vendors may use chip to refer to one of 368 * many integrated circuits that have been placed in the package. As an 369 * example, see the subsequent definition. 370 * 371 * To try and keep things consistent, we will only use chip when referring 372 * to the entire integrated circuit package, with the exception of the 373 * definition of multi-chip module (because it is in the name) and use the 374 * term 'die' when we want the more general, potential sub-component 375 * definition. 376 * 377 * DIE 378 * 379 * A die refers to an integrated circuit. Inside of the package there may 380 * be a single die or multiple dies. This is sometimes called a 'chip' in 381 * vendor's parlance, but in this file, we use the term die to refer to a 382 * subcomponent. 383 * 384 * MULTI-CHIP MODULE 385 * 386 * A multi-chip module (MCM) refers to putting multiple distinct chips that 387 * are connected together in the same package. When a multi-chip design is 388 * used, generally each chip is manufactured independently and then joined 389 * together in the package. For example, on AMD's Zen microarchitecture 390 * (family 0x17), the package contains several dies (the second meaning of 391 * chip from above) that are connected together. 392 * 393 * CACHE 394 * 395 * A cache is a part of the processor that maintains copies of recently 396 * accessed memory. Caches are split into levels and then into types. 397 * Commonly there are one to three levels, called level one, two, and 398 * three. The lower the level, the smaller it is, the closer it is to the 399 * execution units of the CPU, and the faster it is to access. The layout 400 * and design of the cache come in many different flavors, consult other 401 * resources for a discussion of those. 402 * 403 * Caches are generally split into two types, the instruction and data 404 * cache. The caches contain what their names suggest, the instruction 405 * cache has executable program text, while the data cache has all other 406 * memory that the processor accesses. As of this writing, data is kept 407 * coherent between all of the caches on x86, so if one modifies program 408 * text before it is executed, that will be in the data cache, and the 409 * instruction cache will be synchronized with that change when the 410 * processor actually executes those instructions. This coherency also 411 * covers the fact that data could show up in multiple caches. 412 * 413 * Generally, the lowest level caches are specific to a core. However, the 414 * last layer cache is shared between some number of cores. The number of 415 * CPUs sharing this last level cache is important. This has implications 416 * for the choices that the scheduler makes, as accessing memory that might 417 * be in a remote cache after thread migration can be quite expensive. 418 * 419 * Sometimes, the word cache is abbreviated with a '$', because in US 420 * English the word cache is pronounced the same as cash. 
So L1D$ refers to 421 * the L1 data cache, and L2$ would be the L2 cache. This will not be used 422 * in the rest of this theory statement for clarity. 423 * 424 * MEMORY CONTROLLER 425 * 426 * The memory controller is a component that provides access to DRAM. Each 427 * memory controller can access a set number of DRAM channels. Each channel 428 * can have a number of DIMMs (sticks of memory) associated with it. A 429 * given package may have more than one memory controller. The association 430 * of the memory controller to a group of cores is important as it is 431 * cheaper to access memory on the controller that you are associated with. 432 * 433 * NUMA 434 * 435 * NUMA or non-uniform memory access, describes a way that systems are 436 * built. On x86, any processor core can address all of the memory in the 437 * system. However, When using multiple sockets or possibly within a 438 * multi-chip module, some of that memory is physically closer and some of 439 * it is further. Memory that is further away is more expensive to access. 440 * Consider the following image of multiple sockets with memory: 441 * 442 * +--------+ +--------+ 443 * | DIMM A | +----------+ +----------+ | DIMM D | 444 * +--------+-+ | | | | +-+------+-+ 445 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E | 446 * +--------+-+ | | | | +-+------+-+ 447 * | DIMM C | +----------+ +----------+ | DIMM F | 448 * +--------+ +--------+ 449 * 450 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is 451 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to 452 * access DIMMs A-C and more expensive to access D-F as it has to go 453 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs 454 * D-F are cheaper than A-C. While the socket form is the most common, when 455 * using multi-chip modules, this can also sometimes occur. For another 456 * example of this that's more involved, see the AMD topology section. 457 * 458 * 459 * Intel Topology 460 * -------------- 461 * 462 * Most Intel processors since Nehalem, (as of this writing the current gen 463 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of 464 * the package is a single monolithic die. MCMs currently aren't used. Most 465 * parts have three levels of caches, with the L3 cache being shared between 466 * all of the cores on the package. The L1/L2 cache is generally specific to 467 * an individual core. The following image shows at a simplified level what 468 * this looks like. The memory controller is commonly part of something called 469 * the 'Uncore', that used to be separate physical chips that were not a part of 470 * the package, but are now part of the same chip. 
471 * 472 * +-----------------------------------------------------------------------+ 473 * | Package | 474 * | +-------------------+ +-------------------+ +-------------------+ | 475 * | | Core | | Core | | Core | | 476 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | 477 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | | 478 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | | 479 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | | 480 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | | 481 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | 482 * | | +--------------+ | | +--------------+ | | +--------------+ | | 483 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | | 484 * | | +--------------+ | | +--------------+ | | +--------------+ | | 485 * | +-------------------+ +-------------------+ +-------------------+ | 486 * | +-------------------------------------------------------------------+ | 487 * | | Shared L3 Cache | | 488 * | +-------------------------------------------------------------------+ | 489 * | +-------------------------------------------------------------------+ | 490 * | | Memory Controller | | 491 * | +-------------------------------------------------------------------+ | 492 * +-----------------------------------------------------------------------+ 493 * 494 * A side effect of this current architecture is that what we care about from a 495 * scheduling and topology perspective, is simplified. In general we care about 496 * understanding which logical CPUs are part of the same core and socket. 497 * 498 * To determine the relationship between threads and cores, Intel initially used 499 * the identifier in the advanced programmable interrupt controller (APIC). They 500 * also added cpuid leaf 4 to give additional information about the number of 501 * threads and CPUs in the processor. With the addition of x2apic (which 502 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an 503 * additional cpuid topology leaf 0xB was added. 504 * 505 * AMD Topology 506 * ------------ 507 * 508 * When discussing AMD topology, we want to break this into three distinct 509 * generations of topology. There's the basic topology that has been used in 510 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced 511 * with family 0x15 (Bulldozer), and there's the topology that was introduced 512 * with family 0x17 (Zen). AMD also has some additional terminology that's worth 513 * talking about. 514 * 515 * Until the introduction of family 0x17 (Zen), AMD did not implement something 516 * that they considered SMT. Whether or not the AMD processors have SMT 517 * influences many things including scheduling and reliability, availability, 518 * and serviceability (RAS) features. 519 * 520 * NODE 521 * 522 * AMD uses the term node to refer to a die that contains a number of cores 523 * and I/O resources. Depending on the processor family and model, more 524 * than one node can be present in the package. When there is more than one 525 * node this indicates a multi-chip module. Usually each node has its own 526 * access to memory and I/O devices. This is important and generally 527 * different from the corresponding Intel Nehalem-Skylake+ processors. As a 528 * result, we track this relationship in the operating system. 
529 * 530 * In processors with an L3 cache, the L3 cache is generally shared across 531 * the entire node, though the way this is carved up varies from generation 532 * to generation. 533 * 534 * BULLDOZER 535 * 536 * Starting with the Bulldozer family (0x15) and continuing until the 537 * introduction of the Zen microarchitecture, AMD introduced the idea of a 538 * compute unit. In a compute unit, two traditional cores share a number of 539 * hardware resources. Critically, they share the FPU, L1 instruction 540 * cache, and the L2 cache. Several compute units were then combined inside 541 * of a single node. Because the integer execution units, L1 data cache, 542 * and some other resources were not shared between the cores, AMD never 543 * considered this to be SMT. 544 * 545 * ZEN 546 * 547 * The Zen family (0x17) uses a multi-chip module (MCM) design, the module 548 * is called Zeppelin. These modules are similar to the idea of nodes used 549 * previously. Each of these nodes has two DRAM channels which all of the 550 * cores in the node can access uniformly. These nodes are linked together 551 * in the package, creating a NUMA environment. 552 * 553 * The Zeppelin die itself contains two different 'core complexes'. Each 554 * core complex consists of four cores which each have two threads, for a 555 * total of 8 logical CPUs per complex. Unlike other generations, 556 * where all the logical CPUs in a given node share the L3 cache, here each 557 * core complex has its own shared L3 cache. 558 * 559 * A further thing that we need to consider is that in some configurations, 560 * particularly with the Threadripper line of processors, not every die 561 * actually has its memory controllers wired up to actual memory channels. 562 * This means that some cores have memory attached to them and others 563 * don't. 564 * 565 * To put Zen in perspective, consider the following images: 566 * 567 * +--------------------------------------------------------+ 568 * | Core Complex | 569 * | +-------------------+ +-------------------+ +---+ | 570 * | | Core +----+ | | Core +----+ | | | | 571 * | | +--------+ | L2 | | | +--------+ | L2 | | | | | 572 * | | | Thread | +----+ | | | Thread | +----+ | | | | 573 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | | 574 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | | 575 * | | +--------+ +--+ | | +--------+ +--+ | | | | 576 * | +-------------------+ +-------------------+ | C | | 577 * | +-------------------+ +-------------------+ | a | | 578 * | | Core +----+ | | Core +----+ | | c | | 579 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | | 580 * | | | Thread | +----+ | | | Thread | +----+ | | e | | 581 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | | 582 * | | | Thread | |L1| | | | Thread | |L1| | | | | 583 * | | +--------+ +--+ | | +--------+ +--+ | | | | 584 * | +-------------------+ +-------------------+ +---+ | 585 * | | 586 * +--------------------------------------------------------+ 587 * 588 * This first image represents a single Zen core complex that consists of four 589 * cores. 590 * 591 * 592 * +--------------------------------------------------------+ 593 * | Zeppelin Die | 594 * | +--------------------------------------------------+ | 595 * | | I/O Units (PCIe, SATA, USB, etc.) 
| | 596 * | +--------------------------------------------------+ | 597 * | HH | 598 * | +-----------+ HH +-----------+ | 599 * | | | HH | | | 600 * | | Core |==========| Core | | 601 * | | Complex |==========| Complex | | 602 * | | | HH | | | 603 * | +-----------+ HH +-----------+ | 604 * | HH | 605 * | +--------------------------------------------------+ | 606 * | | Memory Controller | | 607 * | +--------------------------------------------------+ | 608 * | | 609 * +--------------------------------------------------------+ 610 * 611 * This image represents a single Zeppelin Die. Note how both cores are 612 * connected to the same memory controller and I/O units. While each core 613 * complex has its own L3 cache as seen in the first image, they both have 614 * uniform access to memory. 615 * 616 * 617 * PP PP 618 * PP PP 619 * +----------PP---------------------PP---------+ 620 * | PP PP | 621 * | +-----------+ +-----------+ | 622 * | | | | | | 623 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM 624 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM 625 * | | | | | | 626 * | +-----------+ooo ...+-----------+ | 627 * | HH ooo ... HH | 628 * | HH oo.. HH | 629 * | HH ..oo HH | 630 * | HH ... ooo HH | 631 * | +-----------+... ooo+-----------+ | 632 * | | | | | | 633 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM 634 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM 635 * | | | | | | 636 * | +-----------+ +-----------+ | 637 * | PP PP | 638 * +----------PP---------------------PP---------+ 639 * PP PP 640 * PP PP 641 * 642 * This image represents a single Zen package. In this example, it has four 643 * Zeppelin dies, though some configurations only have a single one. In this 644 * example, each die is directly connected to the next. Also, each die is 645 * represented as being connected to memory by the 'M' character and connected 646 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin 647 * die is made up of two core complexes, we have multiple different NUMA 648 * domains that we care about for these systems. 649 * 650 * CPUID LEAVES 651 * 652 * There are a few different CPUID leaves that we can use to try and understand 653 * the actual state of the world. As part of the introduction of family 0xf, AMD 654 * added CPUID leaf 0x80000008. This leaf tells us the number of logical 655 * processors that are in the system. Because families before Zen didn't have 656 * SMT, this was always the number of cores that were in the system. However, it 657 * should always be thought of as the number of logical threads to be consistent 658 * between generations. In addition we also get the size of the APIC ID that is 659 * used to represent the number of logical processors. This is important for 660 * deriving topology information. 661 * 662 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a 663 * bit between Bulldozer and later families, but it is quite useful in 664 * determining the topology information. Because this information has changed 665 * across family generations, it's worth calling out what these mean 666 * explicitly. The registers have the following meanings: 667 * 668 * %eax The APIC ID. The entire register is defined to have a 32-bit 669 * APIC ID, even though on systems without x2apic support, it will 670 * be limited to 8 bits. 671 * 672 * %ebx On Bulldozer-era systems this contains information about the 673 * number of cores that are in a compute unit (cores that share 674 * resources). 
 *           It also contains a per-package compute unit ID that identifies
 *           which compute unit the logical CPU is a part of.
 *
 *           On Zen-era systems this instead contains the number of threads
 *           per core and the ID of the core that the logical CPU is a part
 *           of. Note that this ID is unique only within the package; it is
 *           not globally unique across the entire system.
 *
 * %ecx      This contains the number of nodes that exist in the package. It
 *           also contains an ID that identifies which node the logical CPU
 *           is a part of.
 *
 * Finally, we also use cpuid leaf 0x8000001D to gather information about the
 * cache layout in order to determine which logical CPUs are sharing which
 * caches.
 *
 * illumos Topology
 * ----------------
 *
 * Based on the above, we synthesize the information into several different
 * variables that we store in the 'struct cpuid_info'. We'll go into the details
 * of what each member is supposed to represent and their uniqueness. In
 * general, there are two levels of uniqueness that we care about. We care about
 * an ID that is globally unique. That means that it will be unique across all
 * entities in the system. For example, the default logical CPU ID is globally
 * unique. On the other hand, there is some information that we only care about
 * being unique within the context of a single package / socket. Here are the
 * variables that we keep track of and their meaning.
 *
 * Several of the values that represent an identifier, with the exception of
 * cpi_apicid, are allowed to be synthetic.
 *
 * cpi_apicid
 *
 *    This is the value of the CPU's APIC ID. This should be the full 32-bit
 *    ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
 *    APIC ID. This value is globally unique among all logical CPUs across
 *    all packages. This is usually required by the APIC.
 *
 * cpi_chipid
 *
 *    This value indicates the ID of the package that the logical CPU is a
 *    part of. This value is allowed to be synthetic. It is usually derived by
 *    taking the CPU's APIC ID and determining how many bits are used to
 *    represent CPU cores in the package. All logical CPUs that are part of
 *    the same package must have the same value.
 *
 * cpi_coreid
 *
 *    This represents the ID of a CPU core. Two logical CPUs should only have
 *    the same cpi_coreid value if they are part of the same core. These
 *    values may be synthetic. On systems that support SMT, this value is
 *    usually derived from the APIC ID; otherwise it is often synthetic and
 *    just set to the value of the cpu_id in the cpu_t.
 *
 * cpi_pkgcoreid
 *
 *    This is similar to the cpi_coreid in that logical CPUs that are part of
 *    the same core should have the same ID. The main difference is that these
 *    values are only required to be unique within a given socket.
 *
 * cpi_clogid
 *
 *    This represents the logical ID of a logical CPU. This value should be
 *    unique within a given socket for each logical CPU. This is allowed to be
 *    synthetic, though it is usually based on the CPU's APIC ID. The broader
 *    system expects that logical CPUs that are part of the same core have
 *    contiguous numbers. That is, if there were two threads per core, then
 *    the logical IDs of the two siblings divided by two should be the same,
 *    and modulo two, the first should be zero and the second should be one.
For example, IDs 4 and 5 744 * indicate two logical CPUs that are part of the same core. But IDs 5 and 745 * 6 represent two logical CPUs that are part of different cores. 746 * 747 * While it is common for the cpi_coreid and the cpi_clogid to be derived 748 * from the same source, strictly speaking, they don't have to be and the 749 * two values should be considered logically independent. One should not 750 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine 751 * some kind of relationship. While this is tempting, we've seen cases on 752 * AMD family 0xf where the system's cpu id is not related to its APIC ID. 753 * 754 * cpi_ncpu_per_chip 755 * 756 * This value indicates the total number of logical CPUs that exist in the 757 * physical package. Critically, this is not the number of logical CPUs 758 * that exist for just the single core. 759 * 760 * This value should be the same for all logical CPUs in the same package. 761 * 762 * cpi_ncore_per_chip 763 * 764 * This value indicates the total number of physical CPU cores that exist 765 * in the package. The system compares this value with cpi_ncpu_per_chip to 766 * determine if simultaneous multi-threading (SMT) is enabled. When 767 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and 768 * the X86FSET_HTT feature is not set. If this value is greater than one, 769 * than we consider the processor to have the feature X86FSET_CMP, to 770 * indicate that there is support for more than one core. 771 * 772 * This value should be the same for all logical CPUs in the same package. 773 * 774 * cpi_procnodes_per_pkg 775 * 776 * This value indicates the number of 'nodes' that exist in the package. 777 * When processors are actually a multi-chip module, this represents the 778 * number of such modules that exist in the package. Currently, on Intel 779 * based systems this member is always set to 1. 780 * 781 * This value should be the same for all logical CPUs in the same package. 782 * 783 * cpi_procnodeid 784 * 785 * This value indicates the ID of the node that the logical CPU is a part 786 * of. All logical CPUs that are in the same node must have the same value 787 * here. This value must be unique across all of the packages in the 788 * system. On Intel based systems, this is currently set to the value in 789 * cpi_chipid because there is only one node. 790 * 791 * cpi_cores_per_compunit 792 * 793 * This value indicates the number of cores that are part of a compute 794 * unit. See the AMD topology section for this. This member only has real 795 * meaning currently for AMD Bulldozer family processors. For all other 796 * processors, this should currently be set to 1. 797 * 798 * cpi_compunitid 799 * 800 * This indicates the compute unit that the logical CPU belongs to. For 801 * processors without AMD Bulldozer-style compute units this should be set 802 * to the value of cpi_coreid. 803 * 804 * cpi_ncpu_shr_last_cache 805 * 806 * This indicates the number of logical CPUs that are sharing the same last 807 * level cache. This value should be the same for all CPUs that are sharing 808 * that cache. The last cache refers to the cache that is closest to memory 809 * and furthest away from the CPU. 810 * 811 * cpi_last_lvl_cacheid 812 * 813 * This indicates the ID of the last cache that the logical CPU uses. This 814 * cache is often shared between multiple logical CPUs and is the cache 815 * that is closest to memory and furthest away from the CPU. 
This value 816 * should be the same for a group of logical CPUs only if they actually 817 * share the same last level cache. IDs should not overlap between 818 * packages. 819 * 820 * cpi_ncore_bits 821 * 822 * This indicates the number of bits that are required to represent all of 823 * the cores in the system. As cores are derived based on their APIC IDs, 824 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for 825 * this value to be larger than the actual number of IDs that are present 826 * in the system. This is used to size tables by the CMI framework. It is 827 * only filled in for Intel and AMD CPUs. 828 * 829 * cpi_nthread_bits 830 * 831 * This indicates the number of bits required to represent all of the IDs 832 * that cover the logical CPUs that exist on a given core. It's OK for this 833 * value to be larger than the actual number of IDs that are present in the 834 * system. This is used to size tables by the CMI framework. It is 835 * only filled in for Intel and AMD CPUs. 836 * 837 * ----------- 838 * Hypervisors 839 * ----------- 840 * 841 * If trying to manage the differences between vendors wasn't bad enough, it can 842 * get worse thanks to our friend hardware virtualization. Hypervisors are given 843 * the ability to interpose on all cpuid instructions and change them to suit 844 * their purposes. In general, this is necessary as the hypervisor wants to be 845 * able to present a more uniform set of features or not necessarily give the 846 * guest operating system kernel knowledge of all features so it can be 847 * more easily migrated between systems. 848 * 849 * When it comes to trying to determine topology information, this can be a 850 * double edged sword. When a hypervisor doesn't actually implement a cpuid 851 * leaf, it'll often return all zeros. Because of that, you'll often see various 852 * checks scattered about fields being non-zero before we assume we can use 853 * them. 854 * 855 * When it comes to topology information, the hypervisor is often incentivized 856 * to lie to you about topology. This is because it doesn't always actually 857 * guarantee that topology at all. The topology path we take in the system 858 * depends on how the CPU advertises itself. If it advertises itself as an Intel 859 * or AMD CPU, then we basically do our normal path. However, when they don't 860 * use an actual vendor, then that usually turns into multiple one-core CPUs 861 * that we enumerate that are often on different sockets. The actual behavior 862 * depends greatly on what the hypervisor actually exposes to us. 863 * 864 * -------------------- 865 * Exposing Information 866 * -------------------- 867 * 868 * We expose CPUID information in three different forms in the system. 869 * 870 * The first is through the x86_featureset variable. This is used in conjunction 871 * with the is_x86_feature() function. This is queried by x86-specific functions 872 * to determine which features are or aren't present in the system and to make 873 * decisions based upon them. For example, users of this include everything from 874 * parts of the system dedicated to reliability, availability, and 875 * serviceability (RAS), to making decisions about how to handle security 876 * mitigations, to various x86-specific drivers. General purpose or 877 * architecture independent drivers should never be calling this function. 878 * 879 * The second means is through the auxiliary vector. 
The auxiliary vector is a 880 * series of tagged data that the kernel passes down to a user program when it 881 * begins executing. This information is used to indicate to programs what 882 * instruction set extensions are present. For example, information about the 883 * CPU supporting the machine check architecture (MCA) wouldn't be passed down 884 * since user programs cannot make use of it. However, things like the AVX 885 * instruction sets are. Programs use this information to make run-time 886 * decisions about what features they should use. As an example, the run-time 887 * link-editor (rtld) can relocate different functions depending on the hardware 888 * support available. 889 * 890 * The final form is through a series of accessor functions that all have the 891 * form cpuid_get*. This is used by a number of different subsystems in the 892 * kernel to determine more detailed information about what we're running on, 893 * topology information, etc. Some of these subsystems include processor groups 894 * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI, 895 * microcode, and performance monitoring. These functions all ASSERT that the 896 * CPU they're being called on has reached a certain cpuid pass. If the passes 897 * are rearranged, then this needs to be adjusted. 898 * 899 * ----------------------------------------------- 900 * Speculative Execution CPU Side Channel Security 901 * ----------------------------------------------- 902 * 903 * With the advent of the Spectre and Meltdown attacks which exploit speculative 904 * execution in the CPU to create side channels there have been a number of 905 * different attacks and corresponding issues that the operating system needs to 906 * mitigate against. The following list is some of the common, but not 907 * exhaustive, set of issues that we know about and have done some or need to do 908 * more work in the system to mitigate against: 909 * 910 * - Spectre v1 911 * - swapgs (Spectre v1 variant) 912 * - Spectre v2 913 * - Meltdown (Spectre v3) 914 * - Rogue Register Read (Spectre v3a) 915 * - Speculative Store Bypass (Spectre v4) 916 * - ret2spec, SpectreRSB 917 * - L1 Terminal Fault (L1TF) 918 * - Microarchitectural Data Sampling (MDS) 919 * 920 * Each of these requires different sets of mitigations and has different attack 921 * surfaces. For the most part, this discussion is about protecting the kernel 922 * from non-kernel executing environments such as user processes and hardware 923 * virtual machines. Unfortunately, there are a number of user vs. user 924 * scenarios that exist with these. The rest of this section will describe the 925 * overall approach that the system has taken to address these as well as their 926 * shortcomings. Unfortunately, not all of the above have been handled today. 927 * 928 * SPECTRE v2, ret2spec, SpectreRSB 929 * 930 * The second variant of the spectre attack focuses on performing branch target 931 * injection. This generally impacts indirect call instructions in the system. 932 * There are three different ways to mitigate this issue that are commonly 933 * described today: 934 * 935 * 1. Using Indirect Branch Restricted Speculation (IBRS). 936 * 2. Using Retpolines and RSB Stuffing 937 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS) 938 * 939 * IBRS uses a feature added to microcode to restrict speculation, among other 940 * things. 
This form of mitigation has not been used as it has been generally 941 * seen as too expensive and requires reactivation upon various transitions in 942 * the system. 943 * 944 * As a less impactful alternative to IBRS, retpolines were developed by 945 * Google. These basically require one to replace indirect calls with a specific 946 * trampoline that will cause speculation to fail and break the attack. 947 * Retpolines require compiler support. We always build with retpolines in the 948 * external thunk mode. This means that a traditional indirect call is replaced 949 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect 950 * of this is that all indirect function calls are performed through a register. 951 * 952 * We have to use a common external location of the thunk and not inline it into 953 * the callsite so that way we can have a single place to patch these functions. 954 * As it turns out, we actually have three different forms of retpolines that 955 * exist in the system: 956 * 957 * 1. A full retpoline 958 * 2. An AMD-specific optimized retpoline 959 * 3. A no-op version 960 * 961 * The first one is used in the general case. The second one is used if we can 962 * determine that we're on an AMD system and we can successfully toggle the 963 * lfence serializing MSR that exists on the platform. Basically with this 964 * present, an lfence is sufficient and we don't need to do anywhere near as 965 * complicated a dance to successfully use retpolines. 966 * 967 * The third form described above is the most curious. It turns out that the way 968 * that retpolines are implemented is that they rely on how speculation is 969 * performed on a 'ret' instruction. Intel has continued to optimize this 970 * process (which is partly why we need to have return stack buffer stuffing, 971 * but more on that in a bit) and in processors starting with Cascade Lake 972 * on the server side, it's dangerous to rely on retpolines. Instead, a new 973 * mechanism has been introduced called Enhanced IBRS (EIBRS). 974 * 975 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each 976 * physical core. However, if this is the case, we don't want to use retpolines 977 * any more. Therefore if EIBRS is present, we end up turning each retpoline 978 * function (called a thunk) into a jmp instruction. This means that we're still 979 * paying the cost of an extra jump to the external thunk, but it gives us 980 * flexibility and the ability to have a single kernel image that works across a 981 * wide variety of systems and hardware features. 982 * 983 * Unfortunately, this alone is insufficient. First, Skylake systems have 984 * additional speculation for the Return Stack Buffer (RSB) which is used to 985 * return from call instructions which retpolines take advantage of. However, 986 * this problem is not just limited to Skylake and is actually more pernicious. 987 * The SpectreRSB paper introduces several more problems that can arise with 988 * dealing with this. The RSB can be poisoned just like the indirect branch 989 * predictor. This means that one needs to clear the RSB when transitioning 990 * between two different privilege domains. Some examples include: 991 * 992 * - Switching between two different user processes 993 * - Going between user land and the kernel 994 * - Returning to the kernel from a hardware virtual machine 995 * 996 * Mitigating this involves combining a couple of different things. 
The first is 997 * SMEP (supervisor mode execution protection) which was introduced in Ivy 998 * Bridge. When an RSB entry refers to a user address and we're executing in the 999 * kernel, speculation through it will be stopped when SMEP is enabled. This 1000 * protects against a number of the different cases that we would normally be 1001 * worried about such as when we enter the kernel from user land. 1002 * 1003 * To prevent against additional manipulation of the RSB from other contexts 1004 * such as a non-root VMX context attacking the kernel we first look to enhanced 1005 * IBRS. When EIBRS is present and enabled, then there is nothing else that we 1006 * need to do to protect the kernel at this time. 1007 * 1008 * On CPUs without EIBRS we need to manually overwrite the contents of the 1009 * return stack buffer. We do this through the x86_rsb_stuff() function. 1010 * Currently this is employed on context switch. The x86_rsb_stuff() function is 1011 * disabled when enhanced IBRS is present because Intel claims on such systems 1012 * it will be ineffective. Stuffing the RSB in context switch helps prevent user 1013 * to user attacks via the RSB. 1014 * 1015 * If SMEP is not present, then we would have to stuff the RSB every time we 1016 * transitioned from user mode to the kernel, which isn't very practical right 1017 * now. 1018 * 1019 * To fully protect user to user and vmx to vmx attacks from these classes of 1020 * issues, we would also need to allow them to opt into performing an Indirect 1021 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up. 1022 * 1023 * By default, the system will enable RSB stuffing and the required variant of 1024 * retpolines and store that information in the x86_spectrev2_mitigation value. 1025 * This will be evaluated after a microcode update as well, though it is 1026 * expected that microcode updates will not take away features. This may mean 1027 * that a late loaded microcode may not end up in the optimal configuration 1028 * (though this should be rare). 1029 * 1030 * Currently we do not build kmdb with retpolines or perform any additional side 1031 * channel security mitigations for it. One complication with kmdb is that it 1032 * requires its own retpoline thunks and it would need to adjust itself based on 1033 * what the kernel does. The threat model of kmdb is more limited and therefore 1034 * it may make more sense to investigate using prediction barriers as the whole 1035 * system is only executing a single instruction at a time while in kmdb. 1036 * 1037 * SPECTRE v1, v4 1038 * 1039 * The v1 and v4 variants of spectre are not currently mitigated in the 1040 * system and require other classes of changes to occur in the code. 1041 * 1042 * SPECTRE v1 (SWAPGS VARIANT) 1043 * 1044 * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but 1045 * can generally affect any branch-dependent code. The swapgs issue is one 1046 * variant of this. If we are coming in from userspace, we can have code like 1047 * this: 1048 * 1049 * cmpw $KCS_SEL, REGOFF_CS(%rsp) 1050 * je 1f 1051 * movq $0, REGOFF_SAVFP(%rsp) 1052 * swapgs 1053 * 1: 1054 * movq %gs:CPU_THREAD, %rax 1055 * 1056 * If an attacker can cause a mis-speculation of the branch here, we could skip 1057 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based 1058 * load. If subsequent code can act as the usual Spectre cache gadget, this 1059 * would potentially allow KPTI bypass. 
To fix this, we need an lfence prior to 1060 * any use of the %gs override. 1061 * 1062 * The other case is also an issue: if we're coming into a trap from kernel 1063 * space, we could mis-speculate and swapgs the user %gsbase back in prior to 1064 * using it. AMD systems are not vulnerable to this version, as a swapgs is 1065 * serializing with respect to subsequent uses. But as AMD /does/ need the other 1066 * case, and the fix is the same in both cases (an lfence at the branch target 1067 * 1: in this example), we'll just do it unconditionally. 1068 * 1069 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it 1070 * harder for user-space to actually set a useful %gsbase value: although it's 1071 * not clear, it might still be feasible via lwp_setprivate(), though, so we 1072 * mitigate anyway. 1073 * 1074 * MELTDOWN 1075 * 1076 * Meltdown, or spectre v3, allowed a user process to read any data in their 1077 * address space regardless of whether or not the page tables in question 1078 * allowed the user to have the ability to read them. The solution to meltdown 1079 * is kernel page table isolation. In this world, there are two page tables that 1080 * are used for a process, one in user land and one in the kernel. To implement 1081 * this we use per-CPU page tables and switch between the user and kernel 1082 * variants when entering and exiting the kernel. For more information about 1083 * this process and how the trampolines work, please see the big theory 1084 * statements and additional comments in: 1085 * 1086 * - uts/i86pc/ml/kpti_trampolines.s 1087 * - uts/i86pc/vm/hat_i86.c 1088 * 1089 * While Meltdown only impacted Intel systems and there are also Intel systems 1090 * that have Meltdown fixed (called Rogue Data Cache Load), we always have 1091 * kernel page table isolation enabled. While this may at first seem weird, an 1092 * important thing to remember is that you can't speculatively read an address 1093 * if it's never in your page table at all. Having user processes without kernel 1094 * pages present provides us with an important layer of defense in the kernel 1095 * against any other side channel attacks that exist and have yet to be 1096 * discovered. As such, kernel page table isolation (KPTI) is always enabled by 1097 * default, no matter the x86 system. 1098 * 1099 * L1 TERMINAL FAULT 1100 * 1101 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative 1102 * execution uses page table entries. Effectively, it is two different problems. 1103 * The first is that it ignores the not present bit in the page table entries 1104 * when performing speculative execution. This means that something can 1105 * speculatively read the listed physical address if it's present in the L1 1106 * cache under certain conditions (see Intel's documentation for the full set of 1107 * conditions). Secondly, this can be used to bypass hardware virtualization 1108 * extended page tables (EPT) that are part of Intel's hardware virtual machine 1109 * instructions. 1110 * 1111 * For the non-hardware virtualized case, this is relatively easy to deal with. 1112 * We must make sure that all unmapped pages have an address of zero. This means 1113 * that they could read the first 4k of physical memory; however, we never use 1114 * that first page in the operating system and always skip putting it in our 1115 * memory map, even if firmware tells us we can use it in our memory map. 
While 1116 * other systems try to put extra metadata in the address and reserved bits, 1117 * which led to this being problematic in those cases, we do not. 1118 * 1119 * For hardware virtual machines things are more complicated. Because they can 1120 * construct their own page tables, it isn't hard for them to perform this 1121 * attack against any physical address. The one wrinkle is that this physical 1122 * address must be in the L1 data cache. Thus Intel added an MSR that we can use 1123 * to flush the L1 data cache. We wrap this up in the function 1124 * spec_uarch_flush(). This function is also used in the mitigation of 1125 * microarchitectural data sampling (MDS) discussed later on. Kernel-based 1126 * hypervisors such as KVM or bhyve are responsible for performing this before 1127 * entering the guest. 1128 * 1129 * Because this attack takes place in the L1 cache, there's another wrinkle 1130 * here. The L1 cache is shared between all logical CPUs in a core in most Intel 1131 * designs. This means that when a thread enters a hardware virtualized context 1132 * and flushes the L1 data cache, the other thread on the processor may then go 1133 * ahead and put new data in it that can be potentially attacked. While one 1134 * solution is to disable SMT on the system, another option that is available is 1135 * to use a feature for hardware virtualization called 'SMT exclusion'. This 1136 * goes through and makes sure that if an HVM is being scheduled on one thread, 1137 * then the thing on the other thread is from the same hardware virtual machine. 1138 * If an interrupt comes in or the guest exits to the broader system, then the 1139 * other SMT thread will be kicked out. 1140 * 1141 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the 1142 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not 1143 * perform L1TF related mitigations. 1144 * 1145 * MICROARCHITECTURAL DATA SAMPLING 1146 * 1147 * Microarchitectural data sampling (MDS) is a combination of four discrete 1148 * vulnerabilities that are similar issues affecting various parts of the CPU's 1149 * microarchitectural implementation around load, store, and fill buffers. 1150 * Specifically it is made up of the following subcomponents: 1151 * 1152 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS) 1153 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS) 1154 * 3. Microarchitectural Load Port Data Sampling (MLPDS) 1155 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM) 1156 * 1157 * To begin addressing these, Intel has introduced another feature in microcode 1158 * called MD_CLEAR. This changes the verw instruction to operate in a different 1159 * way. This allows us to execute the verw instruction in a particular way to 1160 * flush the state of the affected parts. The L1TF L1D flush mechanism is also 1161 * updated when this microcode is present to flush this state. 1162 * 1163 * Primarily we need to flush this state whenever we transition from the kernel 1164 * to a less privileged context such as user mode or an HVM guest. MSBDS is a 1165 * little bit different. Here the structures are statically sized when a logical 1166 * CPU is in use and resized when it goes to sleep. Therefore, we also need to 1167 * flush the microarchitectural state before the CPU goes idle by calling hlt, 1168 * mwait, or another ACPI method. To perform these flushes, we call 1169 * x86_md_clear() at all of these transition points.
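 * As a concrete sketch (illustrative names only; the real hooks live in the
 * return-to-user, guest-entry, and idle paths rather than in a function with
 * this name), an idle entry point conceptually does:
 *
 *	void
 *	example_cpu_idle_enter(void)
 *	{
 *		x86_md_clear();
 *		example_halt();
 *	}
 *
 * where example_halt() stands in for the hlt/mwait based idle sequence.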
1170 * 1171 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF, 1172 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If 1173 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes 1174 * a no-op. 1175 * 1176 * Unfortunately, with this issue hyperthreading rears its ugly head. In 1177 * particular, everything we've discussed above is only valid for a single 1178 * thread executing on a core. In the case where you have hyper-threading 1179 * present, this attack can be performed between threads. The theoretical fix 1180 * for this is to ensure that both threads are always in the same security 1181 * domain. This means that they are executing in the same ring and mutually 1182 * trust each other. Practically speaking, this would mean that a system call 1183 * would have to issue an inter-processor interrupt (IPI) to the other thread. 1184 * Rather than implement this, we recommend that one disables hyper-threading 1185 * through the use of psradm -aS. 1186 * 1187 * TSX ASYNCHRONOUS ABORT 1188 * 1189 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that 1190 * behaves like MDS, but leverages Intel's transactional instructions as another 1191 * vector. Effectively, when a transaction hits one of these cases (unmapped 1192 * page, various cache snoop activity, etc.) then the same data can be exposed 1193 * as in the case of MDS. This means that you can attack your twin. 1194 * 1195 * Intel has described that there are two different ways that we can mitigate 1196 * this problem on affected processors: 1197 * 1198 * 1) We can use the same techniques used to deal with MDS. Flushing the 1199 * microarchitectural buffers and disabling hyperthreading will mitigate 1200 * this in the same way. 1201 * 1202 * 2) Using microcode to disable TSX. 1203 * 1204 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in 1205 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX. 1206 * That's OK as we're already doing all such mitigations. On the other hand, 1207 * processors with MDS_NO are all supposed to receive microcode updates that 1208 * enumerate support for disabling TSX. In general, we'd rather use this method 1209 * when available as it doesn't require disabling hyperthreading to be 1210 * effective. Currently we basically are relying on microcode for processors 1211 * that enumerate MDS_NO. 1212 * 1213 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES. 1214 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two 1215 * different powers. The first allows us to cause all transactions to 1216 * immediately abort. The second gives us a means of disabling TSX completely, 1217 * which includes removing it from cpuid. If we have support for this in 1218 * microcode during the first cpuid pass, then we'll disable TSX completely such 1219 * that user land never has a chance to observe the bit. However, if we are late 1220 * loading the microcode, then we must use the functionality to cause 1221 * transactions to automatically abort. This is necessary for user land's sake. 1222 * Once a program sees a cpuid bit, it must not be taken away. 1223 * 1224 * We track whether or not we should do this based on what cpuid pass we're in. 1225 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass 1226 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this 1227 * should happen twice. 
Once in the normal cpuid_pass1() code and then a second 1228 * time after we do the initial microcode update. As a result we need to be 1229 * careful in cpuid_apply_tsx() to only use the MSR if we've loaded a suitable 1230 * microcode on the current CPU (which happens prior to cpuid_pass_ucode()). 1231 * 1232 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES 1233 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an 1234 * unfortunate feature in a number of ways, and taking the opportunity to 1235 * finally be able to turn it off is likely to be of benefit in the future. 1236 * 1237 * SUMMARY 1238 * 1239 * The following table attempts to summarize the mitigations for various issues 1240 * and what's done in various places: 1241 * 1242 * - Spectre v1: Not currently mitigated 1243 * - swapgs: lfences after swapgs paths 1244 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support 1245 * - Meltdown: Kernel Page Table Isolation 1246 * - Spectre v3a: Updated CPU microcode 1247 * - Spectre v4: Not currently mitigated 1248 * - SpectreRSB: SMEP and RSB Stuffing 1249 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode 1250 * - MDS: x86_md_clear, requires microcode, disabling SMT 1251 * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX 1252 * 1253 * The following table indicates the x86 feature set bits that indicate that a 1254 * given problem has been solved or a notable feature is present: 1255 * 1256 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS 1257 * - MDS_NO: All forms of MDS 1258 * - TAA_NO: TAA 1259 */ 1260 1261 #include <sys/types.h> 1262 #include <sys/archsystm.h> 1263 #include <sys/x86_archext.h> 1264 #include <sys/kmem.h> 1265 #include <sys/systm.h> 1266 #include <sys/cmn_err.h> 1267 #include <sys/sunddi.h> 1268 #include <sys/sunndi.h> 1269 #include <sys/cpuvar.h> 1270 #include <sys/processor.h> 1271 #include <sys/sysmacros.h> 1272 #include <sys/pg.h> 1273 #include <sys/fp.h> 1274 #include <sys/controlregs.h> 1275 #include <sys/bitmap.h> 1276 #include <sys/auxv_386.h> 1277 #include <sys/memnode.h> 1278 #include <sys/pci_cfgspace.h> 1279 #include <sys/comm_page.h> 1280 #include <sys/mach_mmu.h> 1281 #include <sys/ucode.h> 1282 #include <sys/tsc.h> 1283 #include <sys/kobj.h> 1284 #include <sys/asm_misc.h> 1285 1286 #ifdef __xpv 1287 #include <sys/hypervisor.h> 1288 #else 1289 #include <sys/ontrap.h> 1290 #endif 1291 1292 uint_t x86_vendor = X86_VENDOR_IntelClone; 1293 uint_t x86_type = X86_TYPE_OTHER; 1294 uint_t x86_clflush_size = 0; 1295 1296 #if defined(__xpv) 1297 int x86_use_pcid = 0; 1298 int x86_use_invpcid = 0; 1299 #else 1300 int x86_use_pcid = -1; 1301 int x86_use_invpcid = -1; 1302 #endif 1303 1304 typedef enum { 1305 X86_SPECTREV2_RETPOLINE, 1306 X86_SPECTREV2_RETPOLINE_AMD, 1307 X86_SPECTREV2_ENHANCED_IBRS, 1308 X86_SPECTREV2_DISABLED 1309 } x86_spectrev2_mitigation_t; 1310 1311 uint_t x86_disable_spectrev2 = 0; 1312 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation = 1313 X86_SPECTREV2_RETPOLINE; 1314 1315 /* 1316 * The mitigation status for TAA: 1317 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels 1318 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa 1319 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA 1320 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort 1321 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID 1322 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable 1323 */ 1324 
typedef enum { 1325 X86_TAA_NOTHING, 1326 X86_TAA_DISABLED, 1327 X86_TAA_MD_CLEAR, 1328 X86_TAA_TSX_FORCE_ABORT, 1329 X86_TAA_TSX_DISABLE, 1330 X86_TAA_HW_MITIGATED 1331 } x86_taa_mitigation_t; 1332 1333 uint_t x86_disable_taa = 0; 1334 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING; 1335 1336 uint_t pentiumpro_bug4046376; 1337 1338 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)]; 1339 1340 static char *x86_feature_names[NUM_X86_FEATURES] = { 1341 "lgpg", 1342 "tsc", 1343 "msr", 1344 "mtrr", 1345 "pge", 1346 "de", 1347 "cmov", 1348 "mmx", 1349 "mca", 1350 "pae", 1351 "cv8", 1352 "pat", 1353 "sep", 1354 "sse", 1355 "sse2", 1356 "htt", 1357 "asysc", 1358 "nx", 1359 "sse3", 1360 "cx16", 1361 "cmp", 1362 "tscp", 1363 "mwait", 1364 "sse4a", 1365 "cpuid", 1366 "ssse3", 1367 "sse4_1", 1368 "sse4_2", 1369 "1gpg", 1370 "clfsh", 1371 "64", 1372 "aes", 1373 "pclmulqdq", 1374 "xsave", 1375 "avx", 1376 "vmx", 1377 "svm", 1378 "topoext", 1379 "f16c", 1380 "rdrand", 1381 "x2apic", 1382 "avx2", 1383 "bmi1", 1384 "bmi2", 1385 "fma", 1386 "smep", 1387 "smap", 1388 "adx", 1389 "rdseed", 1390 "mpx", 1391 "avx512f", 1392 "avx512dq", 1393 "avx512pf", 1394 "avx512er", 1395 "avx512cd", 1396 "avx512bw", 1397 "avx512vl", 1398 "avx512fma", 1399 "avx512vbmi", 1400 "avx512_vpopcntdq", 1401 "avx512_4vnniw", 1402 "avx512_4fmaps", 1403 "xsaveopt", 1404 "xsavec", 1405 "xsaves", 1406 "sha", 1407 "umip", 1408 "pku", 1409 "ospke", 1410 "pcid", 1411 "invpcid", 1412 "ibrs", 1413 "ibpb", 1414 "stibp", 1415 "ssbd", 1416 "ssbd_virt", 1417 "rdcl_no", 1418 "ibrs_all", 1419 "rsba", 1420 "ssb_no", 1421 "stibp_all", 1422 "flush_cmd", 1423 "l1d_vmentry_no", 1424 "fsgsbase", 1425 "clflushopt", 1426 "clwb", 1427 "monitorx", 1428 "clzero", 1429 "xop", 1430 "fma4", 1431 "tbm", 1432 "avx512_vnni", 1433 "amd_pcec", 1434 "mb_clear", 1435 "mds_no", 1436 "core_thermal", 1437 "pkg_thermal", 1438 "tsx_ctrl", 1439 "taa_no" 1440 }; 1441 1442 boolean_t 1443 is_x86_feature(void *featureset, uint_t feature) 1444 { 1445 ASSERT(feature < NUM_X86_FEATURES); 1446 return (BT_TEST((ulong_t *)featureset, feature)); 1447 } 1448 1449 void 1450 add_x86_feature(void *featureset, uint_t feature) 1451 { 1452 ASSERT(feature < NUM_X86_FEATURES); 1453 BT_SET((ulong_t *)featureset, feature); 1454 } 1455 1456 void 1457 remove_x86_feature(void *featureset, uint_t feature) 1458 { 1459 ASSERT(feature < NUM_X86_FEATURES); 1460 BT_CLEAR((ulong_t *)featureset, feature); 1461 } 1462 1463 boolean_t 1464 compare_x86_featureset(void *setA, void *setB) 1465 { 1466 /* 1467 * We assume that the unused bits of the bitmap are always zero. 1468 */ 1469 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) { 1470 return (B_TRUE); 1471 } else { 1472 return (B_FALSE); 1473 } 1474 } 1475 1476 void 1477 print_x86_featureset(void *featureset) 1478 { 1479 uint_t i; 1480 1481 for (i = 0; i < NUM_X86_FEATURES; i++) { 1482 if (is_x86_feature(featureset, i)) { 1483 cmn_err(CE_CONT, "?x86_feature: %s\n", 1484 x86_feature_names[i]); 1485 } 1486 } 1487 } 1488 1489 /* Note: This is the maximum size for the CPU, not the size of the structure. */ 1490 static size_t xsave_state_size = 0; 1491 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE); 1492 boolean_t xsave_force_disable = B_FALSE; 1493 extern int disable_smap; 1494 1495 /* 1496 * This is set to platform type we are running on. 1497 */ 1498 static int platform_type = -1; 1499 1500 #if !defined(__xpv) 1501 /* 1502 * Variable to patch if hypervisor platform detection needs to be 1503 * disabled (e.g. 
platform_type will always be HW_NATIVE if this is 0). 1504 */ 1505 int enable_platform_detection = 1; 1506 #endif 1507 1508 /* 1509 * monitor/mwait info. 1510 * 1511 * size_actual and buf_actual are the real address and size allocated to get 1512 * proper mwait_buf alignment. buf_actual and size_actual should be passed 1513 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use 1514 * processor cache-line alignment, but this is not guaranteed in the future. 1515 */ 1516 struct mwait_info { 1517 size_t mon_min; /* min size to avoid missed wakeups */ 1518 size_t mon_max; /* size to avoid false wakeups */ 1519 size_t size_actual; /* size actually allocated */ 1520 void *buf_actual; /* memory actually allocated */ 1521 uint32_t support; /* processor support of monitor/mwait */ 1522 }; 1523 1524 /* 1525 * xsave/xrestor info. 1526 * 1527 * This structure contains HW feature bits and the size of the xsave save area. 1528 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure 1529 * (xsave_state) to describe the xsave layout. However, at runtime the 1530 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The 1531 * xsave_state structure simply represents the legacy layout of the beginning 1532 * of the xsave area. 1533 */ 1534 struct xsave_info { 1535 uint32_t xsav_hw_features_low; /* Supported HW features */ 1536 uint32_t xsav_hw_features_high; /* Supported HW features */ 1537 size_t xsav_max_size; /* max size save area for HW features */ 1538 size_t ymm_size; /* AVX: size of ymm save area */ 1539 size_t ymm_offset; /* AVX: offset for ymm save area */ 1540 size_t bndregs_size; /* MPX: size of bndregs save area */ 1541 size_t bndregs_offset; /* MPX: offset for bndregs save area */ 1542 size_t bndcsr_size; /* MPX: size of bndcsr save area */ 1543 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */ 1544 size_t opmask_size; /* AVX512: size of opmask save */ 1545 size_t opmask_offset; /* AVX512: offset for opmask save */ 1546 size_t zmmlo_size; /* AVX512: size of zmm 256 save */ 1547 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */ 1548 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */ 1549 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */ 1550 }; 1551 1552 1553 /* 1554 * These constants determine how many of the elements of the 1555 * cpuid we cache in the cpuid_info data structure; the 1556 * remaining elements are accessible via the cpuid instruction. 1557 */ 1558 1559 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */ 1560 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */ 1561 1562 /* 1563 * See the big theory statement for a more detailed explanation of what some of 1564 * these members mean.
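 * As a small usage sketch (illustrative only, given a cpu_t *cpu), code
 * elsewhere in this file consults the cached copy of leaf 1 through the CPI_*
 * macros rather than re-executing the cpuid instruction:
 *
 *	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
 *	uint_t nthreads = 1;
 *
 *	if (CPI_FEATURES_EDX(cpi) & CPUID_INTC_EDX_HTT)
 *		nthreads = CPI_CPU_COUNT(cpi);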
1565 */ 1566 struct cpuid_info { 1567 uint_t cpi_pass; /* last pass completed */ 1568 /* 1569 * standard function information 1570 */ 1571 uint_t cpi_maxeax; /* fn 0: %eax */ 1572 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */ 1573 uint_t cpi_vendor; /* enum of cpi_vendorstr */ 1574 1575 uint_t cpi_family; /* fn 1: extended family */ 1576 uint_t cpi_model; /* fn 1: extended model */ 1577 uint_t cpi_step; /* fn 1: stepping */ 1578 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */ 1579 /* AMD: package/socket # */ 1580 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */ 1581 int cpi_clogid; /* fn 1: %ebx: thread # */ 1582 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */ 1583 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */ 1584 uint_t cpi_ncache; /* fn 2: number of elements */ 1585 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */ 1586 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */ 1587 uint_t cpi_cache_leaf_size; /* Number of cache elements */ 1588 /* Intel fn: 4, AMD fn: 8000001d */ 1589 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */ 1590 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */ 1591 /* 1592 * extended function information 1593 */ 1594 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */ 1595 char cpi_brandstr[49]; /* fn 0x8000000[234] */ 1596 uint8_t cpi_pabits; /* fn 0x80000008: %eax */ 1597 uint8_t cpi_vabits; /* fn 0x80000008: %eax */ 1598 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */ 1599 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */ 1600 1601 id_t cpi_coreid; /* same coreid => strands share core */ 1602 int cpi_pkgcoreid; /* core number within single package */ 1603 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */ 1604 /* Intel: fn 4: %eax[31-26] */ 1605 1606 /* 1607 * These values represent the number of bits that are required to store 1608 * information about the number of cores and threads. 1609 */ 1610 uint_t cpi_ncore_bits; 1611 uint_t cpi_nthread_bits; 1612 /* 1613 * supported feature information 1614 */ 1615 uint32_t cpi_support[6]; 1616 #define STD_EDX_FEATURES 0 1617 #define AMD_EDX_FEATURES 1 1618 #define TM_EDX_FEATURES 2 1619 #define STD_ECX_FEATURES 3 1620 #define AMD_ECX_FEATURES 4 1621 #define STD_EBX_FEATURES 5 1622 /* 1623 * Synthesized information, where known.
1624 */ 1625 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */ 1626 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */ 1627 uint32_t cpi_socket; /* Chip package/socket type */ 1628 1629 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */ 1630 uint32_t cpi_apicid; 1631 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */ 1632 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */ 1633 /* Intel: 1 */ 1634 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */ 1635 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */ 1636 1637 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */ 1638 }; 1639 1640 1641 static struct cpuid_info cpuid_info0; 1642 1643 /* 1644 * These bit fields are defined by the Intel Application Note AP-485 1645 * "Intel Processor Identification and the CPUID Instruction" 1646 */ 1647 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20) 1648 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16) 1649 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12) 1650 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8) 1651 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0) 1652 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4) 1653 1654 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx) 1655 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx) 1656 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx) 1657 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx) 1658 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx) 1659 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx) 1660 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx) 1661 1662 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0) 1663 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7) 1664 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16) 1665 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24) 1666 1667 #define CPI_MAXEAX_MAX 0x100 /* sanity control */ 1668 #define CPI_XMAXEAX_MAX 0x80000100 1669 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */ 1670 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */ 1671 1672 /* 1673 * Function 4 (Deterministic Cache Parameters) macros 1674 * Defined by Intel Application Note AP-485 1675 */ 1676 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26) 1677 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14) 1678 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9) 1679 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8) 1680 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5) 1681 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0) 1682 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8) 1683 1684 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22) 1685 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12) 1686 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0) 1687 1688 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0) 1689 1690 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0) 1691 1692 1693 /* 1694 * A couple of shorthand macros to identify "later" P6-family chips 1695 * like the Pentium M and Core. 
First, the "older" P6-based stuff 1696 * (loosely defined as "pre-Pentium-4"): 1697 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon 1698 */ 1699 #define IS_LEGACY_P6(cpi) ( \ 1700 cpi->cpi_family == 6 && \ 1701 (cpi->cpi_model == 1 || \ 1702 cpi->cpi_model == 3 || \ 1703 cpi->cpi_model == 5 || \ 1704 cpi->cpi_model == 6 || \ 1705 cpi->cpi_model == 7 || \ 1706 cpi->cpi_model == 8 || \ 1707 cpi->cpi_model == 0xA || \ 1708 cpi->cpi_model == 0xB) \ 1709 ) 1710 1711 /* A "new F6" is everything with family 6 that's not the above */ 1712 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi)) 1713 1714 /* Extended family/model support */ 1715 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \ 1716 cpi->cpi_family >= 0xf) 1717 1718 /* 1719 * Info for monitor/mwait idle loop. 1720 * 1721 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's 1722 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November 1723 * 2006. 1724 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual 1725 * Documentation Updates" #33633, Rev 2.05, December 2006. 1726 */ 1727 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */ 1728 #define MWAIT_EXTENSIONS (0x00000002) /* extenstion supported */ 1729 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */ 1730 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON) 1731 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2) 1732 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1) 1733 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0) 1734 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0) 1735 /* 1736 * Number of sub-cstates for a given c-state. 1737 */ 1738 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \ 1739 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state) 1740 1741 /* 1742 * XSAVE leaf 0xD enumeration 1743 */ 1744 #define CPUID_LEAFD_2_YMM_OFFSET 576 1745 #define CPUID_LEAFD_2_YMM_SIZE 256 1746 1747 /* 1748 * Common extended leaf names to cut down on typos. 1749 */ 1750 #define CPUID_LEAF_EXT_0 0x80000000 1751 #define CPUID_LEAF_EXT_8 0x80000008 1752 #define CPUID_LEAF_EXT_1d 0x8000001d 1753 #define CPUID_LEAF_EXT_1e 0x8000001e 1754 1755 /* 1756 * Functions we consune from cpuid_subr.c; don't publish these in a header 1757 * file to try and keep people using the expected cpuid_* interfaces. 1758 */ 1759 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t); 1760 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t); 1761 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t); 1762 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t); 1763 extern uint_t _cpuid_vendorstr_to_vendorcode(char *); 1764 1765 /* 1766 * Apply up various platform-dependent restrictions where the 1767 * underlying platform restrictions mean the CPU can be marked 1768 * as less capable than its cpuid instruction would imply. 1769 */ 1770 #if defined(__xpv) 1771 static void 1772 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp) 1773 { 1774 switch (eax) { 1775 case 1: { 1776 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ? 
1777 0 : CPUID_INTC_EDX_MCA; 1778 cp->cp_edx &= 1779 ~(mcamask | 1780 CPUID_INTC_EDX_PSE | 1781 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1782 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR | 1783 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT | 1784 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1785 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT); 1786 break; 1787 } 1788 1789 case 0x80000001: 1790 cp->cp_edx &= 1791 ~(CPUID_AMD_EDX_PSE | 1792 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1793 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE | 1794 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 | 1795 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1796 CPUID_AMD_EDX_TSCP); 1797 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY; 1798 break; 1799 default: 1800 break; 1801 } 1802 1803 switch (vendor) { 1804 case X86_VENDOR_Intel: 1805 switch (eax) { 1806 case 4: 1807 /* 1808 * Zero out the (ncores-per-chip - 1) field 1809 */ 1810 cp->cp_eax &= 0x03fffffff; 1811 break; 1812 default: 1813 break; 1814 } 1815 break; 1816 case X86_VENDOR_AMD: 1817 switch (eax) { 1818 1819 case 0x80000001: 1820 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D; 1821 break; 1822 1823 case CPUID_LEAF_EXT_8: 1824 /* 1825 * Zero out the (ncores-per-chip - 1) field 1826 */ 1827 cp->cp_ecx &= 0xffffff00; 1828 break; 1829 default: 1830 break; 1831 } 1832 break; 1833 default: 1834 break; 1835 } 1836 } 1837 #else 1838 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */ 1839 #endif 1840 1841 /* 1842 * Some undocumented ways of patching the results of the cpuid 1843 * instruction to permit running Solaris 10 on future cpus that 1844 * we don't currently support. Could be set to non-zero values 1845 * via settings in eeprom. 1846 */ 1847 1848 uint32_t cpuid_feature_ecx_include; 1849 uint32_t cpuid_feature_ecx_exclude; 1850 uint32_t cpuid_feature_edx_include; 1851 uint32_t cpuid_feature_edx_exclude; 1852 1853 /* 1854 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs. 1855 */ 1856 void 1857 cpuid_alloc_space(cpu_t *cpu) 1858 { 1859 /* 1860 * By convention, cpu0 is the boot cpu, which is set up 1861 * before memory allocation is available. All other cpus get 1862 * their cpuid_info struct allocated here. 1863 */ 1864 ASSERT(cpu->cpu_id != 0); 1865 ASSERT(cpu->cpu_m.mcpu_cpi == NULL); 1866 cpu->cpu_m.mcpu_cpi = 1867 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP); 1868 } 1869 1870 void 1871 cpuid_free_space(cpu_t *cpu) 1872 { 1873 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 1874 int i; 1875 1876 ASSERT(cpi != NULL); 1877 ASSERT(cpi != &cpuid_info0); 1878 1879 /* 1880 * Free up any cache leaf related dynamic storage. The first entry was 1881 * cached from the standard cpuid storage, so we should not free it. 1882 */ 1883 for (i = 1; i < cpi->cpi_cache_leaf_size; i++) 1884 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs)); 1885 if (cpi->cpi_cache_leaf_size > 0) 1886 kmem_free(cpi->cpi_cache_leaves, 1887 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *)); 1888 1889 kmem_free(cpi, sizeof (*cpi)); 1890 cpu->cpu_m.mcpu_cpi = NULL; 1891 } 1892 1893 #if !defined(__xpv) 1894 /* 1895 * Determine the type of the underlying platform. This is used to customize 1896 * initialization of various subsystems (e.g. TSC). determine_platform() must 1897 * only ever be called once to prevent two processors from seeing different 1898 * values of platform_type. Must be called before cpuid_pass1(), the earliest 1899 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv). 
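 * Once this has run, consumers simply check the cached value; for example
 * (illustrative only, example_apply_kvm_quirk() is a stand-in for whatever
 * platform-specific adjustment a caller might need):
 *
 *	if (get_hwenv() == HW_KVM)
 *		example_apply_kvm_quirk();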
1900 */ 1901 void 1902 determine_platform(void) 1903 { 1904 struct cpuid_regs cp; 1905 uint32_t base; 1906 uint32_t regs[4]; 1907 char *hvstr = (char *)regs; 1908 1909 ASSERT(platform_type == -1); 1910 1911 platform_type = HW_NATIVE; 1912 1913 if (!enable_platform_detection) 1914 return; 1915 1916 /* 1917 * If Hypervisor CPUID bit is set, try to determine hypervisor 1918 * vendor signature, and set platform type accordingly. 1919 * 1920 * References: 1921 * http://lkml.org/lkml/2008/10/1/246 1922 * http://kb.vmware.com/kb/1009458 1923 */ 1924 cp.cp_eax = 0x1; 1925 (void) __cpuid_insn(&cp); 1926 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) { 1927 cp.cp_eax = 0x40000000; 1928 (void) __cpuid_insn(&cp); 1929 regs[0] = cp.cp_ebx; 1930 regs[1] = cp.cp_ecx; 1931 regs[2] = cp.cp_edx; 1932 regs[3] = 0; 1933 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) { 1934 platform_type = HW_XEN_HVM; 1935 return; 1936 } 1937 if (strcmp(hvstr, HVSIG_VMWARE) == 0) { 1938 platform_type = HW_VMWARE; 1939 return; 1940 } 1941 if (strcmp(hvstr, HVSIG_KVM) == 0) { 1942 platform_type = HW_KVM; 1943 return; 1944 } 1945 if (strcmp(hvstr, HVSIG_BHYVE) == 0) { 1946 platform_type = HW_BHYVE; 1947 return; 1948 } 1949 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) 1950 platform_type = HW_MICROSOFT; 1951 } else { 1952 /* 1953 * Check older VMware hardware versions. VMware hypervisor is 1954 * detected by performing an IN operation to VMware hypervisor 1955 * port and checking that value returned in %ebx is VMware 1956 * hypervisor magic value. 1957 * 1958 * References: http://kb.vmware.com/kb/1009458 1959 */ 1960 vmware_port(VMWARE_HVCMD_GETVERSION, regs); 1961 if (regs[1] == VMWARE_HVMAGIC) { 1962 platform_type = HW_VMWARE; 1963 return; 1964 } 1965 } 1966 1967 /* 1968 * Check Xen hypervisor. In a fully virtualized domain, 1969 * Xen's pseudo-cpuid function returns a string representing the 1970 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum 1971 * supported cpuid function. We need at least a (base + 2) leaf value 1972 * to do what we want to do. Try different base values, since the 1973 * hypervisor might use a different one depending on whether Hyper-V 1974 * emulation is switched on by default or not. 1975 */ 1976 for (base = 0x40000000; base < 0x40010000; base += 0x100) { 1977 cp.cp_eax = base; 1978 (void) __cpuid_insn(&cp); 1979 regs[0] = cp.cp_ebx; 1980 regs[1] = cp.cp_ecx; 1981 regs[2] = cp.cp_edx; 1982 regs[3] = 0; 1983 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 && 1984 cp.cp_eax >= (base + 2)) { 1985 platform_type &= ~HW_NATIVE; 1986 platform_type |= HW_XEN_HVM; 1987 return; 1988 } 1989 } 1990 } 1991 1992 int 1993 get_hwenv(void) 1994 { 1995 ASSERT(platform_type != -1); 1996 return (platform_type); 1997 } 1998 1999 int 2000 is_controldom(void) 2001 { 2002 return (0); 2003 } 2004 2005 #else 2006 2007 int 2008 get_hwenv(void) 2009 { 2010 return (HW_XEN_PV); 2011 } 2012 2013 int 2014 is_controldom(void) 2015 { 2016 return (DOMAIN_IS_INITDOMAIN(xen_info)); 2017 } 2018 2019 #endif /* __xpv */ 2020 2021 /* 2022 * Make sure that we have gathered all of the CPUID leaves that we might need to 2023 * determine topology. We assume that the standard leaf 1 has already been done 2024 * and that xmaxeax has already been calculated. 
2025 */ 2026 static void 2027 cpuid_gather_amd_topology_leaves(cpu_t *cpu) 2028 { 2029 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2030 2031 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2032 struct cpuid_regs *cp; 2033 2034 cp = &cpi->cpi_extd[8]; 2035 cp->cp_eax = CPUID_LEAF_EXT_8; 2036 (void) __cpuid_insn(cp); 2037 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp); 2038 } 2039 2040 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2041 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2042 struct cpuid_regs *cp; 2043 2044 cp = &cpi->cpi_extd[0x1e]; 2045 cp->cp_eax = CPUID_LEAF_EXT_1e; 2046 (void) __cpuid_insn(cp); 2047 } 2048 } 2049 2050 /* 2051 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer 2052 * it to everything else. If not, and we're on an AMD system where 8000001e is 2053 * valid, then we use that. Otherwise, we fall back to the default value for the 2054 * APIC ID in leaf 1. 2055 */ 2056 static uint32_t 2057 cpuid_gather_apicid(struct cpuid_info *cpi) 2058 { 2059 /* 2060 * Leaf B changes based on the arguments to it. Because we don't cache 2061 * it, we need to gather it again. 2062 */ 2063 if (cpi->cpi_maxeax >= 0xB) { 2064 struct cpuid_regs regs; 2065 struct cpuid_regs *cp; 2066 2067 cp = &regs; 2068 cp->cp_eax = 0xB; 2069 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 2070 (void) __cpuid_insn(cp); 2071 2072 if (cp->cp_ebx != 0) { 2073 return (cp->cp_edx); 2074 } 2075 } 2076 2077 if (cpi->cpi_vendor == X86_VENDOR_AMD && 2078 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2079 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2080 return (cpi->cpi_extd[0x1e].cp_eax); 2081 } 2082 2083 return (CPI_APIC_ID(cpi)); 2084 } 2085 2086 /* 2087 * For AMD processors, attempt to calculate the number of chips and cores that 2088 * exist. The way that we do this varies based on the generation, because the 2089 * generations themselves have changed dramatically. 2090 * 2091 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores. 2092 * However, with the advent of family 17h (Zen) it actually tells us the number 2093 * of threads, so we need to look at leaf 0x8000001e if available to determine 2094 * its value. Otherwise, for all prior families, the number of enabled cores is 2095 * the same as threads. 2096 * 2097 * If we do not have leaf 0x80000008, then we assume that this processor does 2098 * not have anything. AMD's older CPUID specification says there's no reason to 2099 * fall back to leaf 1. 2100 * 2101 * In some virtualization cases we will not have leaf 8000001e or it will be 2102 * zero. When that happens we assume the number of threads is one. 2103 */ 2104 static void 2105 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2106 { 2107 uint_t nthreads, nthread_per_core; 2108 2109 nthreads = nthread_per_core = 1; 2110 2111 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2112 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1; 2113 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2114 nthreads = CPI_CPU_COUNT(cpi); 2115 } 2116 2117 /* 2118 * For us to have threads, and know about it, we have to be at least at 2119 * family 17h and have the cpuid bit that says we have extended 2120 * topology.
2121 */ 2122 if (cpi->cpi_family >= 0x17 && 2123 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2124 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2125 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2126 } 2127 2128 *ncpus = nthreads; 2129 *ncores = nthreads / nthread_per_core; 2130 } 2131 2132 /* 2133 * Seed the initial values for the cores and threads for an Intel-based 2134 * processor. These values will be overwritten if we detect that the processor 2135 * supports CPUID leaf 0xb. 2136 */ 2137 static void 2138 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2139 { 2140 /* 2141 * Only seed the number of physical cores from the first level leaf 4 2142 * information. The number of threads there indicates how many share the 2143 * L1 cache, which may or may not have anything to do with the number of 2144 * logical CPUs per core. 2145 */ 2146 if (cpi->cpi_maxeax >= 4) { 2147 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1; 2148 } else { 2149 *ncores = 1; 2150 } 2151 2152 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2153 *ncpus = CPI_CPU_COUNT(cpi); 2154 } else { 2155 *ncpus = *ncores; 2156 } 2157 } 2158 2159 static boolean_t 2160 cpuid_leafB_getids(cpu_t *cpu) 2161 { 2162 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2163 struct cpuid_regs regs; 2164 struct cpuid_regs *cp; 2165 2166 if (cpi->cpi_maxeax < 0xB) 2167 return (B_FALSE); 2168 2169 cp = &regs; 2170 cp->cp_eax = 0xB; 2171 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 2172 2173 (void) __cpuid_insn(cp); 2174 2175 /* 2176 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which 2177 * indicates that the extended topology enumeration leaf is 2178 * available. 2179 */ 2180 if (cp->cp_ebx != 0) { 2181 uint32_t x2apic_id = 0; 2182 uint_t coreid_shift = 0; 2183 uint_t ncpu_per_core = 1; 2184 uint_t chipid_shift = 0; 2185 uint_t ncpu_per_chip = 1; 2186 uint_t i; 2187 uint_t level; 2188 2189 for (i = 0; i < CPI_FNB_ECX_MAX; i++) { 2190 cp->cp_eax = 0xB; 2191 cp->cp_ecx = i; 2192 2193 (void) __cpuid_insn(cp); 2194 level = CPI_CPU_LEVEL_TYPE(cp); 2195 2196 if (level == 1) { 2197 x2apic_id = cp->cp_edx; 2198 coreid_shift = BITX(cp->cp_eax, 4, 0); 2199 ncpu_per_core = BITX(cp->cp_ebx, 15, 0); 2200 } else if (level == 2) { 2201 x2apic_id = cp->cp_edx; 2202 chipid_shift = BITX(cp->cp_eax, 4, 0); 2203 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0); 2204 } 2205 } 2206 2207 /* 2208 * cpi_apicid is taken care of in cpuid_gather_apicid. 2209 */ 2210 cpi->cpi_ncpu_per_chip = ncpu_per_chip; 2211 cpi->cpi_ncore_per_chip = ncpu_per_chip / 2212 ncpu_per_core; 2213 cpi->cpi_chipid = x2apic_id >> chipid_shift; 2214 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1); 2215 cpi->cpi_coreid = x2apic_id >> coreid_shift; 2216 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2217 cpi->cpi_procnodeid = cpi->cpi_chipid; 2218 cpi->cpi_compunitid = cpi->cpi_coreid; 2219 2220 if (coreid_shift > 0 && chipid_shift > coreid_shift) { 2221 cpi->cpi_nthread_bits = coreid_shift; 2222 cpi->cpi_ncore_bits = chipid_shift - coreid_shift; 2223 } 2224 2225 return (B_TRUE); 2226 } else { 2227 return (B_FALSE); 2228 } 2229 } 2230 2231 static void 2232 cpuid_intel_getids(cpu_t *cpu, void *feature) 2233 { 2234 uint_t i; 2235 uint_t chipid_shift = 0; 2236 uint_t coreid_shift = 0; 2237 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2238 2239 /* 2240 * There are no compute units or processor nodes currently on Intel. 2241 * Always set these to one.
2242 */ 2243 cpi->cpi_procnodes_per_pkg = 1; 2244 cpi->cpi_cores_per_compunit = 1; 2245 2246 /* 2247 * If cpuid Leaf B is present, use that to try and get this information. 2248 * It will be the most accurate for Intel CPUs. 2249 */ 2250 if (cpuid_leafB_getids(cpu)) 2251 return; 2252 2253 /* 2254 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip 2255 * and ncore_per_chip. These represent the largest power of two values 2256 * that we need to cover all of the IDs in the system. Therefore, we use 2257 * those values to seed the number of bits needed to cover information 2258 * in the case when leaf B is not available. These values will probably 2259 * be larger than required, but that's OK. 2260 */ 2261 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip); 2262 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip); 2263 2264 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1) 2265 chipid_shift++; 2266 2267 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift; 2268 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1); 2269 2270 if (is_x86_feature(feature, X86FSET_CMP)) { 2271 /* 2272 * Multi-core (and possibly multi-threaded) 2273 * processors. 2274 */ 2275 uint_t ncpu_per_core; 2276 if (cpi->cpi_ncore_per_chip == 1) 2277 ncpu_per_core = cpi->cpi_ncpu_per_chip; 2278 else if (cpi->cpi_ncore_per_chip > 1) 2279 ncpu_per_core = cpi->cpi_ncpu_per_chip / 2280 cpi->cpi_ncore_per_chip; 2281 /* 2282 * 8bit APIC IDs on dual core Pentiums 2283 * look like this: 2284 * 2285 * +-----------------------+------+------+ 2286 * | Physical Package ID | MC | HT | 2287 * +-----------------------+------+------+ 2288 * <------- chipid --------> 2289 * <------- coreid ---------------> 2290 * <--- clogid --> 2291 * <------> 2292 * pkgcoreid 2293 * 2294 * Where the number of bits necessary to 2295 * represent MC and HT fields together equals 2296 * to the minimum number of bits necessary to 2297 * store the value of cpi->cpi_ncpu_per_chip. 2298 * Of those bits, the MC part uses the number 2299 * of bits necessary to store the value of 2300 * cpi->cpi_ncore_per_chip. 2301 */ 2302 for (i = 1; i < ncpu_per_core; i <<= 1) 2303 coreid_shift++; 2304 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift; 2305 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2306 } else if (is_x86_feature(feature, X86FSET_HTT)) { 2307 /* 2308 * Single-core multi-threaded processors. 2309 */ 2310 cpi->cpi_coreid = cpi->cpi_chipid; 2311 cpi->cpi_pkgcoreid = 0; 2312 } else { 2313 /* 2314 * Single-core single-thread processors. 2315 */ 2316 cpi->cpi_coreid = cpu->cpu_id; 2317 cpi->cpi_pkgcoreid = 0; 2318 } 2319 cpi->cpi_procnodeid = cpi->cpi_chipid; 2320 cpi->cpi_compunitid = cpi->cpi_coreid; 2321 } 2322 2323 /* 2324 * Historically, AMD has had CMP chips with only a single thread per core. 2325 * However, starting in family 17h (Zen), this has changed and they now have 2326 * multiple threads. Our internal core id needs to be a unique value. 2327 * 2328 * To determine the core id of an AMD system, if we're from a family before 17h, 2329 * then we just use the cpu id, as that gives us a good value that will be 2330 * unique for each core. If instead, we're on family 17h or later, then we need 2331 * to do something more complicated. CPUID leaf 0x8000001e can tell us 2332 * how many threads are in the system. Based on that, we'll shift the APIC ID. 2333 * We can't use the normal core id in that leaf as it's only unique within the 2334 * socket, which is perfect for cpi_pkgcoreid, but not us. 
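 * As a worked example (illustrative APIC ID values), with two threads per
 * core the thread number occupies the low bit of the APIC ID, so shifting it
 * off yields a value that is unique per core:
 *
 *	apicid 6 (binary 0110) >> 1 == 3
 *	apicid 7 (binary 0111) >> 1 == 3	(same core as apicid 6)
 *	apicid 8 (binary 1000) >> 1 == 4	(next core)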
2335 */ 2336 static id_t 2337 cpuid_amd_get_coreid(cpu_t *cpu) 2338 { 2339 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2340 2341 if (cpi->cpi_family >= 0x17 && 2342 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2343 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2344 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2345 if (nthreads > 1) { 2346 VERIFY3U(nthreads, ==, 2); 2347 return (cpi->cpi_apicid >> 1); 2348 } 2349 } 2350 2351 return (cpu->cpu_id); 2352 } 2353 2354 /* 2355 * Determining IDs on AMD is a more challenging task. This is notable because of the 2356 * following two facts: 2357 * 2358 * 1. Before family 0x17 (Zen), there was no support for SMT and there was 2359 * also no way to get an actual unique core id from the system. As such, we 2360 * synthesize this case by using cpu->cpu_id. This scheme does not, 2361 * however, guarantee that sibling cores of a chip will have sequential 2362 * coreids starting at a multiple of the number of cores per chip - that is 2363 * usually the case, but if the ACPI MADT table is presented in a different 2364 * order then we need to perform a few more gymnastics for the pkgcoreid. 2365 * 2366 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups 2367 * called compute units. These compute units share the L1I cache, L2 cache, 2368 * and the FPU. To deal with this, a new topology leaf was added in 2369 * 0x8000001e. However, parts of this leaf have different meanings 2370 * once we get to family 0x17. 2371 */ 2372 2373 static void 2374 cpuid_amd_getids(cpu_t *cpu, uchar_t *features) 2375 { 2376 int i, first_half, coreidsz; 2377 uint32_t nb_caps_reg; 2378 uint_t node2_1; 2379 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2380 struct cpuid_regs *cp; 2381 2382 /* 2383 * Calculate the core id (this comes from hardware in family 0x17 if it 2384 * hasn't been stripped by virtualization). We always set the compute 2385 * unit id to the same value. Also, initialize the default number of 2386 * cores per compute unit and nodes per package. This will be 2387 * overwritten when we know information about a particular family. 2388 */ 2389 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu); 2390 cpi->cpi_compunitid = cpi->cpi_coreid; 2391 cpi->cpi_cores_per_compunit = 1; 2392 cpi->cpi_procnodes_per_pkg = 1; 2393 2394 /* 2395 * To construct the logical ID, we need to determine how many APIC IDs 2396 * are dedicated to the cores and threads. This is provided for us in 2397 * 0x80000008. However, if it's not present (say due to virtualization), 2398 * then we assume it's one. This should be present on all 64-bit AMD 2399 * processors. It was added in family 0xf (Hammer). 2400 */ 2401 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2402 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12); 2403 2404 /* 2405 * In AMD parlance chip is really a node while illumos 2406 * uses chip as equivalent to socket/package. 2407 */ 2408 if (coreidsz == 0) { 2409 /* Use legacy method */ 2410 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1) 2411 coreidsz++; 2412 if (coreidsz == 0) 2413 coreidsz = 1; 2414 } 2415 } else { 2416 /* Assume single-core part */ 2417 coreidsz = 1; 2418 } 2419 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1); 2420 2421 /* 2422 * The package core ID varies depending on the family. While it may be 2423 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately, 2424 * this value is the core id in the given node.
For non-virtualized 2425 * family 17h, we need to take the logical core id and shift off the 2426 * threads like we do when getting the core id. Otherwise, we can use 2427 * the clogid as is. When family 17h is virtualized, the clogid should 2428 * be sufficient as if we don't have valid data in the leaf, then we 2429 * won't think we have SMT, in which case the cpi_clogid should be 2430 * sufficient. 2431 */ 2432 if (cpi->cpi_family >= 0x17 && 2433 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2434 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e && 2435 cpi->cpi_extd[0x1e].cp_ebx != 0) { 2436 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2437 if (nthreads > 1) { 2438 VERIFY3U(nthreads, ==, 2); 2439 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1; 2440 } else { 2441 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2442 } 2443 } else { 2444 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2445 } 2446 2447 /* 2448 * Obtain the node ID and compute unit IDs. If we're on family 0x15 2449 * (bulldozer) or newer, then we can derive all of this from leaf 2450 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family. 2451 */ 2452 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2453 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2454 cp = &cpi->cpi_extd[0x1e]; 2455 2456 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1; 2457 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0); 2458 2459 /* 2460 * For Bulldozer-era CPUs, recalculate the compute unit 2461 * information. 2462 */ 2463 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) { 2464 cpi->cpi_cores_per_compunit = 2465 BITX(cp->cp_ebx, 15, 8) + 1; 2466 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) + 2467 (cpi->cpi_ncore_per_chip / 2468 cpi->cpi_cores_per_compunit) * 2469 (cpi->cpi_procnodeid / 2470 cpi->cpi_procnodes_per_pkg); 2471 } 2472 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) { 2473 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7; 2474 } else if (cpi->cpi_family == 0x10) { 2475 /* 2476 * See if we are a multi-node processor. 2477 * All processors in the system have the same number of nodes 2478 */ 2479 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8); 2480 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) { 2481 /* Single-node */ 2482 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5, 2483 coreidsz); 2484 } else { 2485 2486 /* 2487 * Multi-node revision D (2 nodes per package 2488 * are supported) 2489 */ 2490 cpi->cpi_procnodes_per_pkg = 2; 2491 2492 first_half = (cpi->cpi_pkgcoreid <= 2493 (cpi->cpi_ncore_per_chip/2 - 1)); 2494 2495 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) { 2496 /* We are BSP */ 2497 cpi->cpi_procnodeid = (first_half ? 
0 : 1); 2498 } else { 2499 2500 /* We are AP */ 2501 /* NodeId[2:1] bits to use for reading F3xe8 */ 2502 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1; 2503 2504 nb_caps_reg = 2505 pci_getl_func(0, 24 + node2_1, 3, 0xe8); 2506 2507 /* 2508 * Check IntNodeNum bit (31:30, but bit 31 is 2509 * always 0 on dual-node processors) 2510 */ 2511 if (BITX(nb_caps_reg, 30, 30) == 0) 2512 cpi->cpi_procnodeid = node2_1 + 2513 !first_half; 2514 else 2515 cpi->cpi_procnodeid = node2_1 + 2516 first_half; 2517 } 2518 } 2519 } else { 2520 cpi->cpi_procnodeid = 0; 2521 } 2522 2523 cpi->cpi_chipid = 2524 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg; 2525 2526 cpi->cpi_ncore_bits = coreidsz; 2527 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip / 2528 cpi->cpi_ncore_per_chip); 2529 } 2530 2531 static void 2532 spec_uarch_flush_noop(void) 2533 { 2534 } 2535 2536 /* 2537 * When microcode is present that mitigates MDS, this wrmsr will also flush the 2538 * MDS-related micro-architectural state that would normally happen by calling 2539 * x86_md_clear(). 2540 */ 2541 static void 2542 spec_uarch_flush_msr(void) 2543 { 2544 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); 2545 } 2546 2547 /* 2548 * This function points to a function that will flush certain 2549 * micro-architectural state on the processor. This flush is used to mitigate 2550 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This 2551 * function can point to one of three functions: 2552 * 2553 * - A noop which is done because we either are vulnerable, but do not have 2554 * microcode available to help deal with a fix, or because we aren't 2555 * vulnerable. 2556 * 2557 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to 2558 * mitigate MDS is present, also perform the equivalent of the MDS flush; 2559 * however, it only flushes the MDS related micro-architectural state on the 2560 * current hyperthread, it does not do anything for the twin. 2561 * 2562 * - x86_md_clear which will flush the MDS related state. This is done when we 2563 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF 2564 * (RDCL_NO is set). 2565 */ 2566 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop; 2567 2568 static void 2569 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset) 2570 { 2571 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2572 2573 /* 2574 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS) 2575 * has been fixed in hardware, it doesn't cover everything related to 2576 * MDS. Therefore we can only rely on MDS_NO to determine that we don't 2577 * need to mitigate this. 2578 */ 2579 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2580 is_x86_feature(featureset, X86FSET_MDS_NO)) { 2581 return; 2582 } 2583 2584 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2585 const uint8_t nop = NOP_INSTR; 2586 uint8_t *md = (uint8_t *)x86_md_clear; 2587 2588 *md = nop; 2589 } 2590 2591 membar_producer(); 2592 } 2593 2594 static void 2595 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset) 2596 { 2597 boolean_t need_l1d, need_mds; 2598 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2599 2600 /* 2601 * If we're not on Intel or we've mitigated both RDCL and MDS in 2602 * hardware, then there's nothing left for us to do for enabling the 2603 * flush. We can also go ahead and say that SMT exclusion is 2604 * unnecessary. 
2605 */ 2606 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2607 (is_x86_feature(featureset, X86FSET_RDCL_NO) && 2608 is_x86_feature(featureset, X86FSET_MDS_NO))) { 2609 extern int smt_exclusion; 2610 smt_exclusion = 0; 2611 spec_uarch_flush = spec_uarch_flush_noop; 2612 membar_producer(); 2613 return; 2614 } 2615 2616 /* 2617 * The locations where we need to perform an L1D flush are required both 2618 * for mitigating L1TF and MDS. When verw support is present in 2619 * microcode, then the L1D flush will take care of doing that as well. 2620 * However, if we have a system where RDCL_NO is present, but we don't 2621 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full 2622 * L1D flush. 2623 */ 2624 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) && 2625 is_x86_feature(featureset, X86FSET_FLUSH_CMD) && 2626 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) { 2627 need_l1d = B_TRUE; 2628 } else { 2629 need_l1d = B_FALSE; 2630 } 2631 2632 if (!is_x86_feature(featureset, X86FSET_MDS_NO) && 2633 is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2634 need_mds = B_TRUE; 2635 } else { 2636 need_mds = B_FALSE; 2637 } 2638 2639 if (need_l1d) { 2640 spec_uarch_flush = spec_uarch_flush_msr; 2641 } else if (need_mds) { 2642 spec_uarch_flush = x86_md_clear; 2643 } else { 2644 /* 2645 * We have no hardware mitigations available to us. 2646 */ 2647 spec_uarch_flush = spec_uarch_flush_noop; 2648 } 2649 membar_producer(); 2650 } 2651 2652 /* 2653 * We default to enabling RSB mitigations. 2654 */ 2655 static void 2656 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit) 2657 { 2658 const uint8_t ret = RET_INSTR; 2659 uint8_t *stuff = (uint8_t *)x86_rsb_stuff; 2660 2661 switch (mit) { 2662 case X86_SPECTREV2_ENHANCED_IBRS: 2663 case X86_SPECTREV2_DISABLED: 2664 *stuff = ret; 2665 break; 2666 default: 2667 break; 2668 } 2669 } 2670 2671 static void 2672 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit) 2673 { 2674 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi", 2675 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13", 2676 "_r14", "_r15" }; 2677 const uint_t nthunks = ARRAY_SIZE(thunks); 2678 const char *type; 2679 uint_t i; 2680 2681 if (mit == x86_spectrev2_mitigation) 2682 return; 2683 2684 switch (mit) { 2685 case X86_SPECTREV2_RETPOLINE: 2686 type = "gen"; 2687 break; 2688 case X86_SPECTREV2_RETPOLINE_AMD: 2689 type = "amd"; 2690 break; 2691 case X86_SPECTREV2_ENHANCED_IBRS: 2692 case X86_SPECTREV2_DISABLED: 2693 type = "jmp"; 2694 break; 2695 default: 2696 panic("asked to updated retpoline state with unknown state!"); 2697 } 2698 2699 for (i = 0; i < nthunks; i++) { 2700 uintptr_t source, dest; 2701 int ssize, dsize; 2702 char sourcebuf[64], destbuf[64]; 2703 size_t len; 2704 2705 (void) snprintf(destbuf, sizeof (destbuf), 2706 "__x86_indirect_thunk%s", thunks[i]); 2707 (void) snprintf(sourcebuf, sizeof (sourcebuf), 2708 "__x86_indirect_thunk_%s%s", type, thunks[i]); 2709 2710 source = kobj_getelfsym(sourcebuf, NULL, &ssize); 2711 dest = kobj_getelfsym(destbuf, NULL, &dsize); 2712 VERIFY3U(source, !=, 0); 2713 VERIFY3U(dest, !=, 0); 2714 VERIFY3S(dsize, >=, ssize); 2715 bcopy((void *)source, (void *)dest, ssize); 2716 } 2717 } 2718 2719 static void 2720 cpuid_enable_enhanced_ibrs(void) 2721 { 2722 uint64_t val; 2723 2724 val = rdmsr(MSR_IA32_SPEC_CTRL); 2725 val |= IA32_SPEC_CTRL_IBRS; 2726 wrmsr(MSR_IA32_SPEC_CTRL, val); 2727 } 2728 2729 #ifndef __xpv 2730 /* 2731 * Determine whether or not we can use the AMD optimized retpoline 2732 * functionality. 
We use this when we know we're on an AMD system and we can 2733 * successfully verify that lfence is dispatch serializing. 2734 */ 2735 static boolean_t 2736 cpuid_use_amd_retpoline(struct cpuid_info *cpi) 2737 { 2738 uint64_t val; 2739 on_trap_data_t otd; 2740 2741 if (cpi->cpi_vendor != X86_VENDOR_AMD) 2742 return (B_FALSE); 2743 2744 /* 2745 * We need to determine whether or not lfence is serializing. It always 2746 * is on families 0xf and 0x11. On others, it's controlled by 2747 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a 2748 * crazy old family, don't try and do anything. 2749 */ 2750 if (cpi->cpi_family < 0xf) 2751 return (B_FALSE); 2752 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) 2753 return (B_TRUE); 2754 2755 /* 2756 * While it may be tempting to use get_hwenv(), there are no promises 2757 * that a hypervisor will actually declare themselves to be so in a 2758 * friendly way. As such, try to read and set the MSR. If we can then 2759 * read back the value we set (it wasn't just set to zero), then we go 2760 * for it. 2761 */ 2762 if (!on_trap(&otd, OT_DATA_ACCESS)) { 2763 val = rdmsr(MSR_AMD_DECODE_CONFIG); 2764 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH; 2765 wrmsr(MSR_AMD_DECODE_CONFIG, val); 2766 val = rdmsr(MSR_AMD_DECODE_CONFIG); 2767 } else { 2768 val = 0; 2769 } 2770 no_trap(); 2771 2772 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0) 2773 return (B_TRUE); 2774 return (B_FALSE); 2775 } 2776 #endif /* !__xpv */ 2777 2778 /* 2779 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if 2780 * we can disable TSX, we do so. 2781 * 2782 * This determination is done only on the boot CPU, potentially after loading 2783 * updated microcode. 2784 */ 2785 static void 2786 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset) 2787 { 2788 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2789 2790 VERIFY(cpu->cpu_id == 0); 2791 2792 if (cpi->cpi_vendor != X86_VENDOR_Intel) { 2793 x86_taa_mitigation = X86_TAA_HW_MITIGATED; 2794 return; 2795 } 2796 2797 if (x86_disable_taa) { 2798 x86_taa_mitigation = X86_TAA_DISABLED; 2799 return; 2800 } 2801 2802 /* 2803 * If we do not have the ability to disable TSX, then our only 2804 * mitigation options are in hardware (TAA_NO), or by using our existing 2805 * MDS mitigation as described above. The latter relies upon us having 2806 * configured MDS mitigations correctly! This includes disabling SMT if 2807 * we want to cross-CPU-thread protection. 2808 */ 2809 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) { 2810 /* 2811 * It's not clear whether any parts will enumerate TAA_NO 2812 * *without* TSX_CTRL, but let's mark it as such if we see this. 2813 */ 2814 if (is_x86_feature(featureset, X86FSET_TAA_NO)) { 2815 x86_taa_mitigation = X86_TAA_HW_MITIGATED; 2816 return; 2817 } 2818 2819 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) && 2820 !is_x86_feature(featureset, X86FSET_MDS_NO)) { 2821 x86_taa_mitigation = X86_TAA_MD_CLEAR; 2822 } else { 2823 x86_taa_mitigation = X86_TAA_NOTHING; 2824 } 2825 return; 2826 } 2827 2828 /* 2829 * We have TSX_CTRL, but we can only fully disable TSX if we're early 2830 * enough in boot. 2831 * 2832 * Otherwise, we'll fall back to causing transactions to abort as our 2833 * mitigation. TSX-using code will always take the fallback path. 
2834 */ 2835 if (cpi->cpi_pass < 4) { 2836 x86_taa_mitigation = X86_TAA_TSX_DISABLE; 2837 } else { 2838 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT; 2839 } 2840 } 2841 2842 /* 2843 * As mentioned, we should only touch the MSR when we've got a suitable 2844 * microcode loaded on this CPU. 2845 */ 2846 static void 2847 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset) 2848 { 2849 uint64_t val; 2850 2851 switch (taa) { 2852 case X86_TAA_TSX_DISABLE: 2853 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) 2854 return; 2855 val = rdmsr(MSR_IA32_TSX_CTRL); 2856 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE; 2857 wrmsr(MSR_IA32_TSX_CTRL, val); 2858 break; 2859 case X86_TAA_TSX_FORCE_ABORT: 2860 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) 2861 return; 2862 val = rdmsr(MSR_IA32_TSX_CTRL); 2863 val |= IA32_TSX_CTRL_RTM_DISABLE; 2864 wrmsr(MSR_IA32_TSX_CTRL, val); 2865 break; 2866 case X86_TAA_HW_MITIGATED: 2867 case X86_TAA_MD_CLEAR: 2868 case X86_TAA_DISABLED: 2869 case X86_TAA_NOTHING: 2870 break; 2871 } 2872 } 2873 2874 static void 2875 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) 2876 { 2877 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2878 x86_spectrev2_mitigation_t v2mit; 2879 2880 if (cpi->cpi_vendor == X86_VENDOR_AMD && 2881 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2882 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB) 2883 add_x86_feature(featureset, X86FSET_IBPB); 2884 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS) 2885 add_x86_feature(featureset, X86FSET_IBRS); 2886 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP) 2887 add_x86_feature(featureset, X86FSET_STIBP); 2888 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL) 2889 add_x86_feature(featureset, X86FSET_STIBP_ALL); 2890 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD) 2891 add_x86_feature(featureset, X86FSET_SSBD); 2892 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD) 2893 add_x86_feature(featureset, X86FSET_SSBD_VIRT); 2894 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO) 2895 add_x86_feature(featureset, X86FSET_SSB_NO); 2896 /* 2897 * Don't enable enhanced IBRS unless we're told that we should 2898 * prefer it and it has the same semantics as Intel. This is 2899 * split into two bits rather than a single one. 2900 */ 2901 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) && 2902 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) { 2903 add_x86_feature(featureset, X86FSET_IBRS_ALL); 2904 } 2905 2906 } else if (cpi->cpi_vendor == X86_VENDOR_Intel && 2907 cpi->cpi_maxeax >= 7) { 2908 struct cpuid_regs *ecp; 2909 ecp = &cpi->cpi_std[7]; 2910 2911 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) { 2912 add_x86_feature(featureset, X86FSET_MD_CLEAR); 2913 } 2914 2915 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) { 2916 add_x86_feature(featureset, X86FSET_IBRS); 2917 add_x86_feature(featureset, X86FSET_IBPB); 2918 } 2919 2920 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) { 2921 add_x86_feature(featureset, X86FSET_STIBP); 2922 } 2923 2924 /* 2925 * Don't read the arch caps MSR on xpv where we lack the 2926 * on_trap(). 2927 */ 2928 #ifndef __xpv 2929 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) { 2930 on_trap_data_t otd; 2931 2932 /* 2933 * Be paranoid and assume we'll get a #GP. 
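 *
 * A condensed sketch of the pattern used below (mirroring the
 * MSR_AMD_DECODE_CONFIG probe earlier in this file):
 *
 *	if (!on_trap(&otd, OT_DATA_ACCESS))
 *		reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
 *	no_trap();
 *
 * If the read faults, we simply never look at the bits and therefore
 * never advertise any of the corresponding capabilities.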
2934 */ 2935 if (!on_trap(&otd, OT_DATA_ACCESS)) { 2936 uint64_t reg; 2937 2938 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES); 2939 if (reg & IA32_ARCH_CAP_RDCL_NO) { 2940 add_x86_feature(featureset, 2941 X86FSET_RDCL_NO); 2942 } 2943 if (reg & IA32_ARCH_CAP_IBRS_ALL) { 2944 add_x86_feature(featureset, 2945 X86FSET_IBRS_ALL); 2946 } 2947 if (reg & IA32_ARCH_CAP_RSBA) { 2948 add_x86_feature(featureset, 2949 X86FSET_RSBA); 2950 } 2951 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) { 2952 add_x86_feature(featureset, 2953 X86FSET_L1D_VM_NO); 2954 } 2955 if (reg & IA32_ARCH_CAP_SSB_NO) { 2956 add_x86_feature(featureset, 2957 X86FSET_SSB_NO); 2958 } 2959 if (reg & IA32_ARCH_CAP_MDS_NO) { 2960 add_x86_feature(featureset, 2961 X86FSET_MDS_NO); 2962 } 2963 if (reg & IA32_ARCH_CAP_TSX_CTRL) { 2964 add_x86_feature(featureset, 2965 X86FSET_TSX_CTRL); 2966 } 2967 if (reg & IA32_ARCH_CAP_TAA_NO) { 2968 add_x86_feature(featureset, 2969 X86FSET_TAA_NO); 2970 } 2971 } 2972 no_trap(); 2973 } 2974 #endif /* !__xpv */ 2975 2976 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD) 2977 add_x86_feature(featureset, X86FSET_SSBD); 2978 2979 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD) 2980 add_x86_feature(featureset, X86FSET_FLUSH_CMD); 2981 } 2982 2983 /* 2984 * Take care of certain mitigations on the non-boot CPU. The boot CPU 2985 * will have already run this function and determined what we need to 2986 * do. This gives us a hook for per-HW thread mitigations such as 2987 * enhanced IBRS, or disabling TSX. 2988 */ 2989 if (cpu->cpu_id != 0) { 2990 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) { 2991 cpuid_enable_enhanced_ibrs(); 2992 } 2993 2994 cpuid_apply_tsx(x86_taa_mitigation, featureset); 2995 return; 2996 } 2997 2998 /* 2999 * Go through and initialize various security mechanisms that we should 3000 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and 3001 * TAA. 3002 */ 3003 3004 /* 3005 * By default we've come in with retpolines enabled. Check whether we 3006 * should disable them or enable enhanced IBRS. RSB stuffing is enabled 3007 * by default, but disabled if we are using enhanced IBRS. 3008 */ 3009 if (x86_disable_spectrev2 != 0) { 3010 v2mit = X86_SPECTREV2_DISABLED; 3011 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) { 3012 cpuid_enable_enhanced_ibrs(); 3013 v2mit = X86_SPECTREV2_ENHANCED_IBRS; 3014 #ifndef __xpv 3015 } else if (cpuid_use_amd_retpoline(cpi)) { 3016 v2mit = X86_SPECTREV2_RETPOLINE_AMD; 3017 #endif /* !__xpv */ 3018 } else { 3019 v2mit = X86_SPECTREV2_RETPOLINE; 3020 } 3021 3022 cpuid_patch_retpolines(v2mit); 3023 cpuid_patch_rsb(v2mit); 3024 x86_spectrev2_mitigation = v2mit; 3025 membar_producer(); 3026 3027 /* 3028 * We need to determine what changes are required for mitigating L1TF 3029 * and MDS. If the CPU suffers from either of them, then SMT exclusion 3030 * is required. 3031 * 3032 * If any of these are present, then we need to flush u-arch state at 3033 * various points. For MDS, we need to do so whenever we change to a 3034 * lesser privilege level or we are halting the CPU. For L1TF we need to 3035 * flush the L1D cache at VM entry. When we have microcode that handles 3036 * MDS, the L1D flush also clears the other u-arch state that the 3037 * md_clear does. 3038 */ 3039 3040 /* 3041 * Update whether or not we need to be taking explicit action against 3042 * MDS. 3043 */ 3044 cpuid_update_md_clear(cpu, featureset); 3045 3046 /* 3047 * Determine whether SMT exclusion is required and whether or not we 3048 * need to perform an l1d flush. 
3049 */ 3050 cpuid_update_l1d_flush(cpu, featureset); 3051 3052 /* 3053 * Determine what our mitigation strategy should be for TAA and then 3054 * also apply TAA mitigations. 3055 */ 3056 cpuid_update_tsx(cpu, featureset); 3057 cpuid_apply_tsx(x86_taa_mitigation, featureset); 3058 } 3059 3060 /* 3061 * Setup XFeature_Enabled_Mask register. Required by xsave feature. 3062 */ 3063 void 3064 setup_xfem(void) 3065 { 3066 uint64_t flags = XFEATURE_LEGACY_FP; 3067 3068 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE)); 3069 3070 if (is_x86_feature(x86_featureset, X86FSET_SSE)) 3071 flags |= XFEATURE_SSE; 3072 3073 if (is_x86_feature(x86_featureset, X86FSET_AVX)) 3074 flags |= XFEATURE_AVX; 3075 3076 if (is_x86_feature(x86_featureset, X86FSET_AVX512F)) 3077 flags |= XFEATURE_AVX512; 3078 3079 set_xcr(XFEATURE_ENABLED_MASK, flags); 3080 3081 xsave_bv_all = flags; 3082 } 3083 3084 static void 3085 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset) 3086 { 3087 struct cpuid_info *cpi; 3088 3089 cpi = cpu->cpu_m.mcpu_cpi; 3090 3091 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 3092 cpuid_gather_amd_topology_leaves(cpu); 3093 } 3094 3095 cpi->cpi_apicid = cpuid_gather_apicid(cpi); 3096 3097 /* 3098 * Before we can calculate the IDs that we should assign to this 3099 * processor, we need to understand how many cores and threads it has. 3100 */ 3101 switch (cpi->cpi_vendor) { 3102 case X86_VENDOR_Intel: 3103 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip, 3104 &cpi->cpi_ncore_per_chip); 3105 break; 3106 case X86_VENDOR_AMD: 3107 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip, 3108 &cpi->cpi_ncore_per_chip); 3109 break; 3110 default: 3111 /* 3112 * If we have some other x86 compatible chip, it's not clear how 3113 * they would behave. The most common case is virtualization 3114 * today, though there are also 64-bit VIA chips. Assume that 3115 * all we can get is the basic Leaf 1 HTT information. 3116 */ 3117 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 3118 cpi->cpi_ncore_per_chip = 1; 3119 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi); 3120 } 3121 break; 3122 } 3123 3124 /* 3125 * Based on the calculated number of threads and cores, potentially 3126 * assign the HTT and CMT features. 3127 */ 3128 if (cpi->cpi_ncore_per_chip > 1) { 3129 add_x86_feature(featureset, X86FSET_CMP); 3130 } 3131 3132 if (cpi->cpi_ncpu_per_chip > 1 && 3133 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) { 3134 add_x86_feature(featureset, X86FSET_HTT); 3135 } 3136 3137 /* 3138 * Now that has been set up, we need to go through and calculate all of 3139 * the rest of the parameters that exist. If we think the CPU doesn't 3140 * have either SMT (HTT) or CMP, then we basically go through and fake 3141 * up information in some way. The most likely case for this is 3142 * virtualization where we have a lot of partial topology information. 3143 */ 3144 if (!is_x86_feature(featureset, X86FSET_HTT) && 3145 !is_x86_feature(featureset, X86FSET_CMP)) { 3146 /* 3147 * This is a single core, single-threaded processor. 
3148 */ 3149 cpi->cpi_procnodes_per_pkg = 1; 3150 cpi->cpi_cores_per_compunit = 1; 3151 cpi->cpi_compunitid = 0; 3152 cpi->cpi_chipid = -1; 3153 cpi->cpi_clogid = 0; 3154 cpi->cpi_coreid = cpu->cpu_id; 3155 cpi->cpi_pkgcoreid = 0; 3156 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 3157 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0); 3158 } else { 3159 cpi->cpi_procnodeid = cpi->cpi_chipid; 3160 } 3161 } else { 3162 switch (cpi->cpi_vendor) { 3163 case X86_VENDOR_Intel: 3164 cpuid_intel_getids(cpu, featureset); 3165 break; 3166 case X86_VENDOR_AMD: 3167 cpuid_amd_getids(cpu, featureset); 3168 break; 3169 default: 3170 /* 3171 * In this case, it's hard to say what we should do. 3172 * We're going to model them to the OS as single core 3173 * threads. We don't have a good identifier for them, so 3174 * we're just going to use the cpu id all on a single 3175 * chip. 3176 * 3177 * This case has historically been different from the 3178 * case above where we don't have HTT or CMP. While they 3179 * could be combined, we've opted to keep it separate to 3180 * minimize the risk of topology changes in weird cases. 3181 */ 3182 cpi->cpi_procnodes_per_pkg = 1; 3183 cpi->cpi_cores_per_compunit = 1; 3184 cpi->cpi_chipid = 0; 3185 cpi->cpi_coreid = cpu->cpu_id; 3186 cpi->cpi_clogid = cpu->cpu_id; 3187 cpi->cpi_pkgcoreid = cpu->cpu_id; 3188 cpi->cpi_procnodeid = cpi->cpi_chipid; 3189 cpi->cpi_compunitid = cpi->cpi_coreid; 3190 break; 3191 } 3192 } 3193 } 3194 3195 /* 3196 * Gather relevant CPU features from leaf 6 which covers thermal information. We 3197 * always gather leaf 6 if it's supported; however, we only look for features on 3198 * Intel systems as AMD does not currently define any of the features we look 3199 * for below. 3200 */ 3201 static void 3202 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset) 3203 { 3204 struct cpuid_regs *cp; 3205 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 3206 3207 if (cpi->cpi_maxeax < 6) { 3208 return; 3209 } 3210 3211 cp = &cpi->cpi_std[6]; 3212 cp->cp_eax = 6; 3213 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0; 3214 (void) __cpuid_insn(cp); 3215 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp); 3216 3217 if (cpi->cpi_vendor != X86_VENDOR_Intel) { 3218 return; 3219 } 3220 3221 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) { 3222 add_x86_feature(featureset, X86FSET_CORE_THERMAL); 3223 } 3224 3225 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) { 3226 add_x86_feature(featureset, X86FSET_PKG_THERMAL); 3227 } 3228 } 3229 3230 void 3231 cpuid_pass1(cpu_t *cpu, uchar_t *featureset) 3232 { 3233 uint32_t mask_ecx, mask_edx; 3234 struct cpuid_info *cpi; 3235 struct cpuid_regs *cp; 3236 int xcpuid; 3237 #if !defined(__xpv) 3238 extern int idle_cpu_prefer_mwait; 3239 #endif 3240 3241 /* 3242 * Space statically allocated for BSP, ensure pointer is set 3243 */ 3244 if (cpu->cpu_id == 0) { 3245 if (cpu->cpu_m.mcpu_cpi == NULL) 3246 cpu->cpu_m.mcpu_cpi = &cpuid_info0; 3247 } 3248 3249 add_x86_feature(featureset, X86FSET_CPUID); 3250 3251 cpi = cpu->cpu_m.mcpu_cpi; 3252 ASSERT(cpi != NULL); 3253 cp = &cpi->cpi_std[0]; 3254 cp->cp_eax = 0; 3255 cpi->cpi_maxeax = __cpuid_insn(cp); 3256 { 3257 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr; 3258 *iptr++ = cp->cp_ebx; 3259 *iptr++ = cp->cp_edx; 3260 *iptr++ = cp->cp_ecx; 3261 *(char *)&cpi->cpi_vendorstr[12] = '\0'; 3262 } 3263 3264 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr); 3265 x86_vendor = cpi->cpi_vendor; /* for compatibility */ 3266 3267 /* 3268 * Limit the range in case of weird hardware 3269 */ 3270 if 
(cpi->cpi_maxeax > CPI_MAXEAX_MAX) 3271 cpi->cpi_maxeax = CPI_MAXEAX_MAX; 3272 if (cpi->cpi_maxeax < 1) 3273 goto pass1_done; 3274 3275 cp = &cpi->cpi_std[1]; 3276 cp->cp_eax = 1; 3277 (void) __cpuid_insn(cp); 3278 3279 /* 3280 * Extract identifying constants for easy access. 3281 */ 3282 cpi->cpi_model = CPI_MODEL(cpi); 3283 cpi->cpi_family = CPI_FAMILY(cpi); 3284 3285 if (cpi->cpi_family == 0xf) 3286 cpi->cpi_family += CPI_FAMILY_XTD(cpi); 3287 3288 /* 3289 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf. 3290 * Intel, and presumably everyone else, uses model == 0xf, as 3291 * one would expect (max value means possible overflow). Sigh. 3292 */ 3293 3294 switch (cpi->cpi_vendor) { 3295 case X86_VENDOR_Intel: 3296 if (IS_EXTENDED_MODEL_INTEL(cpi)) 3297 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3298 break; 3299 case X86_VENDOR_AMD: 3300 if (CPI_FAMILY(cpi) == 0xf) 3301 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3302 break; 3303 default: 3304 if (cpi->cpi_model == 0xf) 3305 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3306 break; 3307 } 3308 3309 cpi->cpi_step = CPI_STEP(cpi); 3310 cpi->cpi_brandid = CPI_BRANDID(cpi); 3311 3312 /* 3313 * *default* assumptions: 3314 * - believe %edx feature word 3315 * - ignore %ecx feature word 3316 * - 32-bit virtual and physical addressing 3317 */ 3318 mask_edx = 0xffffffff; 3319 mask_ecx = 0; 3320 3321 cpi->cpi_pabits = cpi->cpi_vabits = 32; 3322 3323 switch (cpi->cpi_vendor) { 3324 case X86_VENDOR_Intel: 3325 if (cpi->cpi_family == 5) 3326 x86_type = X86_TYPE_P5; 3327 else if (IS_LEGACY_P6(cpi)) { 3328 x86_type = X86_TYPE_P6; 3329 pentiumpro_bug4046376 = 1; 3330 /* 3331 * Clear the SEP bit when it was set erroneously 3332 */ 3333 if (cpi->cpi_model < 3 && cpi->cpi_step < 3) 3334 cp->cp_edx &= ~CPUID_INTC_EDX_SEP; 3335 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) { 3336 x86_type = X86_TYPE_P4; 3337 /* 3338 * We don't currently depend on any of the %ecx 3339 * features until Prescott, so we'll only check 3340 * this from P4 onwards. We might want to revisit 3341 * that idea later. 3342 */ 3343 mask_ecx = 0xffffffff; 3344 } else if (cpi->cpi_family > 0xf) 3345 mask_ecx = 0xffffffff; 3346 /* 3347 * We don't support MONITOR/MWAIT if leaf 5 is not available 3348 * to obtain the monitor linesize. 3349 */ 3350 if (cpi->cpi_maxeax < 5) 3351 mask_ecx &= ~CPUID_INTC_ECX_MON; 3352 break; 3353 case X86_VENDOR_IntelClone: 3354 default: 3355 break; 3356 case X86_VENDOR_AMD: 3357 #if defined(OPTERON_ERRATUM_108) 3358 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) { 3359 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0; 3360 cpi->cpi_model = 0xc; 3361 } else 3362 #endif 3363 if (cpi->cpi_family == 5) { 3364 /* 3365 * AMD K5 and K6 3366 * 3367 * These CPUs have an incomplete implementation 3368 * of MCA/MCE which we mask away. 3369 */ 3370 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA); 3371 3372 /* 3373 * Model 0 uses the wrong (APIC) bit 3374 * to indicate PGE. Fix it here. 3375 */ 3376 if (cpi->cpi_model == 0) { 3377 if (cp->cp_edx & 0x200) { 3378 cp->cp_edx &= ~0x200; 3379 cp->cp_edx |= CPUID_INTC_EDX_PGE; 3380 } 3381 } 3382 3383 /* 3384 * Early models had problems w/ MMX; disable. 3385 */ 3386 if (cpi->cpi_model < 6) 3387 mask_edx &= ~CPUID_INTC_EDX_MMX; 3388 } 3389 3390 /* 3391 * For newer families, SSE3 and CX16, at least, are valid; 3392 * enable all 3393 */ 3394 if (cpi->cpi_family >= 0xf) 3395 mask_ecx = 0xffffffff; 3396 /* 3397 * We don't support MONITOR/MWAIT if leaf 5 is not available 3398 * to obtain the monitor linesize. 
3399 */ 3400 if (cpi->cpi_maxeax < 5) 3401 mask_ecx &= ~CPUID_INTC_ECX_MON; 3402 3403 #if !defined(__xpv) 3404 /* 3405 * AMD has not historically used MWAIT in the CPU's idle loop. 3406 * Pre-family-10h Opterons do not have the MWAIT instruction. We 3407 * know for certain that in at least family 17h, per AMD, mwait 3408 * is preferred. Families in-between are less certain. 3409 */ 3410 if (cpi->cpi_family < 0x17) { 3411 idle_cpu_prefer_mwait = 0; 3412 } 3413 #endif 3414 3415 break; 3416 case X86_VENDOR_TM: 3417 /* 3418 * workaround the NT workaround in CMS 4.1 3419 */ 3420 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 && 3421 (cpi->cpi_step == 2 || cpi->cpi_step == 3)) 3422 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3423 break; 3424 case X86_VENDOR_Centaur: 3425 /* 3426 * workaround the NT workarounds again 3427 */ 3428 if (cpi->cpi_family == 6) 3429 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3430 break; 3431 case X86_VENDOR_Cyrix: 3432 /* 3433 * We rely heavily on the probing in locore 3434 * to actually figure out what parts, if any, 3435 * of the Cyrix cpuid instruction to believe. 3436 */ 3437 switch (x86_type) { 3438 case X86_TYPE_CYRIX_486: 3439 mask_edx = 0; 3440 break; 3441 case X86_TYPE_CYRIX_6x86: 3442 mask_edx = 0; 3443 break; 3444 case X86_TYPE_CYRIX_6x86L: 3445 mask_edx = 3446 CPUID_INTC_EDX_DE | 3447 CPUID_INTC_EDX_CX8; 3448 break; 3449 case X86_TYPE_CYRIX_6x86MX: 3450 mask_edx = 3451 CPUID_INTC_EDX_DE | 3452 CPUID_INTC_EDX_MSR | 3453 CPUID_INTC_EDX_CX8 | 3454 CPUID_INTC_EDX_PGE | 3455 CPUID_INTC_EDX_CMOV | 3456 CPUID_INTC_EDX_MMX; 3457 break; 3458 case X86_TYPE_CYRIX_GXm: 3459 mask_edx = 3460 CPUID_INTC_EDX_MSR | 3461 CPUID_INTC_EDX_CX8 | 3462 CPUID_INTC_EDX_CMOV | 3463 CPUID_INTC_EDX_MMX; 3464 break; 3465 case X86_TYPE_CYRIX_MediaGX: 3466 break; 3467 case X86_TYPE_CYRIX_MII: 3468 case X86_TYPE_VIA_CYRIX_III: 3469 mask_edx = 3470 CPUID_INTC_EDX_DE | 3471 CPUID_INTC_EDX_TSC | 3472 CPUID_INTC_EDX_MSR | 3473 CPUID_INTC_EDX_CX8 | 3474 CPUID_INTC_EDX_PGE | 3475 CPUID_INTC_EDX_CMOV | 3476 CPUID_INTC_EDX_MMX; 3477 break; 3478 default: 3479 break; 3480 } 3481 break; 3482 } 3483 3484 #if defined(__xpv) 3485 /* 3486 * Do not support MONITOR/MWAIT under a hypervisor 3487 */ 3488 mask_ecx &= ~CPUID_INTC_ECX_MON; 3489 /* 3490 * Do not support XSAVE under a hypervisor for now 3491 */ 3492 xsave_force_disable = B_TRUE; 3493 3494 #endif /* __xpv */ 3495 3496 if (xsave_force_disable) { 3497 mask_ecx &= ~CPUID_INTC_ECX_XSAVE; 3498 mask_ecx &= ~CPUID_INTC_ECX_AVX; 3499 mask_ecx &= ~CPUID_INTC_ECX_F16C; 3500 mask_ecx &= ~CPUID_INTC_ECX_FMA; 3501 } 3502 3503 /* 3504 * Now we've figured out the masks that determine 3505 * which bits we choose to believe, apply the masks 3506 * to the feature words, then map the kernel's view 3507 * of these feature words into its feature word. 3508 */ 3509 cp->cp_edx &= mask_edx; 3510 cp->cp_ecx &= mask_ecx; 3511 3512 /* 3513 * apply any platform restrictions (we don't call this 3514 * immediately after __cpuid_insn here, because we need the 3515 * workarounds applied above first) 3516 */ 3517 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp); 3518 3519 /* 3520 * In addition to ecx and edx, Intel and AMD are storing a bunch of 3521 * instruction set extensions in leaf 7's ebx, ecx, and edx. 3522 */ 3523 if (cpi->cpi_maxeax >= 7) { 3524 struct cpuid_regs *ecp; 3525 ecp = &cpi->cpi_std[7]; 3526 ecp->cp_eax = 7; 3527 ecp->cp_ecx = 0; 3528 (void) __cpuid_insn(ecp); 3529 3530 /* 3531 * If XSAVE has been disabled, just ignore all of the 3532 * extended-save-area dependent flags here. 
3533 */ 3534 if (xsave_force_disable) { 3535 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 3536 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 3537 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 3538 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX; 3539 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512; 3540 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512; 3541 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512; 3542 } 3543 3544 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP) 3545 add_x86_feature(featureset, X86FSET_SMEP); 3546 3547 /* 3548 * We check disable_smap here in addition to in startup_smap() 3549 * to ensure CPUs that aren't the boot CPU don't accidentally 3550 * include it in the feature set and thus generate a mismatched 3551 * x86 feature set across CPUs. 3552 */ 3553 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP && 3554 disable_smap == 0) 3555 add_x86_feature(featureset, X86FSET_SMAP); 3556 3557 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED) 3558 add_x86_feature(featureset, X86FSET_RDSEED); 3559 3560 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX) 3561 add_x86_feature(featureset, X86FSET_ADX); 3562 3563 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 3564 add_x86_feature(featureset, X86FSET_FSGSBASE); 3565 3566 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 3567 add_x86_feature(featureset, X86FSET_CLFLUSHOPT); 3568 3569 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 3570 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID) 3571 add_x86_feature(featureset, X86FSET_INVPCID); 3572 3573 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX) 3574 add_x86_feature(featureset, X86FSET_MPX); 3575 3576 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB) 3577 add_x86_feature(featureset, X86FSET_CLWB); 3578 } 3579 } 3580 3581 /* 3582 * fold in overrides from the "eeprom" mechanism 3583 */ 3584 cp->cp_edx |= cpuid_feature_edx_include; 3585 cp->cp_edx &= ~cpuid_feature_edx_exclude; 3586 3587 cp->cp_ecx |= cpuid_feature_ecx_include; 3588 cp->cp_ecx &= ~cpuid_feature_ecx_exclude; 3589 3590 if (cp->cp_edx & CPUID_INTC_EDX_PSE) { 3591 add_x86_feature(featureset, X86FSET_LARGEPAGE); 3592 } 3593 if (cp->cp_edx & CPUID_INTC_EDX_TSC) { 3594 add_x86_feature(featureset, X86FSET_TSC); 3595 } 3596 if (cp->cp_edx & CPUID_INTC_EDX_MSR) { 3597 add_x86_feature(featureset, X86FSET_MSR); 3598 } 3599 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) { 3600 add_x86_feature(featureset, X86FSET_MTRR); 3601 } 3602 if (cp->cp_edx & CPUID_INTC_EDX_PGE) { 3603 add_x86_feature(featureset, X86FSET_PGE); 3604 } 3605 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) { 3606 add_x86_feature(featureset, X86FSET_CMOV); 3607 } 3608 if (cp->cp_edx & CPUID_INTC_EDX_MMX) { 3609 add_x86_feature(featureset, X86FSET_MMX); 3610 } 3611 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 && 3612 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) { 3613 add_x86_feature(featureset, X86FSET_MCA); 3614 } 3615 if (cp->cp_edx & CPUID_INTC_EDX_PAE) { 3616 add_x86_feature(featureset, X86FSET_PAE); 3617 } 3618 if (cp->cp_edx & CPUID_INTC_EDX_CX8) { 3619 add_x86_feature(featureset, X86FSET_CX8); 3620 } 3621 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) { 3622 add_x86_feature(featureset, X86FSET_CX16); 3623 } 3624 if (cp->cp_edx & CPUID_INTC_EDX_PAT) { 3625 add_x86_feature(featureset, X86FSET_PAT); 3626 } 3627 if (cp->cp_edx & CPUID_INTC_EDX_SEP) { 3628 add_x86_feature(featureset, X86FSET_SEP); 3629 } 3630 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) { 3631 /* 3632 * In our implementation, fxsave/fxrstor 3633 * are prerequisites before we'll even 3634 * try and do SSE things. 
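 *
 * Note that everything from SSE through the XSAVE/AVX/AVX-512 checks
 * is nested under this FXSR test, so a part that clears
 * CPUID_INTC_EDX_FXSR will not have any of those features advertised,
 * regardless of what the individual CPUID bits claim.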
3635 */ 3636 if (cp->cp_edx & CPUID_INTC_EDX_SSE) { 3637 add_x86_feature(featureset, X86FSET_SSE); 3638 } 3639 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) { 3640 add_x86_feature(featureset, X86FSET_SSE2); 3641 } 3642 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) { 3643 add_x86_feature(featureset, X86FSET_SSE3); 3644 } 3645 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) { 3646 add_x86_feature(featureset, X86FSET_SSSE3); 3647 } 3648 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) { 3649 add_x86_feature(featureset, X86FSET_SSE4_1); 3650 } 3651 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) { 3652 add_x86_feature(featureset, X86FSET_SSE4_2); 3653 } 3654 if (cp->cp_ecx & CPUID_INTC_ECX_AES) { 3655 add_x86_feature(featureset, X86FSET_AES); 3656 } 3657 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) { 3658 add_x86_feature(featureset, X86FSET_PCLMULQDQ); 3659 } 3660 3661 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA) 3662 add_x86_feature(featureset, X86FSET_SHA); 3663 3664 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP) 3665 add_x86_feature(featureset, X86FSET_UMIP); 3666 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU) 3667 add_x86_feature(featureset, X86FSET_PKU); 3668 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE) 3669 add_x86_feature(featureset, X86FSET_OSPKE); 3670 3671 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) { 3672 add_x86_feature(featureset, X86FSET_XSAVE); 3673 3674 /* We only test AVX & AVX512 when there is XSAVE */ 3675 3676 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) { 3677 add_x86_feature(featureset, 3678 X86FSET_AVX); 3679 3680 /* 3681 * Intel says we can't check these without also 3682 * checking AVX. 3683 */ 3684 if (cp->cp_ecx & CPUID_INTC_ECX_F16C) 3685 add_x86_feature(featureset, 3686 X86FSET_F16C); 3687 3688 if (cp->cp_ecx & CPUID_INTC_ECX_FMA) 3689 add_x86_feature(featureset, 3690 X86FSET_FMA); 3691 3692 if (cpi->cpi_std[7].cp_ebx & 3693 CPUID_INTC_EBX_7_0_BMI1) 3694 add_x86_feature(featureset, 3695 X86FSET_BMI1); 3696 3697 if (cpi->cpi_std[7].cp_ebx & 3698 CPUID_INTC_EBX_7_0_BMI2) 3699 add_x86_feature(featureset, 3700 X86FSET_BMI2); 3701 3702 if (cpi->cpi_std[7].cp_ebx & 3703 CPUID_INTC_EBX_7_0_AVX2) 3704 add_x86_feature(featureset, 3705 X86FSET_AVX2); 3706 } 3707 3708 if (cpi->cpi_vendor == X86_VENDOR_Intel && 3709 (cpi->cpi_std[7].cp_ebx & 3710 CPUID_INTC_EBX_7_0_AVX512F) != 0) { 3711 add_x86_feature(featureset, X86FSET_AVX512F); 3712 3713 if (cpi->cpi_std[7].cp_ebx & 3714 CPUID_INTC_EBX_7_0_AVX512DQ) 3715 add_x86_feature(featureset, 3716 X86FSET_AVX512DQ); 3717 if (cpi->cpi_std[7].cp_ebx & 3718 CPUID_INTC_EBX_7_0_AVX512IFMA) 3719 add_x86_feature(featureset, 3720 X86FSET_AVX512FMA); 3721 if (cpi->cpi_std[7].cp_ebx & 3722 CPUID_INTC_EBX_7_0_AVX512PF) 3723 add_x86_feature(featureset, 3724 X86FSET_AVX512PF); 3725 if (cpi->cpi_std[7].cp_ebx & 3726 CPUID_INTC_EBX_7_0_AVX512ER) 3727 add_x86_feature(featureset, 3728 X86FSET_AVX512ER); 3729 if (cpi->cpi_std[7].cp_ebx & 3730 CPUID_INTC_EBX_7_0_AVX512CD) 3731 add_x86_feature(featureset, 3732 X86FSET_AVX512CD); 3733 if (cpi->cpi_std[7].cp_ebx & 3734 CPUID_INTC_EBX_7_0_AVX512BW) 3735 add_x86_feature(featureset, 3736 X86FSET_AVX512BW); 3737 if (cpi->cpi_std[7].cp_ebx & 3738 CPUID_INTC_EBX_7_0_AVX512VL) 3739 add_x86_feature(featureset, 3740 X86FSET_AVX512VL); 3741 3742 if (cpi->cpi_std[7].cp_ecx & 3743 CPUID_INTC_ECX_7_0_AVX512VBMI) 3744 add_x86_feature(featureset, 3745 X86FSET_AVX512VBMI); 3746 if (cpi->cpi_std[7].cp_ecx & 3747 CPUID_INTC_ECX_7_0_AVX512VNNI) 3748 add_x86_feature(featureset, 3749 X86FSET_AVX512VNNI); 3750 if (cpi->cpi_std[7].cp_ecx & 3751 
CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 3752 add_x86_feature(featureset, 3753 X86FSET_AVX512VPOPCDQ); 3754 3755 if (cpi->cpi_std[7].cp_edx & 3756 CPUID_INTC_EDX_7_0_AVX5124NNIW) 3757 add_x86_feature(featureset, 3758 X86FSET_AVX512NNIW); 3759 if (cpi->cpi_std[7].cp_edx & 3760 CPUID_INTC_EDX_7_0_AVX5124FMAPS) 3761 add_x86_feature(featureset, 3762 X86FSET_AVX512FMAPS); 3763 } 3764 } 3765 } 3766 3767 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 3768 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) { 3769 add_x86_feature(featureset, X86FSET_PCID); 3770 } 3771 } 3772 3773 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) { 3774 add_x86_feature(featureset, X86FSET_X2APIC); 3775 } 3776 if (cp->cp_edx & CPUID_INTC_EDX_DE) { 3777 add_x86_feature(featureset, X86FSET_DE); 3778 } 3779 #if !defined(__xpv) 3780 if (cp->cp_ecx & CPUID_INTC_ECX_MON) { 3781 3782 /* 3783 * We require the CLFLUSH instruction for erratum workaround 3784 * to use MONITOR/MWAIT. 3785 */ 3786 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { 3787 cpi->cpi_mwait.support |= MWAIT_SUPPORT; 3788 add_x86_feature(featureset, X86FSET_MWAIT); 3789 } else { 3790 extern int idle_cpu_assert_cflush_monitor; 3791 3792 /* 3793 * All processors we are aware of which have 3794 * MONITOR/MWAIT also have CLFLUSH. 3795 */ 3796 if (idle_cpu_assert_cflush_monitor) { 3797 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) && 3798 (cp->cp_edx & CPUID_INTC_EDX_CLFSH)); 3799 } 3800 } 3801 } 3802 #endif /* __xpv */ 3803 3804 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) { 3805 add_x86_feature(featureset, X86FSET_VMX); 3806 } 3807 3808 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND) 3809 add_x86_feature(featureset, X86FSET_RDRAND); 3810 3811 /* 3812 * Only need it first time, rest of the cpus would follow suit. 3813 * we only capture this for the bootcpu. 3814 */ 3815 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { 3816 add_x86_feature(featureset, X86FSET_CLFSH); 3817 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8); 3818 } 3819 if (is_x86_feature(featureset, X86FSET_PAE)) 3820 cpi->cpi_pabits = 36; 3821 3822 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) { 3823 struct cpuid_regs r, *ecp; 3824 3825 ecp = &r; 3826 ecp->cp_eax = 0xD; 3827 ecp->cp_ecx = 1; 3828 ecp->cp_edx = ecp->cp_ebx = 0; 3829 (void) __cpuid_insn(ecp); 3830 3831 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT) 3832 add_x86_feature(featureset, X86FSET_XSAVEOPT); 3833 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC) 3834 add_x86_feature(featureset, X86FSET_XSAVEC); 3835 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES) 3836 add_x86_feature(featureset, X86FSET_XSAVES); 3837 } 3838 3839 /* 3840 * Work on the "extended" feature information, doing 3841 * some basic initialization for cpuid_pass2() 3842 */ 3843 xcpuid = 0; 3844 switch (cpi->cpi_vendor) { 3845 case X86_VENDOR_Intel: 3846 /* 3847 * On KVM we know we will have proper support for extended 3848 * cpuid. 3849 */ 3850 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf || 3851 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 && 3852 (cpi->cpi_model == 6 || cpi->cpi_model == 2))) 3853 xcpuid++; 3854 break; 3855 case X86_VENDOR_AMD: 3856 if (cpi->cpi_family > 5 || 3857 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 3858 xcpuid++; 3859 break; 3860 case X86_VENDOR_Cyrix: 3861 /* 3862 * Only these Cyrix CPUs are -known- to support 3863 * extended cpuid operations. 
3864 */ 3865 if (x86_type == X86_TYPE_VIA_CYRIX_III || 3866 x86_type == X86_TYPE_CYRIX_GXm) 3867 xcpuid++; 3868 break; 3869 case X86_VENDOR_Centaur: 3870 case X86_VENDOR_TM: 3871 default: 3872 xcpuid++; 3873 break; 3874 } 3875 3876 if (xcpuid) { 3877 cp = &cpi->cpi_extd[0]; 3878 cp->cp_eax = CPUID_LEAF_EXT_0; 3879 cpi->cpi_xmaxeax = __cpuid_insn(cp); 3880 } 3881 3882 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) { 3883 3884 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX) 3885 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX; 3886 3887 switch (cpi->cpi_vendor) { 3888 case X86_VENDOR_Intel: 3889 case X86_VENDOR_AMD: 3890 if (cpi->cpi_xmaxeax < 0x80000001) 3891 break; 3892 cp = &cpi->cpi_extd[1]; 3893 cp->cp_eax = 0x80000001; 3894 (void) __cpuid_insn(cp); 3895 3896 if (cpi->cpi_vendor == X86_VENDOR_AMD && 3897 cpi->cpi_family == 5 && 3898 cpi->cpi_model == 6 && 3899 cpi->cpi_step == 6) { 3900 /* 3901 * K6 model 6 uses bit 10 to indicate SYSC 3902 * Later models use bit 11. Fix it here. 3903 */ 3904 if (cp->cp_edx & 0x400) { 3905 cp->cp_edx &= ~0x400; 3906 cp->cp_edx |= CPUID_AMD_EDX_SYSC; 3907 } 3908 } 3909 3910 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp); 3911 3912 /* 3913 * Compute the additions to the kernel's feature word. 3914 */ 3915 if (cp->cp_edx & CPUID_AMD_EDX_NX) { 3916 add_x86_feature(featureset, X86FSET_NX); 3917 } 3918 3919 /* 3920 * Regardless whether or not we boot 64-bit, 3921 * we should have a way to identify whether 3922 * the CPU is capable of running 64-bit. 3923 */ 3924 if (cp->cp_edx & CPUID_AMD_EDX_LM) { 3925 add_x86_feature(featureset, X86FSET_64); 3926 } 3927 3928 /* 1 GB large page - enable only for 64 bit kernel */ 3929 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) { 3930 add_x86_feature(featureset, X86FSET_1GPG); 3931 } 3932 3933 if ((cpi->cpi_vendor == X86_VENDOR_AMD) && 3934 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) && 3935 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) { 3936 add_x86_feature(featureset, X86FSET_SSE4A); 3937 } 3938 3939 /* 3940 * It's really tricky to support syscall/sysret in 3941 * the i386 kernel; we rely on sysenter/sysexit 3942 * instead. In the amd64 kernel, things are -way- 3943 * better. 3944 */ 3945 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) { 3946 add_x86_feature(featureset, X86FSET_ASYSC); 3947 } 3948 3949 /* 3950 * While we're thinking about system calls, note 3951 * that AMD processors don't support sysenter 3952 * in long mode at all, so don't try to program them. 3953 */ 3954 if (x86_vendor == X86_VENDOR_AMD) { 3955 remove_x86_feature(featureset, X86FSET_SEP); 3956 } 3957 3958 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) { 3959 add_x86_feature(featureset, X86FSET_TSCP); 3960 } 3961 3962 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) { 3963 add_x86_feature(featureset, X86FSET_SVM); 3964 } 3965 3966 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) { 3967 add_x86_feature(featureset, X86FSET_TOPOEXT); 3968 } 3969 3970 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) { 3971 add_x86_feature(featureset, X86FSET_AMD_PCEC); 3972 } 3973 3974 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) { 3975 add_x86_feature(featureset, X86FSET_XOP); 3976 } 3977 3978 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) { 3979 add_x86_feature(featureset, X86FSET_FMA4); 3980 } 3981 3982 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) { 3983 add_x86_feature(featureset, X86FSET_TBM); 3984 } 3985 3986 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) { 3987 add_x86_feature(featureset, X86FSET_MONITORX); 3988 } 3989 break; 3990 default: 3991 break; 3992 } 3993 3994 /* 3995 * Get CPUID data about processor cores and hyperthreads. 
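 *
 * On both Intel and AMD, extended leaf 0x80000008 (when present) also
 * supplies the physical and virtual address widths extracted below;
 * as a hypothetical example, %eax == 0x3030 would yield
 * cpi_pabits == 48 and cpi_vabits == 48.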
3996 */ 3997 switch (cpi->cpi_vendor) { 3998 case X86_VENDOR_Intel: 3999 if (cpi->cpi_maxeax >= 4) { 4000 cp = &cpi->cpi_std[4]; 4001 cp->cp_eax = 4; 4002 cp->cp_ecx = 0; 4003 (void) __cpuid_insn(cp); 4004 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp); 4005 } 4006 /*FALLTHROUGH*/ 4007 case X86_VENDOR_AMD: 4008 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) 4009 break; 4010 cp = &cpi->cpi_extd[8]; 4011 cp->cp_eax = CPUID_LEAF_EXT_8; 4012 (void) __cpuid_insn(cp); 4013 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, 4014 cp); 4015 4016 /* 4017 * AMD uses ebx for some extended functions. 4018 */ 4019 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 4020 /* 4021 * While we're here, check for the AMD "Error 4022 * Pointer Zero/Restore" feature. This can be 4023 * used to setup the FP save handlers 4024 * appropriately. 4025 */ 4026 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 4027 cpi->cpi_fp_amd_save = 0; 4028 } else { 4029 cpi->cpi_fp_amd_save = 1; 4030 } 4031 4032 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) { 4033 add_x86_feature(featureset, 4034 X86FSET_CLZERO); 4035 } 4036 } 4037 4038 /* 4039 * Virtual and physical address limits from 4040 * cpuid override previously guessed values. 4041 */ 4042 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0); 4043 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8); 4044 break; 4045 default: 4046 break; 4047 } 4048 4049 /* 4050 * Get CPUID data about TSC Invariance in Deep C-State. 4051 */ 4052 switch (cpi->cpi_vendor) { 4053 case X86_VENDOR_Intel: 4054 case X86_VENDOR_AMD: 4055 if (cpi->cpi_maxeax >= 7) { 4056 cp = &cpi->cpi_extd[7]; 4057 cp->cp_eax = 0x80000007; 4058 cp->cp_ecx = 0; 4059 (void) __cpuid_insn(cp); 4060 } 4061 break; 4062 default: 4063 break; 4064 } 4065 } 4066 4067 cpuid_pass1_topology(cpu, featureset); 4068 cpuid_pass1_thermal(cpu, featureset); 4069 4070 /* 4071 * Synthesize chip "revision" and socket type 4072 */ 4073 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family, 4074 cpi->cpi_model, cpi->cpi_step); 4075 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor, 4076 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step); 4077 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family, 4078 cpi->cpi_model, cpi->cpi_step); 4079 4080 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 4081 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 && 4082 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 4083 /* Special handling for AMD FP not necessary. */ 4084 cpi->cpi_fp_amd_save = 0; 4085 } else { 4086 cpi->cpi_fp_amd_save = 1; 4087 } 4088 } 4089 4090 /* 4091 * Check the processor leaves that are used for security features. 4092 */ 4093 cpuid_scan_security(cpu, featureset); 4094 4095 pass1_done: 4096 cpi->cpi_pass = 1; 4097 } 4098 4099 /* 4100 * Make copies of the cpuid table entries we depend on, in 4101 * part for ease of parsing now, in part so that we have only 4102 * one place to correct any of it, in part for ease of 4103 * later export to userland, and in part so we can look at 4104 * this stuff in a crash dump. 
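 *
 * For example (assuming mdb(1) is the debugger pointed at the dump),
 * something along the lines of
 *
 *	> cpuid_info0::print struct cpuid_info cpi_brandstr
 *
 * can be used to inspect the boot CPU's copy.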
4105 */ 4106 4107 /*ARGSUSED*/ 4108 void 4109 cpuid_pass2(cpu_t *cpu) 4110 { 4111 uint_t n, nmax; 4112 int i; 4113 struct cpuid_regs *cp; 4114 uint8_t *dp; 4115 uint32_t *iptr; 4116 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 4117 4118 ASSERT(cpi->cpi_pass == 1); 4119 4120 if (cpi->cpi_maxeax < 1) 4121 goto pass2_done; 4122 4123 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD) 4124 nmax = NMAX_CPI_STD; 4125 /* 4126 * (We already handled n == 0 and n == 1 in pass 1) 4127 */ 4128 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) { 4129 /* 4130 * leaves 6 and 7 were handled in pass 1 4131 */ 4132 if (n == 6 || n == 7) 4133 continue; 4134 4135 cp->cp_eax = n; 4136 4137 /* 4138 * CPUID function 4 expects %ecx to be initialized 4139 * with an index which indicates which cache to return 4140 * information about. The OS is expected to call function 4 4141 * with %ecx set to 0, 1, 2, ... until it returns with 4142 * EAX[4:0] set to 0, which indicates there are no more 4143 * caches. 4144 * 4145 * Here, populate cpi_std[4] with the information returned by 4146 * function 4 when %ecx == 0, and do the rest in cpuid_pass3() 4147 * when dynamic memory allocation becomes available. 4148 * 4149 * Note: we need to explicitly initialize %ecx here, since 4150 * function 4 may have been previously invoked. 4151 */ 4152 if (n == 4) 4153 cp->cp_ecx = 0; 4154 4155 (void) __cpuid_insn(cp); 4156 platform_cpuid_mangle(cpi->cpi_vendor, n, cp); 4157 switch (n) { 4158 case 2: 4159 /* 4160 * "the lower 8 bits of the %eax register 4161 * contain a value that identifies the number 4162 * of times the cpuid [instruction] has to be 4163 * executed to obtain a complete image of the 4164 * processor's caching systems." 4165 * 4166 * How *do* they make this stuff up? 4167 */ 4168 cpi->cpi_ncache = sizeof (*cp) * 4169 BITX(cp->cp_eax, 7, 0); 4170 if (cpi->cpi_ncache == 0) 4171 break; 4172 cpi->cpi_ncache--; /* skip count byte */ 4173 4174 /* 4175 * Well, for now, rather than attempt to implement 4176 * this slightly dubious algorithm, we just look 4177 * at the first 15 .. 4178 */ 4179 if (cpi->cpi_ncache > (sizeof (*cp) - 1)) 4180 cpi->cpi_ncache = sizeof (*cp) - 1; 4181 4182 dp = cpi->cpi_cacheinfo; 4183 if (BITX(cp->cp_eax, 31, 31) == 0) { 4184 uint8_t *p = (void *)&cp->cp_eax; 4185 for (i = 1; i < 4; i++) 4186 if (p[i] != 0) 4187 *dp++ = p[i]; 4188 } 4189 if (BITX(cp->cp_ebx, 31, 31) == 0) { 4190 uint8_t *p = (void *)&cp->cp_ebx; 4191 for (i = 0; i < 4; i++) 4192 if (p[i] != 0) 4193 *dp++ = p[i]; 4194 } 4195 if (BITX(cp->cp_ecx, 31, 31) == 0) { 4196 uint8_t *p = (void *)&cp->cp_ecx; 4197 for (i = 0; i < 4; i++) 4198 if (p[i] != 0) 4199 *dp++ = p[i]; 4200 } 4201 if (BITX(cp->cp_edx, 31, 31) == 0) { 4202 uint8_t *p = (void *)&cp->cp_edx; 4203 for (i = 0; i < 4; i++) 4204 if (p[i] != 0) 4205 *dp++ = p[i]; 4206 } 4207 break; 4208 4209 case 3: /* Processor serial number, if PSN supported */ 4210 break; 4211 4212 case 4: /* Deterministic cache parameters */ 4213 break; 4214 4215 case 5: /* Monitor/Mwait parameters */ 4216 { 4217 size_t mwait_size; 4218 4219 /* 4220 * check cpi_mwait.support which was set in cpuid_pass1 4221 */ 4222 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT)) 4223 break; 4224 4225 /* 4226 * Protect ourself from insane mwait line size. 4227 * Workaround for incomplete hardware emulator(s). 
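 *
 * For example, a hypothetical emulator advertising a maximum
 * monitor-line size of 0 or 6 bytes fails the sizeof (uint32_t) /
 * ISP2() checks below, and we simply won't use MONITOR/MWAIT there.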
4228 */ 4229 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi); 4230 if (mwait_size < sizeof (uint32_t) || 4231 !ISP2(mwait_size)) { 4232 #if DEBUG 4233 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait " 4234 "size %ld", cpu->cpu_id, (long)mwait_size); 4235 #endif 4236 break; 4237 } 4238 4239 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi); 4240 cpi->cpi_mwait.mon_max = mwait_size; 4241 if (MWAIT_EXTENSION(cpi)) { 4242 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS; 4243 if (MWAIT_INT_ENABLE(cpi)) 4244 cpi->cpi_mwait.support |= 4245 MWAIT_ECX_INT_ENABLE; 4246 } 4247 break; 4248 } 4249 default: 4250 break; 4251 } 4252 } 4253 4254 /* 4255 * XSAVE enumeration 4256 */ 4257 if (cpi->cpi_maxeax >= 0xD) { 4258 struct cpuid_regs regs; 4259 boolean_t cpuid_d_valid = B_TRUE; 4260 4261 cp = ®s; 4262 cp->cp_eax = 0xD; 4263 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 4264 4265 (void) __cpuid_insn(cp); 4266 4267 /* 4268 * Sanity checks for debug 4269 */ 4270 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 || 4271 (cp->cp_eax & XFEATURE_SSE) == 0) { 4272 cpuid_d_valid = B_FALSE; 4273 } 4274 4275 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax; 4276 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx; 4277 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx; 4278 4279 /* 4280 * If the hw supports AVX, get the size and offset in the save 4281 * area for the ymm state. 4282 */ 4283 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) { 4284 cp->cp_eax = 0xD; 4285 cp->cp_ecx = 2; 4286 cp->cp_edx = cp->cp_ebx = 0; 4287 4288 (void) __cpuid_insn(cp); 4289 4290 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET || 4291 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) { 4292 cpuid_d_valid = B_FALSE; 4293 } 4294 4295 cpi->cpi_xsave.ymm_size = cp->cp_eax; 4296 cpi->cpi_xsave.ymm_offset = cp->cp_ebx; 4297 } 4298 4299 /* 4300 * If the hw supports MPX, get the size and offset in the 4301 * save area for BNDREGS and BNDCSR. 4302 */ 4303 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) { 4304 cp->cp_eax = 0xD; 4305 cp->cp_ecx = 3; 4306 cp->cp_edx = cp->cp_ebx = 0; 4307 4308 (void) __cpuid_insn(cp); 4309 4310 cpi->cpi_xsave.bndregs_size = cp->cp_eax; 4311 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx; 4312 4313 cp->cp_eax = 0xD; 4314 cp->cp_ecx = 4; 4315 cp->cp_edx = cp->cp_ebx = 0; 4316 4317 (void) __cpuid_insn(cp); 4318 4319 cpi->cpi_xsave.bndcsr_size = cp->cp_eax; 4320 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx; 4321 } 4322 4323 /* 4324 * If the hw supports AVX512, get the size and offset in the 4325 * save area for the opmask registers and zmm state. 
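 *
 * Sub-leaves 5, 6 and 7 of leaf 0xD queried below correspond to the
 * opmask (%k0-%k7), ZMM_Hi256 and Hi16_ZMM state components, which is
 * why the results are stored in the opmask_*, zmmlo_* and zmmhi_*
 * members respectively.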
4326 */ 4327 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) { 4328 cp->cp_eax = 0xD; 4329 cp->cp_ecx = 5; 4330 cp->cp_edx = cp->cp_ebx = 0; 4331 4332 (void) __cpuid_insn(cp); 4333 4334 cpi->cpi_xsave.opmask_size = cp->cp_eax; 4335 cpi->cpi_xsave.opmask_offset = cp->cp_ebx; 4336 4337 cp->cp_eax = 0xD; 4338 cp->cp_ecx = 6; 4339 cp->cp_edx = cp->cp_ebx = 0; 4340 4341 (void) __cpuid_insn(cp); 4342 4343 cpi->cpi_xsave.zmmlo_size = cp->cp_eax; 4344 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx; 4345 4346 cp->cp_eax = 0xD; 4347 cp->cp_ecx = 7; 4348 cp->cp_edx = cp->cp_ebx = 0; 4349 4350 (void) __cpuid_insn(cp); 4351 4352 cpi->cpi_xsave.zmmhi_size = cp->cp_eax; 4353 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx; 4354 } 4355 4356 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) { 4357 xsave_state_size = 0; 4358 } else if (cpuid_d_valid) { 4359 xsave_state_size = cpi->cpi_xsave.xsav_max_size; 4360 } else { 4361 /* Broken CPUID 0xD, probably in HVM */ 4362 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid " 4363 "value: hw_low = %d, hw_high = %d, xsave_size = %d" 4364 ", ymm_size = %d, ymm_offset = %d\n", 4365 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low, 4366 cpi->cpi_xsave.xsav_hw_features_high, 4367 (int)cpi->cpi_xsave.xsav_max_size, 4368 (int)cpi->cpi_xsave.ymm_size, 4369 (int)cpi->cpi_xsave.ymm_offset); 4370 4371 if (xsave_state_size != 0) { 4372 /* 4373 * This must be a non-boot CPU. We cannot 4374 * continue, because boot cpu has already 4375 * enabled XSAVE. 4376 */ 4377 ASSERT(cpu->cpu_id != 0); 4378 cmn_err(CE_PANIC, "cpu%d: we have already " 4379 "enabled XSAVE on boot cpu, cannot " 4380 "continue.", cpu->cpu_id); 4381 } else { 4382 /* 4383 * If we reached here on the boot CPU, it's also 4384 * almost certain that we'll reach here on the 4385 * non-boot CPUs. When we're here on a boot CPU 4386 * we should disable the feature, on a non-boot 4387 * CPU we need to confirm that we have.
4388 */ 4389 if (cpu->cpu_id == 0) { 4390 remove_x86_feature(x86_featureset, 4391 X86FSET_XSAVE); 4392 remove_x86_feature(x86_featureset, 4393 X86FSET_AVX); 4394 remove_x86_feature(x86_featureset, 4395 X86FSET_F16C); 4396 remove_x86_feature(x86_featureset, 4397 X86FSET_BMI1); 4398 remove_x86_feature(x86_featureset, 4399 X86FSET_BMI2); 4400 remove_x86_feature(x86_featureset, 4401 X86FSET_FMA); 4402 remove_x86_feature(x86_featureset, 4403 X86FSET_AVX2); 4404 remove_x86_feature(x86_featureset, 4405 X86FSET_MPX); 4406 remove_x86_feature(x86_featureset, 4407 X86FSET_AVX512F); 4408 remove_x86_feature(x86_featureset, 4409 X86FSET_AVX512DQ); 4410 remove_x86_feature(x86_featureset, 4411 X86FSET_AVX512PF); 4412 remove_x86_feature(x86_featureset, 4413 X86FSET_AVX512ER); 4414 remove_x86_feature(x86_featureset, 4415 X86FSET_AVX512CD); 4416 remove_x86_feature(x86_featureset, 4417 X86FSET_AVX512BW); 4418 remove_x86_feature(x86_featureset, 4419 X86FSET_AVX512VL); 4420 remove_x86_feature(x86_featureset, 4421 X86FSET_AVX512FMA); 4422 remove_x86_feature(x86_featureset, 4423 X86FSET_AVX512VBMI); 4424 remove_x86_feature(x86_featureset, 4425 X86FSET_AVX512VNNI); 4426 remove_x86_feature(x86_featureset, 4427 X86FSET_AVX512VPOPCDQ); 4428 remove_x86_feature(x86_featureset, 4429 X86FSET_AVX512NNIW); 4430 remove_x86_feature(x86_featureset, 4431 X86FSET_AVX512FMAPS); 4432 4433 CPI_FEATURES_ECX(cpi) &= 4434 ~CPUID_INTC_ECX_XSAVE; 4435 CPI_FEATURES_ECX(cpi) &= 4436 ~CPUID_INTC_ECX_AVX; 4437 CPI_FEATURES_ECX(cpi) &= 4438 ~CPUID_INTC_ECX_F16C; 4439 CPI_FEATURES_ECX(cpi) &= 4440 ~CPUID_INTC_ECX_FMA; 4441 CPI_FEATURES_7_0_EBX(cpi) &= 4442 ~CPUID_INTC_EBX_7_0_BMI1; 4443 CPI_FEATURES_7_0_EBX(cpi) &= 4444 ~CPUID_INTC_EBX_7_0_BMI2; 4445 CPI_FEATURES_7_0_EBX(cpi) &= 4446 ~CPUID_INTC_EBX_7_0_AVX2; 4447 CPI_FEATURES_7_0_EBX(cpi) &= 4448 ~CPUID_INTC_EBX_7_0_MPX; 4449 CPI_FEATURES_7_0_EBX(cpi) &= 4450 ~CPUID_INTC_EBX_7_0_ALL_AVX512; 4451 4452 CPI_FEATURES_7_0_ECX(cpi) &= 4453 ~CPUID_INTC_ECX_7_0_ALL_AVX512; 4454 4455 CPI_FEATURES_7_0_EDX(cpi) &= 4456 ~CPUID_INTC_EDX_7_0_ALL_AVX512; 4457 4458 xsave_force_disable = B_TRUE; 4459 } else { 4460 VERIFY(is_x86_feature(x86_featureset, 4461 X86FSET_XSAVE) == B_FALSE); 4462 } 4463 } 4464 } 4465 } 4466 4467 4468 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) 4469 goto pass2_done; 4470 4471 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD) 4472 nmax = NMAX_CPI_EXTD; 4473 /* 4474 * Copy the extended properties, fixing them as we go. 4475 * (We already handled n == 0 and n == 1 in pass 1) 4476 */ 4477 iptr = (void *)cpi->cpi_brandstr; 4478 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) { 4479 cp->cp_eax = CPUID_LEAF_EXT_0 + n; 4480 (void) __cpuid_insn(cp); 4481 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n, 4482 cp); 4483 switch (n) { 4484 case 2: 4485 case 3: 4486 case 4: 4487 /* 4488 * Extract the brand string 4489 */ 4490 *iptr++ = cp->cp_eax; 4491 *iptr++ = cp->cp_ebx; 4492 *iptr++ = cp->cp_ecx; 4493 *iptr++ = cp->cp_edx; 4494 break; 4495 case 5: 4496 switch (cpi->cpi_vendor) { 4497 case X86_VENDOR_AMD: 4498 /* 4499 * The Athlon and Duron were the first 4500 * parts to report the sizes of the 4501 * TLB for large pages. Before then, 4502 * we don't trust the data. 
4503 */ 4504 if (cpi->cpi_family < 6 || 4505 (cpi->cpi_family == 6 && 4506 cpi->cpi_model < 1)) 4507 cp->cp_eax = 0; 4508 break; 4509 default: 4510 break; 4511 } 4512 break; 4513 case 6: 4514 switch (cpi->cpi_vendor) { 4515 case X86_VENDOR_AMD: 4516 /* 4517 * The Athlon and Duron were the first 4518 * AMD parts with L2 TLB's. 4519 * Before then, don't trust the data. 4520 */ 4521 if (cpi->cpi_family < 6 || 4522 cpi->cpi_family == 6 && 4523 cpi->cpi_model < 1) 4524 cp->cp_eax = cp->cp_ebx = 0; 4525 /* 4526 * AMD Duron rev A0 reports L2 4527 * cache size incorrectly as 1K 4528 * when it is really 64K 4529 */ 4530 if (cpi->cpi_family == 6 && 4531 cpi->cpi_model == 3 && 4532 cpi->cpi_step == 0) { 4533 cp->cp_ecx &= 0xffff; 4534 cp->cp_ecx |= 0x400000; 4535 } 4536 break; 4537 case X86_VENDOR_Cyrix: /* VIA C3 */ 4538 /* 4539 * VIA C3 processors are a bit messed 4540 * up w.r.t. encoding cache sizes in %ecx 4541 */ 4542 if (cpi->cpi_family != 6) 4543 break; 4544 /* 4545 * model 7 and 8 were incorrectly encoded 4546 * 4547 * xxx is model 8 really broken? 4548 */ 4549 if (cpi->cpi_model == 7 || 4550 cpi->cpi_model == 8) 4551 cp->cp_ecx = 4552 BITX(cp->cp_ecx, 31, 24) << 16 | 4553 BITX(cp->cp_ecx, 23, 16) << 12 | 4554 BITX(cp->cp_ecx, 15, 8) << 8 | 4555 BITX(cp->cp_ecx, 7, 0); 4556 /* 4557 * model 9 stepping 1 has wrong associativity 4558 */ 4559 if (cpi->cpi_model == 9 && cpi->cpi_step == 1) 4560 cp->cp_ecx |= 8 << 12; 4561 break; 4562 case X86_VENDOR_Intel: 4563 /* 4564 * Extended L2 Cache features function. 4565 * First appeared on Prescott. 4566 */ 4567 default: 4568 break; 4569 } 4570 break; 4571 default: 4572 break; 4573 } 4574 } 4575 4576 pass2_done: 4577 cpi->cpi_pass = 2; 4578 } 4579 4580 static const char * 4581 intel_cpubrand(const struct cpuid_info *cpi) 4582 { 4583 int i; 4584 4585 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4586 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) 4587 return ("i486"); 4588 4589 switch (cpi->cpi_family) { 4590 case 5: 4591 return ("Intel Pentium(r)"); 4592 case 6: 4593 switch (cpi->cpi_model) { 4594 uint_t celeron, xeon; 4595 const struct cpuid_regs *cp; 4596 case 0: 4597 case 1: 4598 case 2: 4599 return ("Intel Pentium(r) Pro"); 4600 case 3: 4601 case 4: 4602 return ("Intel Pentium(r) II"); 4603 case 6: 4604 return ("Intel Celeron(r)"); 4605 case 5: 4606 case 7: 4607 celeron = xeon = 0; 4608 cp = &cpi->cpi_std[2]; /* cache info */ 4609 4610 for (i = 1; i < 4; i++) { 4611 uint_t tmp; 4612 4613 tmp = (cp->cp_eax >> (8 * i)) & 0xff; 4614 if (tmp == 0x40) 4615 celeron++; 4616 if (tmp >= 0x44 && tmp <= 0x45) 4617 xeon++; 4618 } 4619 4620 for (i = 0; i < 2; i++) { 4621 uint_t tmp; 4622 4623 tmp = (cp->cp_ebx >> (8 * i)) & 0xff; 4624 if (tmp == 0x40) 4625 celeron++; 4626 else if (tmp >= 0x44 && tmp <= 0x45) 4627 xeon++; 4628 } 4629 4630 for (i = 0; i < 4; i++) { 4631 uint_t tmp; 4632 4633 tmp = (cp->cp_ecx >> (8 * i)) & 0xff; 4634 if (tmp == 0x40) 4635 celeron++; 4636 else if (tmp >= 0x44 && tmp <= 0x45) 4637 xeon++; 4638 } 4639 4640 for (i = 0; i < 4; i++) { 4641 uint_t tmp; 4642 4643 tmp = (cp->cp_edx >> (8 * i)) & 0xff; 4644 if (tmp == 0x40) 4645 celeron++; 4646 else if (tmp >= 0x44 && tmp <= 0x45) 4647 xeon++; 4648 } 4649 4650 if (celeron) 4651 return ("Intel Celeron(r)"); 4652 if (xeon) 4653 return (cpi->cpi_model == 5 ? 4654 "Intel Pentium(r) II Xeon(tm)" : 4655 "Intel Pentium(r) III Xeon(tm)"); 4656 return (cpi->cpi_model == 5 ? 
4657 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" : 4658 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)"); 4659 default: 4660 break; 4661 } 4662 default: 4663 break; 4664 } 4665 4666 /* BrandID is present if the field is nonzero */ 4667 if (cpi->cpi_brandid != 0) { 4668 static const struct { 4669 uint_t bt_bid; 4670 const char *bt_str; 4671 } brand_tbl[] = { 4672 { 0x1, "Intel(r) Celeron(r)" }, 4673 { 0x2, "Intel(r) Pentium(r) III" }, 4674 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" }, 4675 { 0x4, "Intel(r) Pentium(r) III" }, 4676 { 0x6, "Mobile Intel(r) Pentium(r) III" }, 4677 { 0x7, "Mobile Intel(r) Celeron(r)" }, 4678 { 0x8, "Intel(r) Pentium(r) 4" }, 4679 { 0x9, "Intel(r) Pentium(r) 4" }, 4680 { 0xa, "Intel(r) Celeron(r)" }, 4681 { 0xb, "Intel(r) Xeon(tm)" }, 4682 { 0xc, "Intel(r) Xeon(tm) MP" }, 4683 { 0xe, "Mobile Intel(r) Pentium(r) 4" }, 4684 { 0xf, "Mobile Intel(r) Celeron(r)" }, 4685 { 0x11, "Mobile Genuine Intel(r)" }, 4686 { 0x12, "Intel(r) Celeron(r) M" }, 4687 { 0x13, "Mobile Intel(r) Celeron(r)" }, 4688 { 0x14, "Intel(r) Celeron(r)" }, 4689 { 0x15, "Mobile Genuine Intel(r)" }, 4690 { 0x16, "Intel(r) Pentium(r) M" }, 4691 { 0x17, "Mobile Intel(r) Celeron(r)" } 4692 }; 4693 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]); 4694 uint_t sgn; 4695 4696 sgn = (cpi->cpi_family << 8) | 4697 (cpi->cpi_model << 4) | cpi->cpi_step; 4698 4699 for (i = 0; i < btblmax; i++) 4700 if (brand_tbl[i].bt_bid == cpi->cpi_brandid) 4701 break; 4702 if (i < btblmax) { 4703 if (sgn == 0x6b1 && cpi->cpi_brandid == 3) 4704 return ("Intel(r) Celeron(r)"); 4705 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb) 4706 return ("Intel(r) Xeon(tm) MP"); 4707 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe) 4708 return ("Intel(r) Xeon(tm)"); 4709 return (brand_tbl[i].bt_str); 4710 } 4711 } 4712 4713 return (NULL); 4714 } 4715 4716 static const char * 4717 amd_cpubrand(const struct cpuid_info *cpi) 4718 { 4719 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4720 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) 4721 return ("i486 compatible"); 4722 4723 switch (cpi->cpi_family) { 4724 case 5: 4725 switch (cpi->cpi_model) { 4726 case 0: 4727 case 1: 4728 case 2: 4729 case 3: 4730 case 4: 4731 case 5: 4732 return ("AMD-K5(r)"); 4733 case 6: 4734 case 7: 4735 return ("AMD-K6(r)"); 4736 case 8: 4737 return ("AMD-K6(r)-2"); 4738 case 9: 4739 return ("AMD-K6(r)-III"); 4740 default: 4741 return ("AMD (family 5)"); 4742 } 4743 case 6: 4744 switch (cpi->cpi_model) { 4745 case 1: 4746 return ("AMD-K7(tm)"); 4747 case 0: 4748 case 2: 4749 case 4: 4750 return ("AMD Athlon(tm)"); 4751 case 3: 4752 case 7: 4753 return ("AMD Duron(tm)"); 4754 case 6: 4755 case 8: 4756 case 10: 4757 /* 4758 * Use the L2 cache size to distinguish 4759 */ 4760 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ? 
4761 "AMD Athlon(tm)" : "AMD Duron(tm)"); 4762 default: 4763 return ("AMD (family 6)"); 4764 } 4765 default: 4766 break; 4767 } 4768 4769 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 && 4770 cpi->cpi_brandid != 0) { 4771 switch (BITX(cpi->cpi_brandid, 7, 5)) { 4772 case 3: 4773 return ("AMD Opteron(tm) UP 1xx"); 4774 case 4: 4775 return ("AMD Opteron(tm) DP 2xx"); 4776 case 5: 4777 return ("AMD Opteron(tm) MP 8xx"); 4778 default: 4779 return ("AMD Opteron(tm)"); 4780 } 4781 } 4782 4783 return (NULL); 4784 } 4785 4786 static const char * 4787 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type) 4788 { 4789 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4790 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 || 4791 type == X86_TYPE_CYRIX_486) 4792 return ("i486 compatible"); 4793 4794 switch (type) { 4795 case X86_TYPE_CYRIX_6x86: 4796 return ("Cyrix 6x86"); 4797 case X86_TYPE_CYRIX_6x86L: 4798 return ("Cyrix 6x86L"); 4799 case X86_TYPE_CYRIX_6x86MX: 4800 return ("Cyrix 6x86MX"); 4801 case X86_TYPE_CYRIX_GXm: 4802 return ("Cyrix GXm"); 4803 case X86_TYPE_CYRIX_MediaGX: 4804 return ("Cyrix MediaGX"); 4805 case X86_TYPE_CYRIX_MII: 4806 return ("Cyrix M2"); 4807 case X86_TYPE_VIA_CYRIX_III: 4808 return ("VIA Cyrix M3"); 4809 default: 4810 /* 4811 * Have another wild guess .. 4812 */ 4813 if (cpi->cpi_family == 4 && cpi->cpi_model == 9) 4814 return ("Cyrix 5x86"); 4815 else if (cpi->cpi_family == 5) { 4816 switch (cpi->cpi_model) { 4817 case 2: 4818 return ("Cyrix 6x86"); /* Cyrix M1 */ 4819 case 4: 4820 return ("Cyrix MediaGX"); 4821 default: 4822 break; 4823 } 4824 } else if (cpi->cpi_family == 6) { 4825 switch (cpi->cpi_model) { 4826 case 0: 4827 return ("Cyrix 6x86MX"); /* Cyrix M2? */ 4828 case 5: 4829 case 6: 4830 case 7: 4831 case 8: 4832 case 9: 4833 return ("VIA C3"); 4834 default: 4835 break; 4836 } 4837 } 4838 break; 4839 } 4840 return (NULL); 4841 } 4842 4843 /* 4844 * This only gets called in the case that the CPU extended 4845 * feature brand string (0x80000002, 0x80000003, 0x80000004) 4846 * aren't available, or contain null bytes for some reason. 4847 */ 4848 static void 4849 fabricate_brandstr(struct cpuid_info *cpi) 4850 { 4851 const char *brand = NULL; 4852 4853 switch (cpi->cpi_vendor) { 4854 case X86_VENDOR_Intel: 4855 brand = intel_cpubrand(cpi); 4856 break; 4857 case X86_VENDOR_AMD: 4858 brand = amd_cpubrand(cpi); 4859 break; 4860 case X86_VENDOR_Cyrix: 4861 brand = cyrix_cpubrand(cpi, x86_type); 4862 break; 4863 case X86_VENDOR_NexGen: 4864 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 4865 brand = "NexGen Nx586"; 4866 break; 4867 case X86_VENDOR_Centaur: 4868 if (cpi->cpi_family == 5) 4869 switch (cpi->cpi_model) { 4870 case 4: 4871 brand = "Centaur C6"; 4872 break; 4873 case 8: 4874 brand = "Centaur C2"; 4875 break; 4876 case 9: 4877 brand = "Centaur C3"; 4878 break; 4879 default: 4880 break; 4881 } 4882 break; 4883 case X86_VENDOR_Rise: 4884 if (cpi->cpi_family == 5 && 4885 (cpi->cpi_model == 0 || cpi->cpi_model == 2)) 4886 brand = "Rise mP6"; 4887 break; 4888 case X86_VENDOR_SiS: 4889 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 4890 brand = "SiS 55x"; 4891 break; 4892 case X86_VENDOR_TM: 4893 if (cpi->cpi_family == 5 && cpi->cpi_model == 4) 4894 brand = "Transmeta Crusoe TM3x00 or TM5x00"; 4895 break; 4896 case X86_VENDOR_NSC: 4897 case X86_VENDOR_UMC: 4898 default: 4899 break; 4900 } 4901 if (brand) { 4902 (void) strcpy((char *)cpi->cpi_brandstr, brand); 4903 return; 4904 } 4905 4906 /* 4907 * If all else fails ... 
4908 */ 4909 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr), 4910 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family, 4911 cpi->cpi_model, cpi->cpi_step); 4912 } 4913 4914 /* 4915 * This routine is called just after kernel memory allocation 4916 * becomes available on cpu0, and as part of mp_startup() on 4917 * the other cpus. 4918 * 4919 * Fixup the brand string, and collect any information from cpuid 4920 * that requires dynamically allocated storage to represent. 4921 */ 4922 /*ARGSUSED*/ 4923 void 4924 cpuid_pass3(cpu_t *cpu) 4925 { 4926 int i, max, shft, level, size; 4927 struct cpuid_regs regs; 4928 struct cpuid_regs *cp; 4929 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 4930 4931 ASSERT(cpi->cpi_pass == 2); 4932 4933 /* 4934 * Deterministic cache parameters 4935 * 4936 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The 4937 * values that are present are currently defined to be the same. This 4938 * means we can use the same logic to parse it as long as we use the 4939 * appropriate leaf to get the data. If you're updating this, make sure 4940 * you're careful about which vendor supports which aspect. 4941 * 4942 * Take this opportunity to detect the number of threads sharing the 4943 * last level cache, and construct a corresponding cache id. The 4944 * respective cpuid_info members are initialized to the default case of 4945 * "no last level cache sharing". 4946 */ 4947 cpi->cpi_ncpu_shr_last_cache = 1; 4948 cpi->cpi_last_lvl_cacheid = cpu->cpu_id; 4949 4950 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) || 4951 (cpi->cpi_vendor == X86_VENDOR_AMD && 4952 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d && 4953 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) { 4954 uint32_t leaf; 4955 4956 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 4957 leaf = 4; 4958 } else { 4959 leaf = CPUID_LEAF_EXT_1d; 4960 } 4961 4962 /* 4963 * Find the # of elements (size) returned by the leaf and along 4964 * the way detect last level cache sharing details. 4965 */ 4966 bzero(®s, sizeof (regs)); 4967 cp = ®s; 4968 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) { 4969 cp->cp_eax = leaf; 4970 cp->cp_ecx = i; 4971 4972 (void) __cpuid_insn(cp); 4973 4974 if (CPI_CACHE_TYPE(cp) == 0) 4975 break; 4976 level = CPI_CACHE_LVL(cp); 4977 if (level > max) { 4978 max = level; 4979 cpi->cpi_ncpu_shr_last_cache = 4980 CPI_NTHR_SHR_CACHE(cp) + 1; 4981 } 4982 } 4983 cpi->cpi_cache_leaf_size = size = i; 4984 4985 /* 4986 * Allocate the cpi_cache_leaves array. The first element 4987 * references the regs for the corresponding leaf with %ecx set 4988 * to 0. This was gathered in cpuid_pass2(). 4989 */ 4990 if (size > 0) { 4991 cpi->cpi_cache_leaves = 4992 kmem_alloc(size * sizeof (cp), KM_SLEEP); 4993 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 4994 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4]; 4995 } else { 4996 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d]; 4997 } 4998 4999 /* 5000 * Allocate storage to hold the additional regs 5001 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size. 5002 * 5003 * The regs for the leaf, %ecx == 0 has already 5004 * been allocated as indicated above. 5005 */ 5006 for (i = 1; i < size; i++) { 5007 cp = cpi->cpi_cache_leaves[i] = 5008 kmem_zalloc(sizeof (regs), KM_SLEEP); 5009 cp->cp_eax = leaf; 5010 cp->cp_ecx = i; 5011 5012 (void) __cpuid_insn(cp); 5013 } 5014 } 5015 /* 5016 * Determine the number of bits needed to represent 5017 * the number of CPUs sharing the last level cache. 
5018 * 5019 * Shift off that number of bits from the APIC id to 5020 * derive the cache id. 5021 */ 5022 shft = 0; 5023 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1) 5024 shft++; 5025 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft; 5026 } 5027 5028 /* 5029 * Now fixup the brand string 5030 */ 5031 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) { 5032 fabricate_brandstr(cpi); 5033 } else { 5034 5035 /* 5036 * If we successfully extracted a brand string from the cpuid 5037 * instruction, clean it up by removing leading spaces and 5038 * similar junk. 5039 */ 5040 if (cpi->cpi_brandstr[0]) { 5041 size_t maxlen = sizeof (cpi->cpi_brandstr); 5042 char *src, *dst; 5043 5044 dst = src = (char *)cpi->cpi_brandstr; 5045 src[maxlen - 1] = '\0'; 5046 /* 5047 * strip leading spaces 5048 */ 5049 while (*src == ' ') 5050 src++; 5051 /* 5052 * Remove any 'Genuine' or "Authentic" prefixes 5053 */ 5054 if (strncmp(src, "Genuine ", 8) == 0) 5055 src += 8; 5056 if (strncmp(src, "Authentic ", 10) == 0) 5057 src += 10; 5058 5059 /* 5060 * Now do an in-place copy. 5061 * Map (R) to (r) and (TM) to (tm). 5062 * The era of teletypes is long gone, and there's 5063 * -really- no need to shout. 5064 */ 5065 while (*src != '\0') { 5066 if (src[0] == '(') { 5067 if (strncmp(src + 1, "R)", 2) == 0) { 5068 (void) strncpy(dst, "(r)", 3); 5069 src += 3; 5070 dst += 3; 5071 continue; 5072 } 5073 if (strncmp(src + 1, "TM)", 3) == 0) { 5074 (void) strncpy(dst, "(tm)", 4); 5075 src += 4; 5076 dst += 4; 5077 continue; 5078 } 5079 } 5080 *dst++ = *src++; 5081 } 5082 *dst = '\0'; 5083 5084 /* 5085 * Finally, remove any trailing spaces 5086 */ 5087 while (--dst > cpi->cpi_brandstr) 5088 if (*dst == ' ') 5089 *dst = '\0'; 5090 else 5091 break; 5092 } else 5093 fabricate_brandstr(cpi); 5094 } 5095 cpi->cpi_pass = 3; 5096 } 5097 5098 /* 5099 * This routine is called out of bind_hwcap() much later in the life 5100 * of the kernel (post_startup()). The job of this routine is to resolve 5101 * the hardware feature support and kernel support for those features into 5102 * what we're actually going to tell applications via the aux vector. 
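 *
 * As a minimal userland sketch (not part of this file, and assuming the
 * getisax(2) interface and the AV_386_* definitions that <sys/auxv.h>
 * provides on illumos), the two words assembled here can be read back
 * and tested like this:
 *
 *	#include <sys/types.h>
 *	#include <sys/auxv.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		uint32_t hw[2] = { 0, 0 };
 *
 *		(void) getisax(hw, 2);
 *		if (hw[0] & AV_386_SSE2)
 *			(void) printf("SSE2 usable\n");
 *		if (hw[1] & AV_386_2_AVX2)
 *			(void) printf("AVX2 usable\n");
 *		return (0);
 *	}
 *
 * hw[0] corresponds to hwcap_flags and hw[1] to hwcap_flags_2 below.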
5103 */ 5104 void 5105 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out) 5106 { 5107 struct cpuid_info *cpi; 5108 uint_t hwcap_flags = 0, hwcap_flags_2 = 0; 5109 5110 if (cpu == NULL) 5111 cpu = CPU; 5112 cpi = cpu->cpu_m.mcpu_cpi; 5113 5114 ASSERT(cpi->cpi_pass == 3); 5115 5116 if (cpi->cpi_maxeax >= 1) { 5117 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES]; 5118 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES]; 5119 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES]; 5120 5121 *edx = CPI_FEATURES_EDX(cpi); 5122 *ecx = CPI_FEATURES_ECX(cpi); 5123 *ebx = CPI_FEATURES_7_0_EBX(cpi); 5124 5125 /* 5126 * [these require explicit kernel support] 5127 */ 5128 if (!is_x86_feature(x86_featureset, X86FSET_SEP)) 5129 *edx &= ~CPUID_INTC_EDX_SEP; 5130 5131 if (!is_x86_feature(x86_featureset, X86FSET_SSE)) 5132 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE); 5133 if (!is_x86_feature(x86_featureset, X86FSET_SSE2)) 5134 *edx &= ~CPUID_INTC_EDX_SSE2; 5135 5136 if (!is_x86_feature(x86_featureset, X86FSET_HTT)) 5137 *edx &= ~CPUID_INTC_EDX_HTT; 5138 5139 if (!is_x86_feature(x86_featureset, X86FSET_SSE3)) 5140 *ecx &= ~CPUID_INTC_ECX_SSE3; 5141 5142 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3)) 5143 *ecx &= ~CPUID_INTC_ECX_SSSE3; 5144 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1)) 5145 *ecx &= ~CPUID_INTC_ECX_SSE4_1; 5146 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2)) 5147 *ecx &= ~CPUID_INTC_ECX_SSE4_2; 5148 if (!is_x86_feature(x86_featureset, X86FSET_AES)) 5149 *ecx &= ~CPUID_INTC_ECX_AES; 5150 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ)) 5151 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ; 5152 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) 5153 *ecx &= ~(CPUID_INTC_ECX_XSAVE | 5154 CPUID_INTC_ECX_OSXSAVE); 5155 if (!is_x86_feature(x86_featureset, X86FSET_AVX)) 5156 *ecx &= ~CPUID_INTC_ECX_AVX; 5157 if (!is_x86_feature(x86_featureset, X86FSET_F16C)) 5158 *ecx &= ~CPUID_INTC_ECX_F16C; 5159 if (!is_x86_feature(x86_featureset, X86FSET_FMA)) 5160 *ecx &= ~CPUID_INTC_ECX_FMA; 5161 if (!is_x86_feature(x86_featureset, X86FSET_BMI1)) 5162 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 5163 if (!is_x86_feature(x86_featureset, X86FSET_BMI2)) 5164 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 5165 if (!is_x86_feature(x86_featureset, X86FSET_AVX2)) 5166 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 5167 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED)) 5168 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED; 5169 if (!is_x86_feature(x86_featureset, X86FSET_ADX)) 5170 *ebx &= ~CPUID_INTC_EBX_7_0_ADX; 5171 5172 /* 5173 * [no explicit support required beyond x87 fp context] 5174 */ 5175 if (!fpu_exists) 5176 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX); 5177 5178 /* 5179 * Now map the supported feature vector to things that we 5180 * think userland will care about. 
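 *
 * The pattern is the same for every bit: mask it off above when the
 * kernel lacks support, test the masked copy here, then set the
 * matching aux vector bit. A hedged sketch of how a new bit would be
 * wired up, using purely hypothetical names (X86FSET_FOO,
 * CPUID_INTC_ECX_FOO and AV_386_2_FOO are not real definitions):
 *
 *	if (!is_x86_feature(x86_featureset, X86FSET_FOO))
 *		*ecx &= ~CPUID_INTC_ECX_FOO;
 *	...
 *	if (*ecx & CPUID_INTC_ECX_FOO)
 *		hwcap_flags_2 |= AV_386_2_FOO;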
5181 */ 5182 if (*edx & CPUID_INTC_EDX_SEP) 5183 hwcap_flags |= AV_386_SEP; 5184 if (*edx & CPUID_INTC_EDX_SSE) 5185 hwcap_flags |= AV_386_FXSR | AV_386_SSE; 5186 if (*edx & CPUID_INTC_EDX_SSE2) 5187 hwcap_flags |= AV_386_SSE2; 5188 if (*ecx & CPUID_INTC_ECX_SSE3) 5189 hwcap_flags |= AV_386_SSE3; 5190 if (*ecx & CPUID_INTC_ECX_SSSE3) 5191 hwcap_flags |= AV_386_SSSE3; 5192 if (*ecx & CPUID_INTC_ECX_SSE4_1) 5193 hwcap_flags |= AV_386_SSE4_1; 5194 if (*ecx & CPUID_INTC_ECX_SSE4_2) 5195 hwcap_flags |= AV_386_SSE4_2; 5196 if (*ecx & CPUID_INTC_ECX_MOVBE) 5197 hwcap_flags |= AV_386_MOVBE; 5198 if (*ecx & CPUID_INTC_ECX_AES) 5199 hwcap_flags |= AV_386_AES; 5200 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ) 5201 hwcap_flags |= AV_386_PCLMULQDQ; 5202 if ((*ecx & CPUID_INTC_ECX_XSAVE) && 5203 (*ecx & CPUID_INTC_ECX_OSXSAVE)) { 5204 hwcap_flags |= AV_386_XSAVE; 5205 5206 if (*ecx & CPUID_INTC_ECX_AVX) { 5207 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi); 5208 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi); 5209 5210 hwcap_flags |= AV_386_AVX; 5211 if (*ecx & CPUID_INTC_ECX_F16C) 5212 hwcap_flags_2 |= AV_386_2_F16C; 5213 if (*ecx & CPUID_INTC_ECX_FMA) 5214 hwcap_flags_2 |= AV_386_2_FMA; 5215 5216 if (*ebx & CPUID_INTC_EBX_7_0_BMI1) 5217 hwcap_flags_2 |= AV_386_2_BMI1; 5218 if (*ebx & CPUID_INTC_EBX_7_0_BMI2) 5219 hwcap_flags_2 |= AV_386_2_BMI2; 5220 if (*ebx & CPUID_INTC_EBX_7_0_AVX2) 5221 hwcap_flags_2 |= AV_386_2_AVX2; 5222 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F) 5223 hwcap_flags_2 |= AV_386_2_AVX512F; 5224 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ) 5225 hwcap_flags_2 |= AV_386_2_AVX512DQ; 5226 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA) 5227 hwcap_flags_2 |= AV_386_2_AVX512IFMA; 5228 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF) 5229 hwcap_flags_2 |= AV_386_2_AVX512PF; 5230 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER) 5231 hwcap_flags_2 |= AV_386_2_AVX512ER; 5232 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD) 5233 hwcap_flags_2 |= AV_386_2_AVX512CD; 5234 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW) 5235 hwcap_flags_2 |= AV_386_2_AVX512BW; 5236 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL) 5237 hwcap_flags_2 |= AV_386_2_AVX512VL; 5238 5239 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI) 5240 hwcap_flags_2 |= AV_386_2_AVX512VBMI; 5241 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI) 5242 hwcap_flags_2 |= AV_386_2_AVX512_VNNI; 5243 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 5244 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ; 5245 5246 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW) 5247 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW; 5248 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS) 5249 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS; 5250 } 5251 } 5252 if (*ecx & CPUID_INTC_ECX_VMX) 5253 hwcap_flags |= AV_386_VMX; 5254 if (*ecx & CPUID_INTC_ECX_POPCNT) 5255 hwcap_flags |= AV_386_POPCNT; 5256 if (*edx & CPUID_INTC_EDX_FPU) 5257 hwcap_flags |= AV_386_FPU; 5258 if (*edx & CPUID_INTC_EDX_MMX) 5259 hwcap_flags |= AV_386_MMX; 5260 5261 if (*edx & CPUID_INTC_EDX_TSC) 5262 hwcap_flags |= AV_386_TSC; 5263 if (*edx & CPUID_INTC_EDX_CX8) 5264 hwcap_flags |= AV_386_CX8; 5265 if (*edx & CPUID_INTC_EDX_CMOV) 5266 hwcap_flags |= AV_386_CMOV; 5267 if (*ecx & CPUID_INTC_ECX_CX16) 5268 hwcap_flags |= AV_386_CX16; 5269 5270 if (*ecx & CPUID_INTC_ECX_RDRAND) 5271 hwcap_flags_2 |= AV_386_2_RDRAND; 5272 if (*ebx & CPUID_INTC_EBX_7_0_ADX) 5273 hwcap_flags_2 |= AV_386_2_ADX; 5274 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED) 5275 hwcap_flags_2 |= AV_386_2_RDSEED; 5276 if (*ebx & CPUID_INTC_EBX_7_0_SHA) 5277 hwcap_flags_2 |= AV_386_2_SHA; 5278 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 5279 hwcap_flags_2 |= 
AV_386_2_FSGSBASE; 5280 if (*ebx & CPUID_INTC_EBX_7_0_CLWB) 5281 hwcap_flags_2 |= AV_386_2_CLWB; 5282 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 5283 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT; 5284 5285 } 5286 /* 5287 * Check a few miscellaneous features. 5288 */ 5289 if (is_x86_feature(x86_featureset, X86FSET_CLZERO)) 5290 hwcap_flags_2 |= AV_386_2_CLZERO; 5291 5292 if (cpi->cpi_xmaxeax < 0x80000001) 5293 goto pass4_done; 5294 5295 switch (cpi->cpi_vendor) { 5296 struct cpuid_regs cp; 5297 uint32_t *edx, *ecx; 5298 5299 case X86_VENDOR_Intel: 5300 /* 5301 * Seems like Intel duplicated what was necessary 5302 * here to make the initial crop of 64-bit OSes work. 5303 * Hopefully, those are the only "extended" bits 5304 * they'll add. 5305 */ 5306 /*FALLTHROUGH*/ 5307 5308 case X86_VENDOR_AMD: 5309 edx = &cpi->cpi_support[AMD_EDX_FEATURES]; 5310 ecx = &cpi->cpi_support[AMD_ECX_FEATURES]; 5311 5312 *edx = CPI_FEATURES_XTD_EDX(cpi); 5313 *ecx = CPI_FEATURES_XTD_ECX(cpi); 5314 5315 /* 5316 * [these features require explicit kernel support] 5317 */ 5318 switch (cpi->cpi_vendor) { 5319 case X86_VENDOR_Intel: 5320 if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) 5321 *edx &= ~CPUID_AMD_EDX_TSCP; 5322 break; 5323 5324 case X86_VENDOR_AMD: 5325 if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) 5326 *edx &= ~CPUID_AMD_EDX_TSCP; 5327 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A)) 5328 *ecx &= ~CPUID_AMD_ECX_SSE4A; 5329 break; 5330 5331 default: 5332 break; 5333 } 5334 5335 /* 5336 * [no explicit support required beyond 5337 * x87 fp context and exception handlers] 5338 */ 5339 if (!fpu_exists) 5340 *edx &= ~(CPUID_AMD_EDX_MMXamd | 5341 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx); 5342 5343 if (!is_x86_feature(x86_featureset, X86FSET_NX)) 5344 *edx &= ~CPUID_AMD_EDX_NX; 5345 #if !defined(__amd64) 5346 *edx &= ~CPUID_AMD_EDX_LM; 5347 #endif 5348 /* 5349 * Now map the supported feature vector to 5350 * things that we think userland will care about. 5351 */ 5352 #if defined(__amd64) 5353 if (*edx & CPUID_AMD_EDX_SYSC) 5354 hwcap_flags |= AV_386_AMD_SYSC; 5355 #endif 5356 if (*edx & CPUID_AMD_EDX_MMXamd) 5357 hwcap_flags |= AV_386_AMD_MMX; 5358 if (*edx & CPUID_AMD_EDX_3DNow) 5359 hwcap_flags |= AV_386_AMD_3DNow; 5360 if (*edx & CPUID_AMD_EDX_3DNowx) 5361 hwcap_flags |= AV_386_AMD_3DNowx; 5362 if (*ecx & CPUID_AMD_ECX_SVM) 5363 hwcap_flags |= AV_386_AMD_SVM; 5364 5365 switch (cpi->cpi_vendor) { 5366 case X86_VENDOR_AMD: 5367 if (*edx & CPUID_AMD_EDX_TSCP) 5368 hwcap_flags |= AV_386_TSCP; 5369 if (*ecx & CPUID_AMD_ECX_AHF64) 5370 hwcap_flags |= AV_386_AHF; 5371 if (*ecx & CPUID_AMD_ECX_SSE4A) 5372 hwcap_flags |= AV_386_AMD_SSE4A; 5373 if (*ecx & CPUID_AMD_ECX_LZCNT) 5374 hwcap_flags |= AV_386_AMD_LZCNT; 5375 if (*ecx & CPUID_AMD_ECX_MONITORX) 5376 hwcap_flags_2 |= AV_386_2_MONITORX; 5377 break; 5378 5379 case X86_VENDOR_Intel: 5380 if (*edx & CPUID_AMD_EDX_TSCP) 5381 hwcap_flags |= AV_386_TSCP; 5382 if (*ecx & CPUID_AMD_ECX_LZCNT) 5383 hwcap_flags |= AV_386_AMD_LZCNT; 5384 /* 5385 * Aarrgh. 5386 * Intel uses a different bit in the same word.
5387 */ 5388 if (*ecx & CPUID_INTC_ECX_AHF64) 5389 hwcap_flags |= AV_386_AHF; 5390 break; 5391 5392 default: 5393 break; 5394 } 5395 break; 5396 5397 case X86_VENDOR_TM: 5398 cp.cp_eax = 0x80860001; 5399 (void) __cpuid_insn(&cp); 5400 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx; 5401 break; 5402 5403 default: 5404 break; 5405 } 5406 5407 pass4_done: 5408 cpi->cpi_pass = 4; 5409 if (hwcap_out != NULL) { 5410 hwcap_out[0] = hwcap_flags; 5411 hwcap_out[1] = hwcap_flags_2; 5412 } 5413 } 5414 5415 5416 /* 5417 * Simulate the cpuid instruction using the data we previously 5418 * captured about this CPU. We try our best to return the truth 5419 * about the hardware, independently of kernel support. 5420 */ 5421 uint32_t 5422 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp) 5423 { 5424 struct cpuid_info *cpi; 5425 struct cpuid_regs *xcp; 5426 5427 if (cpu == NULL) 5428 cpu = CPU; 5429 cpi = cpu->cpu_m.mcpu_cpi; 5430 5431 ASSERT(cpuid_checkpass(cpu, 3)); 5432 5433 /* 5434 * CPUID data is cached in two separate places: cpi_std for standard 5435 * CPUID leaves , and cpi_extd for extended CPUID leaves. 5436 */ 5437 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) { 5438 xcp = &cpi->cpi_std[cp->cp_eax]; 5439 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 && 5440 cp->cp_eax <= cpi->cpi_xmaxeax && 5441 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) { 5442 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0]; 5443 } else { 5444 /* 5445 * The caller is asking for data from an input parameter which 5446 * the kernel has not cached. In this case we go fetch from 5447 * the hardware and return the data directly to the user. 5448 */ 5449 return (__cpuid_insn(cp)); 5450 } 5451 5452 cp->cp_eax = xcp->cp_eax; 5453 cp->cp_ebx = xcp->cp_ebx; 5454 cp->cp_ecx = xcp->cp_ecx; 5455 cp->cp_edx = xcp->cp_edx; 5456 return (cp->cp_eax); 5457 } 5458 5459 int 5460 cpuid_checkpass(cpu_t *cpu, int pass) 5461 { 5462 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL && 5463 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass); 5464 } 5465 5466 int 5467 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n) 5468 { 5469 ASSERT(cpuid_checkpass(cpu, 3)); 5470 5471 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr)); 5472 } 5473 5474 int 5475 cpuid_is_cmt(cpu_t *cpu) 5476 { 5477 if (cpu == NULL) 5478 cpu = CPU; 5479 5480 ASSERT(cpuid_checkpass(cpu, 1)); 5481 5482 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0); 5483 } 5484 5485 /* 5486 * AMD and Intel both implement the 64-bit variant of the syscall 5487 * instruction (syscallq), so if there's -any- support for syscall, 5488 * cpuid currently says "yes, we support this". 5489 * 5490 * However, Intel decided to -not- implement the 32-bit variant of the 5491 * syscall instruction, so we provide a predicate to allow our caller 5492 * to test that subtlety here. 5493 * 5494 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor, 5495 * even in the case where the hardware would in fact support it. 5496 */ 5497 /*ARGSUSED*/ 5498 int 5499 cpuid_syscall32_insn(cpu_t *cpu) 5500 { 5501 ASSERT(cpuid_checkpass((cpu == NULL ? 
CPU : cpu), 1)); 5502 5503 #if !defined(__xpv) 5504 if (cpu == NULL) 5505 cpu = CPU; 5506 5507 /*CSTYLED*/ 5508 { 5509 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5510 5511 if (cpi->cpi_vendor == X86_VENDOR_AMD && 5512 cpi->cpi_xmaxeax >= 0x80000001 && 5513 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC)) 5514 return (1); 5515 } 5516 #endif 5517 return (0); 5518 } 5519 5520 int 5521 cpuid_getidstr(cpu_t *cpu, char *s, size_t n) 5522 { 5523 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5524 5525 static const char fmt[] = 5526 "x86 (%s %X family %d model %d step %d clock %d MHz)"; 5527 static const char fmt_ht[] = 5528 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)"; 5529 5530 ASSERT(cpuid_checkpass(cpu, 1)); 5531 5532 if (cpuid_is_cmt(cpu)) 5533 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid, 5534 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5535 cpi->cpi_family, cpi->cpi_model, 5536 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5537 return (snprintf(s, n, fmt, 5538 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5539 cpi->cpi_family, cpi->cpi_model, 5540 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5541 } 5542 5543 const char * 5544 cpuid_getvendorstr(cpu_t *cpu) 5545 { 5546 ASSERT(cpuid_checkpass(cpu, 1)); 5547 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr); 5548 } 5549 5550 uint_t 5551 cpuid_getvendor(cpu_t *cpu) 5552 { 5553 ASSERT(cpuid_checkpass(cpu, 1)); 5554 return (cpu->cpu_m.mcpu_cpi->cpi_vendor); 5555 } 5556 5557 uint_t 5558 cpuid_getfamily(cpu_t *cpu) 5559 { 5560 ASSERT(cpuid_checkpass(cpu, 1)); 5561 return (cpu->cpu_m.mcpu_cpi->cpi_family); 5562 } 5563 5564 uint_t 5565 cpuid_getmodel(cpu_t *cpu) 5566 { 5567 ASSERT(cpuid_checkpass(cpu, 1)); 5568 return (cpu->cpu_m.mcpu_cpi->cpi_model); 5569 } 5570 5571 uint_t 5572 cpuid_get_ncpu_per_chip(cpu_t *cpu) 5573 { 5574 ASSERT(cpuid_checkpass(cpu, 1)); 5575 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip); 5576 } 5577 5578 uint_t 5579 cpuid_get_ncore_per_chip(cpu_t *cpu) 5580 { 5581 ASSERT(cpuid_checkpass(cpu, 1)); 5582 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip); 5583 } 5584 5585 uint_t 5586 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu) 5587 { 5588 ASSERT(cpuid_checkpass(cpu, 2)); 5589 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache); 5590 } 5591 5592 id_t 5593 cpuid_get_last_lvl_cacheid(cpu_t *cpu) 5594 { 5595 ASSERT(cpuid_checkpass(cpu, 2)); 5596 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5597 } 5598 5599 uint_t 5600 cpuid_getstep(cpu_t *cpu) 5601 { 5602 ASSERT(cpuid_checkpass(cpu, 1)); 5603 return (cpu->cpu_m.mcpu_cpi->cpi_step); 5604 } 5605 5606 uint_t 5607 cpuid_getsig(struct cpu *cpu) 5608 { 5609 ASSERT(cpuid_checkpass(cpu, 1)); 5610 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax); 5611 } 5612 5613 uint32_t 5614 cpuid_getchiprev(struct cpu *cpu) 5615 { 5616 ASSERT(cpuid_checkpass(cpu, 1)); 5617 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev); 5618 } 5619 5620 const char * 5621 cpuid_getchiprevstr(struct cpu *cpu) 5622 { 5623 ASSERT(cpuid_checkpass(cpu, 1)); 5624 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr); 5625 } 5626 5627 uint32_t 5628 cpuid_getsockettype(struct cpu *cpu) 5629 { 5630 ASSERT(cpuid_checkpass(cpu, 1)); 5631 return (cpu->cpu_m.mcpu_cpi->cpi_socket); 5632 } 5633 5634 const char * 5635 cpuid_getsocketstr(cpu_t *cpu) 5636 { 5637 static const char *socketstr = NULL; 5638 struct cpuid_info *cpi; 5639 5640 ASSERT(cpuid_checkpass(cpu, 1)); 5641 cpi = cpu->cpu_m.mcpu_cpi; 5642 5643 /* Assume that socket types are the same across the system */ 5644 if (socketstr == NULL) 5645 socketstr 
= _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family, 5646 cpi->cpi_model, cpi->cpi_step); 5647 5648 5649 return (socketstr); 5650 } 5651 5652 int 5653 cpuid_get_chipid(cpu_t *cpu) 5654 { 5655 ASSERT(cpuid_checkpass(cpu, 1)); 5656 5657 if (cpuid_is_cmt(cpu)) 5658 return (cpu->cpu_m.mcpu_cpi->cpi_chipid); 5659 return (cpu->cpu_id); 5660 } 5661 5662 id_t 5663 cpuid_get_coreid(cpu_t *cpu) 5664 { 5665 ASSERT(cpuid_checkpass(cpu, 1)); 5666 return (cpu->cpu_m.mcpu_cpi->cpi_coreid); 5667 } 5668 5669 int 5670 cpuid_get_pkgcoreid(cpu_t *cpu) 5671 { 5672 ASSERT(cpuid_checkpass(cpu, 1)); 5673 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid); 5674 } 5675 5676 int 5677 cpuid_get_clogid(cpu_t *cpu) 5678 { 5679 ASSERT(cpuid_checkpass(cpu, 1)); 5680 return (cpu->cpu_m.mcpu_cpi->cpi_clogid); 5681 } 5682 5683 int 5684 cpuid_get_cacheid(cpu_t *cpu) 5685 { 5686 ASSERT(cpuid_checkpass(cpu, 1)); 5687 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5688 } 5689 5690 uint_t 5691 cpuid_get_procnodeid(cpu_t *cpu) 5692 { 5693 ASSERT(cpuid_checkpass(cpu, 1)); 5694 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid); 5695 } 5696 5697 uint_t 5698 cpuid_get_procnodes_per_pkg(cpu_t *cpu) 5699 { 5700 ASSERT(cpuid_checkpass(cpu, 1)); 5701 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg); 5702 } 5703 5704 uint_t 5705 cpuid_get_compunitid(cpu_t *cpu) 5706 { 5707 ASSERT(cpuid_checkpass(cpu, 1)); 5708 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid); 5709 } 5710 5711 uint_t 5712 cpuid_get_cores_per_compunit(cpu_t *cpu) 5713 { 5714 ASSERT(cpuid_checkpass(cpu, 1)); 5715 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit); 5716 } 5717 5718 /*ARGSUSED*/ 5719 int 5720 cpuid_have_cr8access(cpu_t *cpu) 5721 { 5722 #if defined(__amd64) 5723 return (1); 5724 #else 5725 struct cpuid_info *cpi; 5726 5727 ASSERT(cpu != NULL); 5728 cpi = cpu->cpu_m.mcpu_cpi; 5729 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 && 5730 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0) 5731 return (1); 5732 return (0); 5733 #endif 5734 } 5735 5736 uint32_t 5737 cpuid_get_apicid(cpu_t *cpu) 5738 { 5739 ASSERT(cpuid_checkpass(cpu, 1)); 5740 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) { 5741 return (UINT32_MAX); 5742 } else { 5743 return (cpu->cpu_m.mcpu_cpi->cpi_apicid); 5744 } 5745 } 5746 5747 void 5748 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits) 5749 { 5750 struct cpuid_info *cpi; 5751 5752 if (cpu == NULL) 5753 cpu = CPU; 5754 cpi = cpu->cpu_m.mcpu_cpi; 5755 5756 ASSERT(cpuid_checkpass(cpu, 1)); 5757 5758 if (pabits) 5759 *pabits = cpi->cpi_pabits; 5760 if (vabits) 5761 *vabits = cpi->cpi_vabits; 5762 } 5763 5764 size_t 5765 cpuid_get_xsave_size() 5766 { 5767 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size, 5768 sizeof (struct xsave_state))); 5769 } 5770 5771 /* 5772 * Return true if the CPUs on this system require 'pointer clearing' for the 5773 * floating point error pointer exception handling. In the past, this has been 5774 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to 5775 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO 5776 * feature bit and is reflected in the cpi_fp_amd_save member. 5777 */ 5778 boolean_t 5779 cpuid_need_fp_excp_handling() 5780 { 5781 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD && 5782 cpuid_info0.cpi_fp_amd_save != 0); 5783 } 5784 5785 /* 5786 * Returns the number of data TLB entries for a corresponding 5787 * pagesize. If it can't be computed, or isn't known, the 5788 * routine returns zero. 
If you ask about an architecturally 5789 * impossible pagesize, the routine will panic (so that the 5790 * hat implementor knows that things are inconsistent.) 5791 */ 5792 uint_t 5793 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize) 5794 { 5795 struct cpuid_info *cpi; 5796 uint_t dtlb_nent = 0; 5797 5798 if (cpu == NULL) 5799 cpu = CPU; 5800 cpi = cpu->cpu_m.mcpu_cpi; 5801 5802 ASSERT(cpuid_checkpass(cpu, 1)); 5803 5804 /* 5805 * Check the L2 TLB info 5806 */ 5807 if (cpi->cpi_xmaxeax >= 0x80000006) { 5808 struct cpuid_regs *cp = &cpi->cpi_extd[6]; 5809 5810 switch (pagesize) { 5811 5812 case 4 * 1024: 5813 /* 5814 * All zero in the top 16 bits of the register 5815 * indicates a unified TLB. Size is in low 16 bits. 5816 */ 5817 if ((cp->cp_ebx & 0xffff0000) == 0) 5818 dtlb_nent = cp->cp_ebx & 0x0000ffff; 5819 else 5820 dtlb_nent = BITX(cp->cp_ebx, 27, 16); 5821 break; 5822 5823 case 2 * 1024 * 1024: 5824 if ((cp->cp_eax & 0xffff0000) == 0) 5825 dtlb_nent = cp->cp_eax & 0x0000ffff; 5826 else 5827 dtlb_nent = BITX(cp->cp_eax, 27, 16); 5828 break; 5829 5830 default: 5831 panic("unknown L2 pagesize"); 5832 /*NOTREACHED*/ 5833 } 5834 } 5835 5836 if (dtlb_nent != 0) 5837 return (dtlb_nent); 5838 5839 /* 5840 * No L2 TLB support for this size, try L1. 5841 */ 5842 if (cpi->cpi_xmaxeax >= 0x80000005) { 5843 struct cpuid_regs *cp = &cpi->cpi_extd[5]; 5844 5845 switch (pagesize) { 5846 case 4 * 1024: 5847 dtlb_nent = BITX(cp->cp_ebx, 23, 16); 5848 break; 5849 case 2 * 1024 * 1024: 5850 dtlb_nent = BITX(cp->cp_eax, 23, 16); 5851 break; 5852 default: 5853 panic("unknown L1 d-TLB pagesize"); 5854 /*NOTREACHED*/ 5855 } 5856 } 5857 5858 return (dtlb_nent); 5859 } 5860 5861 /* 5862 * Return 0 if the erratum is not present or not applicable, positive 5863 * if it is, and negative if the status of the erratum is unknown. 5864 * 5865 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm) 5866 * Processors" #25759, Rev 3.57, August 2005 5867 */ 5868 int 5869 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum) 5870 { 5871 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5872 uint_t eax; 5873 5874 /* 5875 * Bail out if this CPU isn't an AMD CPU, or if it's 5876 * a legacy (32-bit) AMD CPU. 
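 *
 * The revision tests below match the raw cpuid function 1 %eax
 * signature rather than the cooked cpi_family/cpi_model fields. As a
 * worked example of the standard leaf 1 %eax layout, the JH_E6 value
 * 0x20f12 decodes as:
 *
 *	stepping = BITX(eax, 3, 0)                            = 0x2
 *	family   = BITX(eax, 11, 8)                           = 0xf
 *	model    = (BITX(eax, 19, 16) << 4) | BITX(eax, 7, 4) = 0x21
 *
 * i.e. a family 0xf (K8) part, model 0x21, revision E6.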
5877 */ 5878 if (cpi->cpi_vendor != X86_VENDOR_AMD || 5879 cpi->cpi_family == 4 || cpi->cpi_family == 5 || 5880 cpi->cpi_family == 6) { 5881 return (0); 5882 } 5883 5884 eax = cpi->cpi_std[1].cp_eax; 5885 5886 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50) 5887 #define SH_B3(eax) (eax == 0xf51) 5888 #define B(eax) (SH_B0(eax) || SH_B3(eax)) 5889 5890 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58) 5891 5892 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a) 5893 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0) 5894 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2) 5895 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax)) 5896 5897 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70) 5898 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0) 5899 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0) 5900 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax)) 5901 5902 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70) 5903 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */ 5904 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0) 5905 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71) 5906 #define BH_E4(eax) (eax == 0x20fb1) 5907 #define SH_E5(eax) (eax == 0x20f42) 5908 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2) 5909 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32) 5910 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \ 5911 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \ 5912 DH_E6(eax) || JH_E6(eax)) 5913 5914 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02) 5915 #define DR_B0(eax) (eax == 0x100f20) 5916 #define DR_B1(eax) (eax == 0x100f21) 5917 #define DR_BA(eax) (eax == 0x100f2a) 5918 #define DR_B2(eax) (eax == 0x100f22) 5919 #define DR_B3(eax) (eax == 0x100f23) 5920 #define RB_C0(eax) (eax == 0x100f40) 5921 5922 switch (erratum) { 5923 case 1: 5924 return (cpi->cpi_family < 0x10); 5925 case 51: /* what does the asterisk mean? 
*/ 5926 return (B(eax) || SH_C0(eax) || CG(eax)); 5927 case 52: 5928 return (B(eax)); 5929 case 57: 5930 return (cpi->cpi_family <= 0x11); 5931 case 58: 5932 return (B(eax)); 5933 case 60: 5934 return (cpi->cpi_family <= 0x11); 5935 case 61: 5936 case 62: 5937 case 63: 5938 case 64: 5939 case 65: 5940 case 66: 5941 case 68: 5942 case 69: 5943 case 70: 5944 case 71: 5945 return (B(eax)); 5946 case 72: 5947 return (SH_B0(eax)); 5948 case 74: 5949 return (B(eax)); 5950 case 75: 5951 return (cpi->cpi_family < 0x10); 5952 case 76: 5953 return (B(eax)); 5954 case 77: 5955 return (cpi->cpi_family <= 0x11); 5956 case 78: 5957 return (B(eax) || SH_C0(eax)); 5958 case 79: 5959 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 5960 case 80: 5961 case 81: 5962 case 82: 5963 return (B(eax)); 5964 case 83: 5965 return (B(eax) || SH_C0(eax) || CG(eax)); 5966 case 85: 5967 return (cpi->cpi_family < 0x10); 5968 case 86: 5969 return (SH_C0(eax) || CG(eax)); 5970 case 88: 5971 #if !defined(__amd64) 5972 return (0); 5973 #else 5974 return (B(eax) || SH_C0(eax)); 5975 #endif 5976 case 89: 5977 return (cpi->cpi_family < 0x10); 5978 case 90: 5979 return (B(eax) || SH_C0(eax) || CG(eax)); 5980 case 91: 5981 case 92: 5982 return (B(eax) || SH_C0(eax)); 5983 case 93: 5984 return (SH_C0(eax)); 5985 case 94: 5986 return (B(eax) || SH_C0(eax) || CG(eax)); 5987 case 95: 5988 #if !defined(__amd64) 5989 return (0); 5990 #else 5991 return (B(eax) || SH_C0(eax)); 5992 #endif 5993 case 96: 5994 return (B(eax) || SH_C0(eax) || CG(eax)); 5995 case 97: 5996 case 98: 5997 return (SH_C0(eax) || CG(eax)); 5998 case 99: 5999 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6000 case 100: 6001 return (B(eax) || SH_C0(eax)); 6002 case 101: 6003 case 103: 6004 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6005 case 104: 6006 return (SH_C0(eax) || CG(eax) || D0(eax)); 6007 case 105: 6008 case 106: 6009 case 107: 6010 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6011 case 108: 6012 return (DH_CG(eax)); 6013 case 109: 6014 return (SH_C0(eax) || CG(eax) || D0(eax)); 6015 case 110: 6016 return (D0(eax) || EX(eax)); 6017 case 111: 6018 return (CG(eax)); 6019 case 112: 6020 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 6021 case 113: 6022 return (eax == 0x20fc0); 6023 case 114: 6024 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 6025 case 115: 6026 return (SH_E0(eax) || JH_E1(eax)); 6027 case 116: 6028 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 6029 case 117: 6030 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6031 case 118: 6032 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) || 6033 JH_E6(eax)); 6034 case 121: 6035 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 6036 case 122: 6037 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11); 6038 case 123: 6039 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax)); 6040 case 131: 6041 return (cpi->cpi_family < 0x10); 6042 case 6336786: 6043 6044 /* 6045 * Test for AdvPowerMgmtInfo.TscPStateInvariant 6046 * if this is a K8 family or newer processor. We're testing for 6047 * this 'erratum' to determine whether or not we have a constant 6048 * TSC. 6049 * 6050 * Our current fix for this is to disable the C1-Clock ramping. 6051 * However, this doesn't work on newer processor families nor 6052 * does it work when virtualized as those devices don't exist. 
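 *
 * (For reference: the 0x100 mask tested below is bit 8 of %edx from
 * extended leaf 0x80000007, the TSC invariance bit named above; a
 * clear bit means the TSC is not invariant and the 'erratum' is
 * reported as present.)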
6053 */ 6054 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) { 6055 return (0); 6056 } 6057 6058 if (CPI_FAMILY(cpi) == 0xf) { 6059 struct cpuid_regs regs; 6060 regs.cp_eax = 0x80000007; 6061 (void) __cpuid_insn(®s); 6062 return (!(regs.cp_edx & 0x100)); 6063 } 6064 return (0); 6065 case 6323525: 6066 /* 6067 * This erratum (K8 #147) is not present on family 10 and newer. 6068 */ 6069 if (cpi->cpi_family >= 0x10) { 6070 return (0); 6071 } 6072 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) | 6073 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40); 6074 6075 case 6671130: 6076 /* 6077 * check for processors (pre-Shanghai) that do not provide 6078 * optimal management of 1gb ptes in its tlb. 6079 */ 6080 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4); 6081 6082 case 298: 6083 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) || 6084 DR_B2(eax) || RB_C0(eax)); 6085 6086 case 721: 6087 #if defined(__amd64) 6088 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12); 6089 #else 6090 return (0); 6091 #endif 6092 6093 default: 6094 return (-1); 6095 6096 } 6097 } 6098 6099 /* 6100 * Determine if specified erratum is present via OSVW (OS Visible Workaround). 6101 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate. 6102 */ 6103 int 6104 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum) 6105 { 6106 struct cpuid_info *cpi; 6107 uint_t osvwid; 6108 static int osvwfeature = -1; 6109 uint64_t osvwlength; 6110 6111 6112 cpi = cpu->cpu_m.mcpu_cpi; 6113 6114 /* confirm OSVW supported */ 6115 if (osvwfeature == -1) { 6116 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW; 6117 } else { 6118 /* assert that osvw feature setting is consistent on all cpus */ 6119 ASSERT(osvwfeature == 6120 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW)); 6121 } 6122 if (!osvwfeature) 6123 return (-1); 6124 6125 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK; 6126 6127 switch (erratum) { 6128 case 298: /* osvwid is 0 */ 6129 osvwid = 0; 6130 if (osvwlength <= (uint64_t)osvwid) { 6131 /* osvwid 0 is unknown */ 6132 return (-1); 6133 } 6134 6135 /* 6136 * Check the OSVW STATUS MSR to determine the state 6137 * of the erratum where: 6138 * 0 - fixed by HW 6139 * 1 - BIOS has applied the workaround when BIOS 6140 * workaround is available. (Or for other errata, 6141 * OS workaround is required.) 6142 * For a value of 1, caller will confirm that the 6143 * erratum 298 workaround has indeed been applied by BIOS. 6144 * 6145 * A 1 may be set in cpus that have a HW fix 6146 * in a mixed cpu system. Regarding erratum 298: 6147 * In a multiprocessor platform, the workaround above 6148 * should be applied to all processors regardless of 6149 * silicon revision when an affected processor is 6150 * present. 6151 */ 6152 6153 return (rdmsr(MSR_AMD_OSVW_STATUS + 6154 (osvwid / OSVW_ID_CNT_PER_MSR)) & 6155 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR))); 6156 6157 default: 6158 return (-1); 6159 } 6160 } 6161 6162 static const char assoc_str[] = "associativity"; 6163 static const char line_str[] = "line-size"; 6164 static const char size_str[] = "size"; 6165 6166 static void 6167 add_cache_prop(dev_info_t *devi, const char *label, const char *type, 6168 uint32_t val) 6169 { 6170 char buf[128]; 6171 6172 /* 6173 * ndi_prop_update_int() is used because it is desirable for 6174 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set. 
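 *
 * The property name is simply "<label>-<type>". For example (the
 * values here are illustrative only):
 *
 *	add_cache_prop(devi, "l2-cache", "size", 512 * 1024);
 *
 * creates an integer property named "l2-cache-size" with the value
 * 524288 on the cpu node.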
6175 */ 6176 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf)) 6177 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val); 6178 } 6179 6180 /* 6181 * Intel-style cache/tlb description 6182 * 6183 * Standard cpuid level 2 gives a randomly ordered 6184 * selection of tags that index into a table that describes 6185 * cache and tlb properties. 6186 */ 6187 6188 static const char l1_icache_str[] = "l1-icache"; 6189 static const char l1_dcache_str[] = "l1-dcache"; 6190 static const char l2_cache_str[] = "l2-cache"; 6191 static const char l3_cache_str[] = "l3-cache"; 6192 static const char itlb4k_str[] = "itlb-4K"; 6193 static const char dtlb4k_str[] = "dtlb-4K"; 6194 static const char itlb2M_str[] = "itlb-2M"; 6195 static const char itlb4M_str[] = "itlb-4M"; 6196 static const char dtlb4M_str[] = "dtlb-4M"; 6197 static const char dtlb24_str[] = "dtlb0-2M-4M"; 6198 static const char itlb424_str[] = "itlb-4K-2M-4M"; 6199 static const char itlb24_str[] = "itlb-2M-4M"; 6200 static const char dtlb44_str[] = "dtlb-4K-4M"; 6201 static const char sl1_dcache_str[] = "sectored-l1-dcache"; 6202 static const char sl2_cache_str[] = "sectored-l2-cache"; 6203 static const char itrace_str[] = "itrace-cache"; 6204 static const char sl3_cache_str[] = "sectored-l3-cache"; 6205 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k"; 6206 6207 static const struct cachetab { 6208 uint8_t ct_code; 6209 uint8_t ct_assoc; 6210 uint16_t ct_line_size; 6211 size_t ct_size; 6212 const char *ct_label; 6213 } intel_ctab[] = { 6214 /* 6215 * maintain descending order! 6216 * 6217 * Codes ignored - Reason 6218 * ---------------------- 6219 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache 6220 * f0H/f1H - Currently we do not interpret prefetch size by design 6221 */ 6222 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str}, 6223 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str}, 6224 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str}, 6225 { 0xde, 12, 64, 6*1024*1024, l3_cache_str}, 6226 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str}, 6227 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str}, 6228 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str}, 6229 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str}, 6230 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str}, 6231 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str}, 6232 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str}, 6233 { 0xd0, 4, 64, 512*1024, l3_cache_str}, 6234 { 0xca, 4, 0, 512, sh_l2_tlb4k_str}, 6235 { 0xc0, 4, 0, 8, dtlb44_str }, 6236 { 0xba, 4, 0, 64, dtlb4k_str }, 6237 { 0xb4, 4, 0, 256, dtlb4k_str }, 6238 { 0xb3, 4, 0, 128, dtlb4k_str }, 6239 { 0xb2, 4, 0, 64, itlb4k_str }, 6240 { 0xb0, 4, 0, 128, itlb4k_str }, 6241 { 0x87, 8, 64, 1024*1024, l2_cache_str}, 6242 { 0x86, 4, 64, 512*1024, l2_cache_str}, 6243 { 0x85, 8, 32, 2*1024*1024, l2_cache_str}, 6244 { 0x84, 8, 32, 1024*1024, l2_cache_str}, 6245 { 0x83, 8, 32, 512*1024, l2_cache_str}, 6246 { 0x82, 8, 32, 256*1024, l2_cache_str}, 6247 { 0x80, 8, 64, 512*1024, l2_cache_str}, 6248 { 0x7f, 2, 64, 512*1024, l2_cache_str}, 6249 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str}, 6250 { 0x7c, 8, 64, 1024*1024, sl2_cache_str}, 6251 { 0x7b, 8, 64, 512*1024, sl2_cache_str}, 6252 { 0x7a, 8, 64, 256*1024, sl2_cache_str}, 6253 { 0x79, 8, 64, 128*1024, sl2_cache_str}, 6254 { 0x78, 8, 64, 1024*1024, l2_cache_str}, 6255 { 0x73, 8, 0, 64*1024, itrace_str}, 6256 { 0x72, 8, 0, 32*1024, itrace_str}, 6257 { 0x71, 8, 0, 16*1024, itrace_str}, 6258 { 0x70, 8, 0, 12*1024, itrace_str}, 6259 { 0x68, 4, 64, 32*1024, sl1_dcache_str}, 6260 { 0x67, 4, 64, 16*1024, 
sl1_dcache_str}, 6261 { 0x66, 4, 64, 8*1024, sl1_dcache_str}, 6262 { 0x60, 8, 64, 16*1024, sl1_dcache_str}, 6263 { 0x5d, 0, 0, 256, dtlb44_str}, 6264 { 0x5c, 0, 0, 128, dtlb44_str}, 6265 { 0x5b, 0, 0, 64, dtlb44_str}, 6266 { 0x5a, 4, 0, 32, dtlb24_str}, 6267 { 0x59, 0, 0, 16, dtlb4k_str}, 6268 { 0x57, 4, 0, 16, dtlb4k_str}, 6269 { 0x56, 4, 0, 16, dtlb4M_str}, 6270 { 0x55, 0, 0, 7, itlb24_str}, 6271 { 0x52, 0, 0, 256, itlb424_str}, 6272 { 0x51, 0, 0, 128, itlb424_str}, 6273 { 0x50, 0, 0, 64, itlb424_str}, 6274 { 0x4f, 0, 0, 32, itlb4k_str}, 6275 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str}, 6276 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str}, 6277 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str}, 6278 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str}, 6279 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str}, 6280 { 0x49, 16, 64, 4*1024*1024, l3_cache_str}, 6281 { 0x48, 12, 64, 3*1024*1024, l2_cache_str}, 6282 { 0x47, 8, 64, 8*1024*1024, l3_cache_str}, 6283 { 0x46, 4, 64, 4*1024*1024, l3_cache_str}, 6284 { 0x45, 4, 32, 2*1024*1024, l2_cache_str}, 6285 { 0x44, 4, 32, 1024*1024, l2_cache_str}, 6286 { 0x43, 4, 32, 512*1024, l2_cache_str}, 6287 { 0x42, 4, 32, 256*1024, l2_cache_str}, 6288 { 0x41, 4, 32, 128*1024, l2_cache_str}, 6289 { 0x3e, 4, 64, 512*1024, sl2_cache_str}, 6290 { 0x3d, 6, 64, 384*1024, sl2_cache_str}, 6291 { 0x3c, 4, 64, 256*1024, sl2_cache_str}, 6292 { 0x3b, 2, 64, 128*1024, sl2_cache_str}, 6293 { 0x3a, 6, 64, 192*1024, sl2_cache_str}, 6294 { 0x39, 4, 64, 128*1024, sl2_cache_str}, 6295 { 0x30, 8, 64, 32*1024, l1_icache_str}, 6296 { 0x2c, 8, 64, 32*1024, l1_dcache_str}, 6297 { 0x29, 8, 64, 4096*1024, sl3_cache_str}, 6298 { 0x25, 8, 64, 2048*1024, sl3_cache_str}, 6299 { 0x23, 8, 64, 1024*1024, sl3_cache_str}, 6300 { 0x22, 4, 64, 512*1024, sl3_cache_str}, 6301 { 0x0e, 6, 64, 24*1024, l1_dcache_str}, 6302 { 0x0d, 4, 32, 16*1024, l1_dcache_str}, 6303 { 0x0c, 4, 32, 16*1024, l1_dcache_str}, 6304 { 0x0b, 4, 0, 4, itlb4M_str}, 6305 { 0x0a, 2, 32, 8*1024, l1_dcache_str}, 6306 { 0x08, 4, 32, 16*1024, l1_icache_str}, 6307 { 0x06, 4, 32, 8*1024, l1_icache_str}, 6308 { 0x05, 4, 0, 32, dtlb4M_str}, 6309 { 0x04, 4, 0, 8, dtlb4M_str}, 6310 { 0x03, 4, 0, 64, dtlb4k_str}, 6311 { 0x02, 4, 0, 2, itlb4M_str}, 6312 { 0x01, 4, 0, 32, itlb4k_str}, 6313 { 0 } 6314 }; 6315 6316 static const struct cachetab cyrix_ctab[] = { 6317 { 0x70, 4, 0, 32, "tlb-4K" }, 6318 { 0x80, 4, 16, 16*1024, "l1-cache" }, 6319 { 0 } 6320 }; 6321 6322 /* 6323 * Search a cache table for a matching entry 6324 */ 6325 static const struct cachetab * 6326 find_cacheent(const struct cachetab *ct, uint_t code) 6327 { 6328 if (code != 0) { 6329 for (; ct->ct_code != 0; ct++) 6330 if (ct->ct_code <= code) 6331 break; 6332 if (ct->ct_code == code) 6333 return (ct); 6334 } 6335 return (NULL); 6336 } 6337 6338 /* 6339 * Populate cachetab entry with L2 or L3 cache-information using 6340 * cpuid function 4. This function is called from intel_walk_cacheinfo() 6341 * when descriptor 0x49 is encountered. It returns 0 if no such cache 6342 * information is found. 
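 *
 * The size computed below is the standard deterministic cache
 * parameters product from leaf 4:
 *
 *	size = ways * partitions * line size * sets
 *	     = (EBX[31:22] + 1) * (EBX[21:12] + 1) *
 *	       (EBX[11:0] + 1) * (ECX + 1)
 *
 * e.g. (illustrative numbers) 8 ways * 1 partition * 64-byte lines *
 * 1024 sets = 512KB.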
6343 */ 6344 static int 6345 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi) 6346 { 6347 uint32_t level, i; 6348 int ret = 0; 6349 6350 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) { 6351 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]); 6352 6353 if (level == 2 || level == 3) { 6354 ct->ct_assoc = 6355 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1; 6356 ct->ct_line_size = 6357 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1; 6358 ct->ct_size = ct->ct_assoc * 6359 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) * 6360 ct->ct_line_size * 6361 (cpi->cpi_cache_leaves[i]->cp_ecx + 1); 6362 6363 if (level == 2) { 6364 ct->ct_label = l2_cache_str; 6365 } else if (level == 3) { 6366 ct->ct_label = l3_cache_str; 6367 } 6368 ret = 1; 6369 } 6370 } 6371 6372 return (ret); 6373 } 6374 6375 /* 6376 * Walk the cacheinfo descriptor, applying 'func' to every valid element 6377 * The walk is terminated if the walker returns non-zero. 6378 */ 6379 static void 6380 intel_walk_cacheinfo(struct cpuid_info *cpi, 6381 void *arg, int (*func)(void *, const struct cachetab *)) 6382 { 6383 const struct cachetab *ct; 6384 struct cachetab des_49_ct, des_b1_ct; 6385 uint8_t *dp; 6386 int i; 6387 6388 if ((dp = cpi->cpi_cacheinfo) == NULL) 6389 return; 6390 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6391 /* 6392 * For overloaded descriptor 0x49 we use cpuid function 4 6393 * if supported by the current processor, to create 6394 * cache information. 6395 * For overloaded descriptor 0xb1 we use X86_PAE flag 6396 * to disambiguate the cache information. 6397 */ 6398 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 && 6399 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) { 6400 ct = &des_49_ct; 6401 } else if (*dp == 0xb1) { 6402 des_b1_ct.ct_code = 0xb1; 6403 des_b1_ct.ct_assoc = 4; 6404 des_b1_ct.ct_line_size = 0; 6405 if (is_x86_feature(x86_featureset, X86FSET_PAE)) { 6406 des_b1_ct.ct_size = 8; 6407 des_b1_ct.ct_label = itlb2M_str; 6408 } else { 6409 des_b1_ct.ct_size = 4; 6410 des_b1_ct.ct_label = itlb4M_str; 6411 } 6412 ct = &des_b1_ct; 6413 } else { 6414 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) { 6415 continue; 6416 } 6417 } 6418 6419 if (func(arg, ct) != 0) { 6420 break; 6421 } 6422 } 6423 } 6424 6425 /* 6426 * (Like the Intel one, except for Cyrix CPUs) 6427 */ 6428 static void 6429 cyrix_walk_cacheinfo(struct cpuid_info *cpi, 6430 void *arg, int (*func)(void *, const struct cachetab *)) 6431 { 6432 const struct cachetab *ct; 6433 uint8_t *dp; 6434 int i; 6435 6436 if ((dp = cpi->cpi_cacheinfo) == NULL) 6437 return; 6438 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6439 /* 6440 * Search Cyrix-specific descriptor table first .. 6441 */ 6442 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) { 6443 if (func(arg, ct) != 0) 6444 break; 6445 continue; 6446 } 6447 /* 6448 * .. else fall back to the Intel one 6449 */ 6450 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) { 6451 if (func(arg, ct) != 0) 6452 break; 6453 continue; 6454 } 6455 } 6456 } 6457 6458 /* 6459 * A cacheinfo walker that adds associativity, line-size, and size properties 6460 * to the devinfo node it is passed as an argument. 
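 *
 * For example, descriptor 0x2c in intel_ctab above (an 8-way, 64-byte
 * line, 32KB l1-dcache) results in the properties
 * "l1-dcache-associativity" = 8, "l1-dcache-line-size" = 64 and
 * "l1-dcache-size" = 32768 being created on the node.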
6461 */ 6462 static int 6463 add_cacheent_props(void *arg, const struct cachetab *ct) 6464 { 6465 dev_info_t *devi = arg; 6466 6467 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc); 6468 if (ct->ct_line_size != 0) 6469 add_cache_prop(devi, ct->ct_label, line_str, 6470 ct->ct_line_size); 6471 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size); 6472 return (0); 6473 } 6474 6475 6476 static const char fully_assoc[] = "fully-associative?"; 6477 6478 /* 6479 * AMD style cache/tlb description 6480 * 6481 * Extended functions 5 and 6 directly describe properties of 6482 * tlbs and various cache levels. 6483 */ 6484 static void 6485 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6486 { 6487 switch (assoc) { 6488 case 0: /* reserved; ignore */ 6489 break; 6490 default: 6491 add_cache_prop(devi, label, assoc_str, assoc); 6492 break; 6493 case 0xff: 6494 add_cache_prop(devi, label, fully_assoc, 1); 6495 break; 6496 } 6497 } 6498 6499 static void 6500 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6501 { 6502 if (size == 0) 6503 return; 6504 add_cache_prop(devi, label, size_str, size); 6505 add_amd_assoc(devi, label, assoc); 6506 } 6507 6508 static void 6509 add_amd_cache(dev_info_t *devi, const char *label, 6510 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6511 { 6512 if (size == 0 || line_size == 0) 6513 return; 6514 add_amd_assoc(devi, label, assoc); 6515 /* 6516 * Most AMD parts have a sectored cache. Multiple cache lines are 6517 * associated with each tag. A sector consists of all cache lines 6518 * associated with a tag. For example, the AMD K6-III has a sector 6519 * size of 2 cache lines per tag. 6520 */ 6521 if (lines_per_tag != 0) 6522 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6523 add_cache_prop(devi, label, line_str, line_size); 6524 add_cache_prop(devi, label, size_str, size * 1024); 6525 } 6526 6527 static void 6528 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6529 { 6530 switch (assoc) { 6531 case 0: /* off */ 6532 break; 6533 case 1: 6534 case 2: 6535 case 4: 6536 add_cache_prop(devi, label, assoc_str, assoc); 6537 break; 6538 case 6: 6539 add_cache_prop(devi, label, assoc_str, 8); 6540 break; 6541 case 8: 6542 add_cache_prop(devi, label, assoc_str, 16); 6543 break; 6544 case 0xf: 6545 add_cache_prop(devi, label, fully_assoc, 1); 6546 break; 6547 default: /* reserved; ignore */ 6548 break; 6549 } 6550 } 6551 6552 static void 6553 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6554 { 6555 if (size == 0 || assoc == 0) 6556 return; 6557 add_amd_l2_assoc(devi, label, assoc); 6558 add_cache_prop(devi, label, size_str, size); 6559 } 6560 6561 static void 6562 add_amd_l2_cache(dev_info_t *devi, const char *label, 6563 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6564 { 6565 if (size == 0 || assoc == 0 || line_size == 0) 6566 return; 6567 add_amd_l2_assoc(devi, label, assoc); 6568 if (lines_per_tag != 0) 6569 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6570 add_cache_prop(devi, label, line_str, line_size); 6571 add_cache_prop(devi, label, size_str, size * 1024); 6572 } 6573 6574 static void 6575 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi) 6576 { 6577 struct cpuid_regs *cp; 6578 6579 if (cpi->cpi_xmaxeax < 0x80000005) 6580 return; 6581 cp = &cpi->cpi_extd[5]; 6582 6583 /* 6584 * 4M/2M L1 TLB configuration 6585 * 6586 * We report the size for 2M pages because AMD uses two 6587 * TLB 
entries for one 4M page. 6588 */ 6589 add_amd_tlb(devi, "dtlb-2M", 6590 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16)); 6591 add_amd_tlb(devi, "itlb-2M", 6592 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0)); 6593 6594 /* 6595 * 4K L1 TLB configuration 6596 */ 6597 6598 switch (cpi->cpi_vendor) { 6599 uint_t nentries; 6600 case X86_VENDOR_TM: 6601 if (cpi->cpi_family >= 5) { 6602 /* 6603 * Crusoe processors have 256 TLB entries, but 6604 * cpuid data format constrains them to only 6605 * reporting 255 of them. 6606 */ 6607 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255) 6608 nentries = 256; 6609 /* 6610 * Crusoe processors also have a unified TLB 6611 */ 6612 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24), 6613 nentries); 6614 break; 6615 } 6616 /*FALLTHROUGH*/ 6617 default: 6618 add_amd_tlb(devi, itlb4k_str, 6619 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16)); 6620 add_amd_tlb(devi, dtlb4k_str, 6621 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0)); 6622 break; 6623 } 6624 6625 /* 6626 * data L1 cache configuration 6627 */ 6628 6629 add_amd_cache(devi, l1_dcache_str, 6630 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16), 6631 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0)); 6632 6633 /* 6634 * code L1 cache configuration 6635 */ 6636 6637 add_amd_cache(devi, l1_icache_str, 6638 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16), 6639 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0)); 6640 6641 if (cpi->cpi_xmaxeax < 0x80000006) 6642 return; 6643 cp = &cpi->cpi_extd[6]; 6644 6645 /* Check for a unified L2 TLB for large pages */ 6646 6647 if (BITX(cp->cp_eax, 31, 16) == 0) 6648 add_amd_l2_tlb(devi, "l2-tlb-2M", 6649 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6650 else { 6651 add_amd_l2_tlb(devi, "l2-dtlb-2M", 6652 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6653 add_amd_l2_tlb(devi, "l2-itlb-2M", 6654 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6655 } 6656 6657 /* Check for a unified L2 TLB for 4K pages */ 6658 6659 if (BITX(cp->cp_ebx, 31, 16) == 0) { 6660 add_amd_l2_tlb(devi, "l2-tlb-4K", 6661 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6662 } else { 6663 add_amd_l2_tlb(devi, "l2-dtlb-4K", 6664 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6665 add_amd_l2_tlb(devi, "l2-itlb-4K", 6666 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6667 } 6668 6669 add_amd_l2_cache(devi, l2_cache_str, 6670 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12), 6671 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0)); 6672 } 6673 6674 /* 6675 * There are two basic ways that the x86 world describes it cache 6676 * and tlb architecture - Intel's way and AMD's way. 6677 * 6678 * Return which flavor of cache architecture we should use 6679 */ 6680 static int 6681 x86_which_cacheinfo(struct cpuid_info *cpi) 6682 { 6683 switch (cpi->cpi_vendor) { 6684 case X86_VENDOR_Intel: 6685 if (cpi->cpi_maxeax >= 2) 6686 return (X86_VENDOR_Intel); 6687 break; 6688 case X86_VENDOR_AMD: 6689 /* 6690 * The K5 model 1 was the first part from AMD that reported 6691 * cache sizes via extended cpuid functions. 6692 */ 6693 if (cpi->cpi_family > 5 || 6694 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 6695 return (X86_VENDOR_AMD); 6696 break; 6697 case X86_VENDOR_TM: 6698 if (cpi->cpi_family >= 5) 6699 return (X86_VENDOR_AMD); 6700 /*FALLTHROUGH*/ 6701 default: 6702 /* 6703 * If they have extended CPU data for 0x80000005 6704 * then we assume they have AMD-format cache 6705 * information. 
6706 * 6707 * If not, and the vendor happens to be Cyrix, 6708 * then try our-Cyrix specific handler. 6709 * 6710 * If we're not Cyrix, then assume we're using Intel's 6711 * table-driven format instead. 6712 */ 6713 if (cpi->cpi_xmaxeax >= 0x80000005) 6714 return (X86_VENDOR_AMD); 6715 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix) 6716 return (X86_VENDOR_Cyrix); 6717 else if (cpi->cpi_maxeax >= 2) 6718 return (X86_VENDOR_Intel); 6719 break; 6720 } 6721 return (-1); 6722 } 6723 6724 void 6725 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id, 6726 struct cpuid_info *cpi) 6727 { 6728 dev_info_t *cpu_devi; 6729 int create; 6730 6731 cpu_devi = (dev_info_t *)dip; 6732 6733 /* device_type */ 6734 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6735 "device_type", "cpu"); 6736 6737 /* reg */ 6738 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6739 "reg", cpu_id); 6740 6741 /* cpu-mhz, and clock-frequency */ 6742 if (cpu_freq > 0) { 6743 long long mul; 6744 6745 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6746 "cpu-mhz", cpu_freq); 6747 if ((mul = cpu_freq * 1000000LL) <= INT_MAX) 6748 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6749 "clock-frequency", (int)mul); 6750 } 6751 6752 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) { 6753 return; 6754 } 6755 6756 /* vendor-id */ 6757 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6758 "vendor-id", cpi->cpi_vendorstr); 6759 6760 if (cpi->cpi_maxeax == 0) { 6761 return; 6762 } 6763 6764 /* 6765 * family, model, and step 6766 */ 6767 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6768 "family", CPI_FAMILY(cpi)); 6769 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6770 "cpu-model", CPI_MODEL(cpi)); 6771 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6772 "stepping-id", CPI_STEP(cpi)); 6773 6774 /* type */ 6775 switch (cpi->cpi_vendor) { 6776 case X86_VENDOR_Intel: 6777 create = 1; 6778 break; 6779 default: 6780 create = 0; 6781 break; 6782 } 6783 if (create) 6784 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6785 "type", CPI_TYPE(cpi)); 6786 6787 /* ext-family */ 6788 switch (cpi->cpi_vendor) { 6789 case X86_VENDOR_Intel: 6790 case X86_VENDOR_AMD: 6791 create = cpi->cpi_family >= 0xf; 6792 break; 6793 default: 6794 create = 0; 6795 break; 6796 } 6797 if (create) 6798 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6799 "ext-family", CPI_FAMILY_XTD(cpi)); 6800 6801 /* ext-model */ 6802 switch (cpi->cpi_vendor) { 6803 case X86_VENDOR_Intel: 6804 create = IS_EXTENDED_MODEL_INTEL(cpi); 6805 break; 6806 case X86_VENDOR_AMD: 6807 create = CPI_FAMILY(cpi) == 0xf; 6808 break; 6809 default: 6810 create = 0; 6811 break; 6812 } 6813 if (create) 6814 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6815 "ext-model", CPI_MODEL_XTD(cpi)); 6816 6817 /* generation */ 6818 switch (cpi->cpi_vendor) { 6819 case X86_VENDOR_AMD: 6820 /* 6821 * AMD K5 model 1 was the first part to support this 6822 */ 6823 create = cpi->cpi_xmaxeax >= 0x80000001; 6824 break; 6825 default: 6826 create = 0; 6827 break; 6828 } 6829 if (create) 6830 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6831 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8)); 6832 6833 /* brand-id */ 6834 switch (cpi->cpi_vendor) { 6835 case X86_VENDOR_Intel: 6836 /* 6837 * brand id first appeared on Pentium III Xeon model 8, 6838 * and Celeron model 8 processors and Opteron 6839 */ 6840 create = cpi->cpi_family > 6 || 6841 (cpi->cpi_family == 6 && cpi->cpi_model >= 8); 6842 break; 6843 case X86_VENDOR_AMD: 6844 create = 
cpi->cpi_family >= 0xf; 6845 break; 6846 default: 6847 create = 0; 6848 break; 6849 } 6850 if (create && cpi->cpi_brandid != 0) { 6851 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6852 "brand-id", cpi->cpi_brandid); 6853 } 6854 6855 /* chunks, and apic-id */ 6856 switch (cpi->cpi_vendor) { 6857 /* 6858 * first available on Pentium IV and Opteron (K8) 6859 */ 6860 case X86_VENDOR_Intel: 6861 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf; 6862 break; 6863 case X86_VENDOR_AMD: 6864 create = cpi->cpi_family >= 0xf; 6865 break; 6866 default: 6867 create = 0; 6868 break; 6869 } 6870 if (create) { 6871 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6872 "chunks", CPI_CHUNKS(cpi)); 6873 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6874 "apic-id", cpi->cpi_apicid); 6875 if (cpi->cpi_chipid >= 0) { 6876 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6877 "chip#", cpi->cpi_chipid); 6878 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6879 "clog#", cpi->cpi_clogid); 6880 } 6881 } 6882 6883 /* cpuid-features */ 6884 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6885 "cpuid-features", CPI_FEATURES_EDX(cpi)); 6886 6887 6888 /* cpuid-features-ecx */ 6889 switch (cpi->cpi_vendor) { 6890 case X86_VENDOR_Intel: 6891 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf; 6892 break; 6893 case X86_VENDOR_AMD: 6894 create = cpi->cpi_family >= 0xf; 6895 break; 6896 default: 6897 create = 0; 6898 break; 6899 } 6900 if (create) 6901 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6902 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi)); 6903 6904 /* ext-cpuid-features */ 6905 switch (cpi->cpi_vendor) { 6906 case X86_VENDOR_Intel: 6907 case X86_VENDOR_AMD: 6908 case X86_VENDOR_Cyrix: 6909 case X86_VENDOR_TM: 6910 case X86_VENDOR_Centaur: 6911 create = cpi->cpi_xmaxeax >= 0x80000001; 6912 break; 6913 default: 6914 create = 0; 6915 break; 6916 } 6917 if (create) { 6918 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6919 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi)); 6920 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6921 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi)); 6922 } 6923 6924 /* 6925 * Brand String first appeared in Intel Pentium IV, AMD K5 6926 * model 1, and Cyrix GXm. On earlier models we try to 6927 * simulate something similar .. so this string should always 6928 * say -something- about the processor, however lame.
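 *
 * The value is whatever cpuid_pass3() left in cpi_brandstr: either the
 * cleaned-up hardware string (hypothetical example:
 * "Intel(r) Xeon(r) CPU E5-2690 0 @ 2.90GHz", after the (R)/(TM)
 * mapping done in pass 3) or whatever fabricate_brandstr() made up
 * (e.g. "GenuineIntel 6.14.8").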

static void
amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
{
	struct cpuid_regs *cp;
	uint_t size, assoc;
	int i;
	int *ip;

	if (cpi->cpi_xmaxeax < 0x80000006)
		return;
	cp = &cpi->cpi_extd[6];

	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
		uint_t cachesz = size * 1024;
		assoc = amd_afd[i];

		ASSERT(assoc != -1);

		if ((ip = l2i->l2i_csz) != NULL)
			*ip = cachesz;
		if ((ip = l2i->l2i_lsz) != NULL)
			*ip = BITX(cp->cp_ecx, 7, 0);
		if ((ip = l2i->l2i_assoc) != NULL)
			*ip = assoc;
		l2i->l2i_ret = cachesz;
	}
}

int
getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	struct l2info __l2info, *l2i = &__l2info;

	l2i->l2i_csz = csz;
	l2i->l2i_lsz = lsz;
	l2i->l2i_assoc = assoc;
	l2i->l2i_ret = -1;

	switch (x86_which_cacheinfo(cpi)) {
	case X86_VENDOR_Intel:
		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
		break;
	case X86_VENDOR_Cyrix:
		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
		break;
	case X86_VENDOR_AMD:
		amd_l2cacheinfo(cpi, l2i);
		break;
	default:
		break;
	}
	return (l2i->l2i_ret);
}
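
/*
 * A hypothetical caller of getl2cacheinfo() might look like the sketch
 * below; any of the output pointers may be passed as NULL if that field
 * is not needed, and a return value of -1 means no L2 information was
 * found:
 *
 *	int csz, lsz, assoc;
 *
 *	if (getl2cacheinfo(CPU, &csz, &lsz, &assoc) != -1)
 *		... use csz, lsz and assoc ...
 */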

#if !defined(__xpv)

uint32_t *
cpuid_mwait_alloc(cpu_t *cpu)
{
	uint32_t *ret;
	size_t mwait_size;

	ASSERT(cpuid_checkpass(CPU, 2));

	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
	if (mwait_size == 0)
		return (NULL);

	/*
	 * kmem_alloc() returns cache line size aligned data for mwait_size
	 * allocations.  mwait_size is currently cache line sized.  Neither
	 * of these implementation details is guaranteed to be true in the
	 * future.
	 *
	 * First try allocating mwait_size as kmem_alloc() currently returns
	 * correctly aligned memory.  If kmem_alloc() does not return
	 * mwait_size aligned memory, then allocate twice the size and round
	 * the pointer up to an mwait_size boundary.
	 *
	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
	 * decide to free this memory.
	 */
	ret = kmem_zalloc(mwait_size, KM_SLEEP);
	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
		*ret = MWAIT_RUNNING;
		return (ret);
	} else {
		kmem_free(ret, mwait_size);
		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
		*ret = MWAIT_RUNNING;
		return (ret);
	}
}
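
/*
 * Worked example of the fallback path above (hypothetical addresses),
 * assuming mwait_size is a power of two as it is today: with
 * mwait_size == 0x40, kmem_zalloc(0x80) might return 0xfffffe0012345430,
 * which is not 0x40-aligned.  P2ROUNDUP() rounds that up to
 * 0xfffffe0012345440, and the 0x40 bytes starting there still lie inside
 * the 0x80-byte allocation, so the monitored line ends up properly
 * aligned while buf_actual/size_actual still describe the raw allocation
 * that cpuid_mwait_free() must release.
 */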

void
cpuid_mwait_free(cpu_t *cpu)
{
	if (cpu->cpu_m.mcpu_cpi == NULL) {
		return;
	}

	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
	}

	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
}

void
patch_tsc_read(int flag)
{
	size_t cnt;

	switch (flag) {
	case TSC_NONE:
		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
		break;
	case TSC_RDTSC_MFENCE:
		cnt = &_tsc_mfence_end - &_tsc_mfence_start;
		(void) memcpy((void *)tsc_read,
		    (void *)&_tsc_mfence_start, cnt);
		break;
	case TSC_RDTSC_LFENCE:
		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
		(void) memcpy((void *)tsc_read,
		    (void *)&_tsc_lfence_start, cnt);
		break;
	case TSC_TSCP:
		cnt = &_tscp_end - &_tscp_start;
		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
		break;
	default:
		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
		break;
	}
	tsc_type = flag;
}

int
cpuid_deep_cstates_supported(void)
{
	struct cpuid_info *cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));

	cpi = CPU->cpu_m.mcpu_cpi;

	if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
		return (0);

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		if (cpi->cpi_xmaxeax < 0x80000007)
			return (0);

		/*
		 * Does the TSC run at a constant rate in all ACPI C-states?
		 */
		regs.cp_eax = 0x80000007;
		(void) __cpuid_insn(&regs);
		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);

	default:
		return (0);
	}
}

#endif	/* !__xpv */

void
post_startup_cpu_fixups(void)
{
#ifndef __xpv
	/*
	 * Some AMD processors support C1E state.  Entering this state will
	 * cause the local APIC timer to stop, which we can't deal with at
	 * this time.
	 */
	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
		on_trap_data_t otd;
		uint64_t reg;

		if (!on_trap(&otd, OT_DATA_ACCESS)) {
			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
			/* Disable C1E state if it is enabled by BIOS */
			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
			    AMD_ACTONCMPHALT_MASK) {
				reg &= ~(AMD_ACTONCMPHALT_MASK <<
				    AMD_ACTONCMPHALT_SHIFT);
				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
			}
		}
		no_trap();
	}
#endif	/* !__xpv */
}
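
/*
 * When CR4.PCIDE is enabled, the low 12 bits of %cr3 select the current
 * process-context identifier rather than being ignored, which is why
 * enable_pcid() below asserts that the PCID field of %cr3 is still
 * PCID_NONE before setting the bit.  For example, once PCIDE is on, a
 * %cr3 value whose low bits are 0x001 would run with PCID 1.
 */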

void
enable_pcid(void)
{
	if (x86_use_pcid == -1)
		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);

	if (x86_use_invpcid == -1) {
		x86_use_invpcid = is_x86_feature(x86_featureset,
		    X86FSET_INVPCID);
	}

	if (!x86_use_pcid)
		return;

	/*
	 * Intel says that on setting PCIDE, it immediately starts using the
	 * PCID bits; better make sure there's nothing there.
	 */
	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);

	setcr4(getcr4() | CR4_PCIDE);
}

/*
 * Setup necessary registers to enable XSAVE feature on this processor.
 * This function needs to be called early enough, so that no xsave/xrstor
 * ops will execute on the processor before the MSRs are properly set up.
 *
 * Current implementation has the following assumptions:
 * - cpuid_pass1() is done, so that X86 features are known.
 * - fpu_probe() is done, so that fp_save_mech is chosen.
 */
void
xsave_setup_msr(cpu_t *cpu)
{
	ASSERT(fp_save_mech == FP_XSAVE);
	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));

	/* Enable OSXSAVE in CR4. */
	setcr4(getcr4() | CR4_OSXSAVE);
	/*
	 * Update the SW copy of ECX, so that /dev/cpu/self/cpuid will report
	 * the correct value.
	 */
	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
	setup_xfem();
}

/*
 * Starting with the Westmere processor the local
 * APIC timer will continue running in all C-states,
 * including the deepest C-states.
 */
int
cpuid_arat_supported(void)
{
	struct cpuid_info *cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));
	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));

	cpi = CPU->cpu_m.mcpu_cpi;

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		/*
		 * Always-running Local APIC Timer is
		 * indicated by CPUID.6.EAX[2].
		 */
		if (cpi->cpi_maxeax >= 6) {
			regs.cp_eax = 6;
			(void) cpuid_insn(NULL, &regs);
			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
		} else {
			return (0);
		}
	default:
		return (0);
	}
}

/*
 * Check support for the Intel ENERGY_PERF_BIAS feature.
 */
int
cpuid_iepb_supported(struct cpu *cp)
{
	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(cp, 1));

	if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
	    !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
		return (0);
	}

	/*
	 * The Intel ENERGY_PERF_BIAS MSR is indicated by
	 * capability bit CPUID.6.ECX[3].
	 */
	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
		return (0);

	regs.cp_eax = 0x6;
	(void) cpuid_insn(NULL, &regs);
	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
}

/*
 * Check support for the TSC deadline timer.
 *
 * The TSC deadline timer provides a superior software programming model
 * over the local APIC timer that eliminates "time drifts".  Instead of
 * specifying a relative time, software specifies an absolute time as the
 * target at which the processor should generate a timer event.
 */
int
cpuid_deadline_tsc_supported(void)
{
	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));
	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		if (cpi->cpi_maxeax >= 1) {
			regs.cp_eax = 1;
			(void) cpuid_insn(NULL, &regs);
			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
		} else {
			return (0);
		}
	default:
		return (0);
	}
}

#if defined(__amd64) && !defined(__xpv)
/*
 * Patch in versions of bcopy for high-performance Intel Nhm processors
 * and later...
 */
void
patch_memops(uint_t vendor)
{
	size_t cnt, i;
	caddr_t to, from;

	if ((vendor == X86_VENDOR_Intel) &&
	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
		cnt = &bcopy_patch_end - &bcopy_patch_start;
		to = &bcopy_ck_size;
		from = &bcopy_patch_start;
		for (i = 0; i < cnt; i++) {
			*to++ = *from++;
		}
	}
}
#endif	/* __amd64 && !__xpv */

/*
 * We're being asked to tell the system how many bits are required to
 * represent the various thread and strand IDs. While it's tempting to
 * derive this based on the values in cpi_ncore_per_chip and
 * cpi_ncpu_per_chip, that isn't quite correct. Instead, this needs to be
 * based on the number of bits that the APIC allows for these different
 * configurations. We only update these to a larger value if we find one.
 */
void
cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
{
	struct cpuid_info *cpi;

	VERIFY(cpuid_checkpass(CPU, 1));
	cpi = cpu->cpu_m.mcpu_cpi;

	if (cpi->cpi_ncore_bits > *core_nbits) {
		*core_nbits = cpi->cpi_ncore_bits;
	}

	if (cpi->cpi_nthread_bits > *strand_nbits) {
		*strand_nbits = cpi->cpi_nthread_bits;
	}
}
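
/*
 * For example, if the APIC ID layout reserves cpi_nthread_bits == 1 and
 * cpi_ncore_bits == 3, then the low bit of an APIC ID identifies the
 * strand and the next three bits identify the core, allowing up to
 * 2 strands on each of up to 8 cores even if fewer are actually
 * populated.  Callers accumulate the widest layout seen across all CPUs,
 * since cpuid_get_ext_topo() only ever increases *core_nbits and
 * *strand_nbits.
 */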

void
cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	struct cpuid_regs cp;

	/*
	 * Reread the CPUID portions that we need for various security
	 * information.
	 */
	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
		/*
		 * Check if we now have leaf 7 available to us.
		 */
		if (cpi->cpi_maxeax < 7) {
			bzero(&cp, sizeof (cp));
			cp.cp_eax = 0;
			cpi->cpi_maxeax = __cpuid_insn(&cp);
			if (cpi->cpi_maxeax < 7)
				return;
		}

		bzero(&cp, sizeof (cp));
		cp.cp_eax = 7;
		cp.cp_ecx = 0;
		(void) __cpuid_insn(&cp);
		cpi->cpi_std[7] = cp;
	} else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
		/* No xcpuid support */
		if (cpi->cpi_family < 5 ||
		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
			return;

		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
			bzero(&cp, sizeof (cp));
			cp.cp_eax = CPUID_LEAF_EXT_0;
			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
				return;
			}
		}

		bzero(&cp, sizeof (cp));
		cp.cp_eax = CPUID_LEAF_EXT_8;
		(void) __cpuid_insn(&cp);
		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
		cpi->cpi_extd[8] = cp;
	} else {
		/*
		 * Nothing to do here. Return an empty set which has already
		 * been zeroed for us.
		 */
		return;
	}
	cpuid_scan_security(cpu, fset);
}

/* ARGSUSED */
static int
cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
{
	uchar_t *fset;
	boolean_t first_pass = (boolean_t)arg1;

	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
	if (first_pass && CPU->cpu_id != 0)
		return (0);
	if (!first_pass && CPU->cpu_id == 0)
		return (0);
	cpuid_pass_ucode(CPU, fset);

	return (0);
}
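
/*
 * Layout of the scratch buffer that cpuid_post_ucodeadm() below hands to
 * cpuid_post_ucodeadm_xc() via xc_sync(): argdata is NCPU consecutive
 * feature-set slots, each sizeof (x86_featureset) bytes, and CPU n records
 * its rescanned security features at byte offset
 * n * sizeof (x86_featureset):
 *
 *	[ CPU 0 fset | CPU 1 fset | ... | CPU NCPU-1 fset ]
 *
 * Slot 0 is later compared against every other populated slot.
 */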

/*
 * After a microcode update where the version has changed, we need to
 * rescan CPUID. To do this we check every CPU to make sure that they have
 * the same microcode. Then we perform a cross call to all such CPUs. It's
 * the caller's job to make sure that no one else can end up doing an
 * update while this is going on.
 *
 * We assume that the system is microcode capable if we're called.
 */
void
cpuid_post_ucodeadm(void)
{
	uint32_t rev;
	int i;
	struct cpu *cpu;
	cpuset_t cpuset;
	void *argdata;
	uchar_t *f0;

	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);

	mutex_enter(&cpu_lock);
	cpu = cpu_get(0);
	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
	CPUSET_ONLY(cpuset, 0);
	for (i = 1; i < max_ncpus; i++) {
		if ((cpu = cpu_get(i)) == NULL)
			continue;

		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
			panic("post microcode update CPU %d has differing "
			    "microcode revision (%u) from CPU 0 (%u)",
			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
		}
		CPUSET_ADD(cpuset, i);
	}

	/*
	 * We do the cross calls in two passes. The first pass is only for
	 * the boot CPU. The second pass is for all of the other CPUs. This
	 * allows the boot CPU to go through and change behavior related to
	 * patching or whether or not Enhanced IBRS needs to be enabled, and
	 * then allows all other CPUs to follow suit.
	 */
	kpreempt_disable();
	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	kpreempt_enable();

	/*
	 * OK, now look at each CPU and see if their feature sets are equal.
	 */
	f0 = argdata;
	for (i = 1; i < max_ncpus; i++) {
		uchar_t *fset;
		if (!CPU_IN_SET(cpuset, i))
			continue;

		fset = (uchar_t *)((uintptr_t)argdata +
		    sizeof (x86_featureset) * i);

		if (!compare_x86_featureset(f0, fset)) {
			panic("Post microcode update CPU %d has "
			    "differing security feature (%p) set from CPU 0 "
			    "(%p), not appending to feature set", i,
			    (void *)fset, (void *)f0);
		}
	}

	mutex_exit(&cpu_lock);

	/*
	 * Only note and merge features that the boot CPU picked up as a
	 * result of the update; features we already had are left alone.
	 */
	for (i = 0; i < NUM_X86_FEATURES; i++) {
		if (!is_x86_feature(x86_featureset, i) &&
		    is_x86_feature(f0, i)) {
			cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
			    x86_feature_names[i]);
			add_x86_feature(x86_featureset, i);
		}
	}
	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
}