/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Portions Copyright 2009 Advanced Micro Devices, Inc.
 */
/*
 * Copyright 2020 Joyent, Inc.
 */

/*
 * CPU Identification logic
 *
 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
 * with the identification of CPUs, their features, and their topologies. More
 * specifically, this file helps drive the following:
 *
 * 1. Enumeration of features of the processor which are used by the kernel to
 *    determine what features to enable or disable. These may be instruction
 *    set enhancements or features that we use.
 *
 * 2. Enumeration of instruction set architecture (ISA) additions that
 *    userland will be told about through the auxiliary vector.
 *
 * 3. Understanding the physical topology of the CPU such as the number of
 *    caches, how many cores it has, whether or not it supports simultaneous
 *    multi-threading (SMT), etc.
 *
 * ------------------------
 * CPUID History and Basics
 * ------------------------
 *
 * The cpuid instruction was added by Intel roughly around the time that the
 * original Pentium was introduced. The purpose of cpuid was to provide, in a
 * programmatic fashion, information about the CPU that previously had to be
 * guessed at. For example, an important part of cpuid is that we can know
 * what extensions to the ISA exist. Using an invalid opcode would normally
 * cause a #UD, so this method allows a program (whether a user program or the
 * kernel) to determine what exists without crashing or getting a SIGILL. Of
 * course, this was also during the era of the clones and the AMD Am5x86. The
 * vendor name shows up first in cpuid for a reason.
 *
 * cpuid information is broken down into ranges called a 'leaf'. Each leaf
 * puts unique values into the registers %eax, %ebx, %ecx, and %edx and each
 * leaf has its own meaning. The different leaves are broken down into
 * different regions:
 *
 * [ 0, 7fffffff ]         This region is called the 'basic' region. This
 *                         region is generally defined by Intel, though some
 *                         of the original portions have different meanings
 *                         based on the manufacturer. These days, Intel adds
 *                         most new features to this region. AMD adds
 *                         non-Intel compatible information in the third,
 *                         extended region.
 *                         Intel uses this for everything including ISA
 *                         extensions, CPU features, cache information,
 *                         topology, and more.
 *
 *                         There is a hole carved out of this region which is
 *                         reserved for hypervisors.
 *
 * [ 40000000, 4fffffff ]  This region, which is found in the middle of the
 *                         previous region, is explicitly promised to never
 *                         be used by CPUs. Instead, it is used by hypervisors
 *                         to communicate information about themselves to the
 *                         operating system. The values and details are unique
 *                         for each hypervisor.
 *
 * [ 80000000, ffffffff ]  This region is called the 'extended' region. Some
 *                         of the low leaves mirror parts of the basic leaves.
 *                         This region has generally been used by AMD for
 *                         various extensions. For example, AMD-specific
 *                         information about caches, features, and topology
 *                         are found in this region.
 *
 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
 * and %edx, and then issue the cpuid instruction. At the first leaf in each
 * of the ranges, one of the primary things returned is the maximum valid leaf
 * in that range. This allows for discovery of what range of CPUID is valid.
 *
 * The CPUs have potentially surprising behavior when using an invalid leaf or
 * unimplemented leaf. If the requested leaf is within the valid basic or
 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will
 * be set to zero. However, if you specify a leaf that is outside of a valid
 * range, then instead it will be filled with the last valid _basic_ leaf. For
 * example, if the maximum basic value is on leaf 0x3, then issuing a cpuid
 * for leaf 4 or an invalid extended leaf will return the information for
 * leaf 3.
 *
 * Some leaves are broken down into sub-leaves. This means that the value
 * depends on both the leaf asked for in %eax and a secondary register. For
 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to
 * get additional information. Or when getting topology information in leaf
 * 0xb, the initial value in %ecx changes which level of the topology that you
 * are getting information about.
 *
 * cpuid values are always kept to 32 bits regardless of whether or not the
 * program is in 64-bit mode. When executing in 64-bit mode, the upper 32 bits
 * of the register are always set to zero so that the values are the same
 * regardless of execution mode.
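 *
 * To make the mechanics above concrete, the following user-level sketch (not
 * code from this file; it assumes a GCC-style compiler on x86 and C99) issues
 * leaf 0 to read the maximum basic leaf and the vendor string:
 *
 *     #include <stdio.h>
 *     #include <string.h>
 *     #include <stdint.h>
 *
 *     static void
 *     cpuid(uint32_t leaf, uint32_t subleaf, uint32_t *eax, uint32_t *ebx,
 *         uint32_t *ecx, uint32_t *edx)
 *     {
 *             __asm__ __volatile__("cpuid"
 *                 : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
 *                 : "0" (leaf), "2" (subleaf));
 *     }
 *
 *     int
 *     main(void)
 *     {
 *             uint32_t eax, ebx, ecx, edx;
 *             char vendor[13];
 *
 *             cpuid(0, 0, &eax, &ebx, &ecx, &edx);
 *             // The 12-byte vendor string comes back in %ebx, %edx, %ecx.
 *             (void) memcpy(vendor, &ebx, 4);
 *             (void) memcpy(vendor + 4, &edx, 4);
 *             (void) memcpy(vendor + 8, &ecx, 4);
 *             vendor[12] = '\0';
 *             (void) printf("vendor %s, max basic leaf 0x%x\n", vendor, eax);
 *             return (0);
 *     }
 *
 * The same cpuid() helper is reused in the later sketches in this comment.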
 *
 * ----------------------
 * Identifying Processors
 * ----------------------
 *
 * We can identify a processor in two steps. The first step looks at cpuid
 * leaf 0. Leaf 0 contains the processor's vendor information. This is done by
 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
 *
 * From there, a processor is identified by a combination of three different
 * values:
 *
 * 1. Family
 * 2. Model
 * 3. Stepping
 *
 * Each vendor uses the family and model to uniquely identify a processor. The
 * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since
 * the Pentium Pro/Pentium II era, often called the P6. The model is used to
 * identify the exact processor. Different models are often used for the
 * client (consumer) and server parts. Even though each processor often has
 * major architectural differences, they still are considered the same family
 * by Intel.
 *
 * On the other hand, each major AMD architecture generally has its own
 * family. For example, the K8 is family 0xf, Bulldozer is family 0x15, and
 * Zen is family 0x17. Within a family, the model number is used to help
 * identify specific processors.
 *
 * The stepping is used to refer to a revision of a specific microprocessor.
 * The term comes from equipment used to produce masks that are used to create
 * integrated circuits.
 *
 * The information is present in leaf 1, %eax. In technical documentation you
 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. If the value in either is
 * 0xf, then one is to consult the extended model and extended family, which
 * take previously reserved bits and allow for a larger number of models and
 * add 0xf to them.
 *
 * When we process this information, we store the full family, model, and
 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
 * cpi_step, respectively. Whenever you are performing comparisons with the
 * family, model, and stepping, you should use these members and not the raw
 * values from cpuid. If you must use the raw values from cpuid directly, you
 * must make sure that you add the extended model and family to the base model
 * and family.
 *
 * In general, we do not use information about the family, model, and stepping
 * to determine whether or not a feature is present; that is generally driven
 * by specific leaves. However, when something we care about on the processor
 * is not considered 'architectural', meaning that it is specific to a set of
 * processors and not promised in the architecture model to be consistent from
 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around
 * errata in the processor, when we are dealing with processor-specific
 * features such as CPU performance counters, or when we want to provide
 * additional information for things such as fault management.
 *
 * While a processor also has a brand string, which is the name that people
 * are familiar with when buying the processor, it is not meant for
 * programmatic consumption. That is what the family, model, and stepping are
 * for.
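 *
 * To make the encoding above concrete, here is a sketch (not the code used by
 * this file; decode_fms() is a hypothetical helper) of how the raw leaf 1
 * %eax value is commonly decomposed. The exact condition under which the
 * extended model applies differs slightly between vendor manuals, so treat
 * this as illustrative only:
 *
 *     #include <stdint.h>
 *
 *     struct fms {
 *             uint32_t f;     // full family, like cpi_family
 *             uint32_t m;     // full model, like cpi_model
 *             uint32_t s;     // stepping, like cpi_step
 *     };
 *
 *     static struct fms
 *     decode_fms(uint32_t eax)        // eax: raw value from leaf 1, %eax
 *     {
 *             struct fms r;
 *             uint32_t family = (eax >> 8) & 0xf;
 *             uint32_t model = (eax >> 4) & 0xf;
 *
 *             r.s = eax & 0xf;
 *             r.f = family;
 *             r.m = model;
 *             if (family == 0xf)
 *                     r.f += (eax >> 20) & 0xff;      // extended family
 *             if (family == 0xf || family == 0x6)
 *                     r.m += ((eax >> 16) & 0xf) << 4;  // extended model
 *             return (r);
 *     }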
 *
 * ------------
 * CPUID Passes
 * ------------
 *
 * As part of performing feature detection, we break this into several
 * different passes. The passes are as follows:
 *
 * Pass 0      This is a primordial pass done in locore.s to deal with
 *             Cyrix CPUs that don't support cpuid. The reality is that
 *             we likely don't run on them any more, but there is still
 *             logic for handling them.
 *
 * Pass 1      This is the primary pass and is responsible for doing a
 *             large number of different things:
 *
 *             1. Determining which vendor manufactured the CPU and the
 *             family, model, and stepping information.
 *
 *             2. Gathering a large number of feature flags to determine
 *             which features the CPU supports and which indicate things
 *             that we need to do other work in the OS to enable. Features
 *             detected this way are added to the x86_featureset which can
 *             be queried to determine what we should do. This includes
 *             processing all of the basic and extended CPU features that
 *             we care about.
 *
 *             3. Determining the CPU's topology. This includes information
 *             about how many cores and threads are present in the package.
 *             It also is responsible for figuring out which logical CPUs
 *             are potentially part of the same core and what other
 *             resources they might share. For more information see the
 *             'Topology' section.
 *
 *             4. Determining the set of CPU security-specific features
 *             that we need to worry about and determining the appropriate
 *             set of workarounds.
 *
 *             Pass 1 on the boot CPU occurs before KMDB is started.
 *
 * Pass 2      The second pass is done after startup(). Here, we check
 *             other miscellaneous features. Most of this is gathering
 *             additional basic and extended features that we'll use in
 *             later passes or for debugging support.
 *
 * Pass 3      The third pass occurs after the kernel memory allocator
 *             has been fully initialized. This gathers information where
 *             we might need dynamic memory available for our uses. This
 *             includes several varying width leaves that have cache
 *             information and the processor's brand string.
 *
 * Pass 4      The fourth and final normal pass is performed after the
 *             kernel has brought most everything online. This is invoked
 *             from post_startup(). In this pass, we go through the set of
 *             features that we have enabled and turn that into the
 *             hardware auxiliary vector features that userland receives.
 *             This is used by userland, primarily by the run-time
 *             link-editor (RTLD), though userland software could also
 *             refer to it directly.
 *
 * Microcode   After a microcode update, we do a selective rescan of the
 *             cpuid leaves to determine what features have changed.
 *             Microcode updates can provide more details about security
 *             related features to deal with issues like Spectre and L1TF.
 *             On occasion, vendors have violated their contract and
 *             removed bits. However, we don't try to detect that because
 *             that puts us in a situation that we really can't deal with.
 *             As such, the only things we rescan today are security
 *             related features. See cpuid_pass_ucode().
 *
 * All of the passes (except pass 0) are run on all CPUs. However, for the
 * most part we only care about what the boot CPU says about this information
 * and use the other CPUs as a rough guide to sanity check that we have the
 * same feature set.
 *
 * We do not support running multiple logical CPUs with disjoint, let alone
 * different, feature sets.
 *
 * ------------------
 * Processor Topology
 * ------------------
 *
 * One of the important things that we need to do is to understand the
 * topology of the underlying processor. When we say topology in this case,
 * we're trying to understand the relationship between the logical CPUs that
 * the operating system sees and the underlying physical layout. Different
 * logical CPUs may share different resources which can have important
 * consequences for the performance of the system. For example, they may share
 * caches, execution units, and more.
 *
 * The topology of the processor changes from generation to generation and
 * vendor to vendor. Along with that, different vendors use different
 * terminology, and the operating system itself uses occasionally overlapping
 * terminology.
 * It's important to understand what this topology looks like so one can
 * understand the different things that we try to calculate and determine.
 *
 * To get started, let's talk about a little bit of terminology that we've
 * used so far, is used throughout this file, and is fairly generic across
 * multiple vendors:
 *
 * CPU
 *     A central processing unit (CPU) refers to a logical and/or virtual
 *     entity that the operating system can execute instructions on. The
 *     underlying resources for this CPU may be shared between multiple
 *     entities; however, to the operating system it is a discrete unit.
 *
 * PROCESSOR and PACKAGE
 *
 *     Generally, when we use the term 'processor' on its own, we are
 *     referring to the physical entity that one buys and plugs into a board.
 *     However, because processor has been overloaded and one might see it
 *     used to mean multiple different levels, we will instead use the term
 *     'package' for the rest of this file. The term package comes from the
 *     electrical engineering side and refers to the physical entity that
 *     encloses the electronics inside. Strictly speaking the package can
 *     contain more than just the CPU, for example, on many processors it may
 *     also have what's called an 'integrated graphical processing unit
 *     (GPU)'. Because the package can encapsulate multiple units, it is the
 *     largest physical unit that we refer to.
 *
 * SOCKET
 *
 *     A socket refers to a unit on a system board (generally the motherboard)
 *     that can receive a package. A single package, or processor, is plugged
 *     into a single socket. A system may have multiple sockets. Oftentimes,
 *     the term socket is used interchangeably with package and refers to the
 *     electrical component that has been plugged in, and not the receptacle
 *     itself.
 *
 * CORE
 *
 *     A core refers to the physical instantiation of a CPU, generally, with a
 *     full set of hardware resources available to it. A package may contain
 *     multiple cores inside of it or it may just have a single one. A
 *     processor with more than one core is often referred to as 'multi-core'.
 *     In illumos, we will use the feature X86FSET_CMP to refer to a system
 *     that has 'multi-core' processors.
 *
 *     A core may expose a single logical CPU to the operating system, or it
 *     may expose multiple CPUs, which we call threads, defined below.
 *
 *     Some resources may still be shared by cores in the same package. For
 *     example, many processors will share the level 3 cache between cores.
 *     Some AMD generations share hardware resources between cores. For more
 *     information on that see the section 'AMD Topology'.
 *
 * THREAD and STRAND
 *
 *     In this file, generally a thread refers to a hardware resource and not
 *     the operating system's logical abstraction. A thread is always exposed
 *     as an independent logical CPU to the operating system. A thread belongs
 *     to a specific core. A core may have more than one thread. When that is
 *     the case, the threads that are part of the same core are often referred
 *     to as 'siblings'.
 *
 *     When multiple threads exist, this is generally referred to as
 *     simultaneous multi-threading (SMT). When Intel introduced this in their
 *     processors they called it hyper-threading (HT). When multiple threads
 *     are active in a core, they split the resources of the core.
 *     For example, two threads may share the same set of hardware execution
 *     units.
 *
 *     The operating system often uses the term 'strand' to refer to a thread.
 *     This helps disambiguate it from the software concept.
 *
 * CHIP
 *
 *     Unfortunately, the term 'chip' is dramatically overloaded. At its most
 *     basic meaning, it is used to refer to a single integrated circuit,
 *     which may or may not be the only thing in the package. In illumos, when
 *     you see the term 'chip' it is almost always referring to the same thing
 *     as the 'package'. However, many vendors may use chip to refer to one of
 *     many integrated circuits that have been placed in the package. As an
 *     example, see the subsequent definition.
 *
 *     To try and keep things consistent, we will only use chip when referring
 *     to the entire integrated circuit package, with the exception of the
 *     definition of multi-chip module (because it is in the name) and use the
 *     term 'die' when we want the more general, potentially sub-component
 *     definition.
 *
 * DIE
 *
 *     A die refers to an integrated circuit. Inside of the package there may
 *     be a single die or multiple dies. This is sometimes called a 'chip' in
 *     vendor's parlance, but in this file, we use the term die to refer to a
 *     subcomponent.
 *
 * MULTI-CHIP MODULE
 *
 *     A multi-chip module (MCM) refers to putting multiple distinct chips
 *     that are connected together in the same package. When a multi-chip
 *     design is used, generally each chip is manufactured independently and
 *     then joined together in the package. For example, on AMD's Zen
 *     microarchitecture (family 0x17), the package contains several dies (the
 *     second meaning of chip from above) that are connected together.
 *
 * CACHE
 *
 *     A cache is a part of the processor that maintains copies of recently
 *     accessed memory. Caches are split into levels and then into types.
 *     Commonly there are one to three levels, called level one, two, and
 *     three. The lower the level, the smaller it is, the closer it is to the
 *     execution units of the CPU, and the faster it is to access. The layout
 *     and design of the cache come in many different flavors, consult other
 *     resources for a discussion of those.
 *
 *     Caches are generally split into two types, the instruction and data
 *     cache. The caches contain what their names suggest, the instruction
 *     cache has executable program text, while the data cache has all other
 *     memory that the processor accesses. As of this writing, data is kept
 *     coherent between all of the caches on x86, so if one modifies program
 *     text before it is executed, that will be in the data cache, and the
 *     instruction cache will be synchronized with that change when the
 *     processor actually executes those instructions. This coherency also
 *     covers the fact that data could show up in multiple caches.
 *
 *     Generally, the lowest level caches are specific to a core. However, the
 *     last level cache is shared between some number of cores. The number of
 *     CPUs sharing this last level cache is important. This has implications
 *     for the choices that the scheduler makes, as accessing memory that
 *     might be in a remote cache after thread migration can be quite
 *     expensive.
 *
 *     Sometimes, the word cache is abbreviated with a '$', because in US
 *     English the word cache is pronounced the same as cash.
 *     So L1D$ refers to the L1 data cache, and L2$ would be the L2 cache.
 *     This will not be used in the rest of this theory statement for clarity.
 *
 * MEMORY CONTROLLER
 *
 *     The memory controller is a component that provides access to DRAM.
 *     Each memory controller can access a set number of DRAM channels. Each
 *     channel can have a number of DIMMs (sticks of memory) associated with
 *     it. A given package may have more than one memory controller. The
 *     association of the memory controller to a group of cores is important
 *     as it is cheaper to access memory on the controller that you are
 *     associated with.
 *
 * NUMA
 *
 *     NUMA, or non-uniform memory access, describes a way that systems are
 *     built. On x86, any processor core can address all of the memory in the
 *     system. However, when using multiple sockets or possibly within a
 *     multi-chip module, some of that memory is physically closer and some of
 *     it is further. Memory that is further away is more expensive to access.
 *     Consider the following image of multiple sockets with memory:
 *
 *     +--------+                                                +--------+
 *     | DIMM A |         +----------+      +----------+         | DIMM D |
 *     +--------+-+       |          |      |          |       +-+------+-+
 *       | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
 *     +--------+-+       |          |      |          |       +-+------+-+
 *     | DIMM C |         +----------+      +----------+         | DIMM F |
 *     +--------+                                                +--------+
 *
 *     In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
 *     closer to DIMMs D-F. This means that it is cheaper for socket 0 to
 *     access DIMMs A-C and more expensive to access D-F as it has to go
 *     through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
 *     D-F are cheaper than A-C. While multiple sockets are the most common
 *     case where this occurs, it can also sometimes occur when using
 *     multi-chip modules. For another example of this that's more involved,
 *     see the AMD topology section.
 *
 *
 * Intel Topology
 * --------------
 *
 * Most Intel processors since Nehalem (as of this writing the current
 * generation is Skylake / Cannon Lake) follow a fairly similar pattern. The
 * CPU portion of the package is a single monolithic die. MCMs currently
 * aren't used. Most parts have three levels of caches, with the L3 cache
 * being shared between all of the cores on the package. The L1/L2 cache is
 * generally specific to an individual core. The following image shows at a
 * simplified level what this looks like. The memory controller is commonly
 * part of something called the 'Uncore', which used to be separate physical
 * chips that were not a part of the package, but are now part of the same
 * chip.
 *
 * +----------------------------------------------------------------+
 * | Package                                                        |
 * | +------------------+ +------------------+ +------------------+ |
 * | | Core             | | Core             | | Core             | |
 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
 * | | +--------+ |   | | | +--------+ |   | | | +--------+ |   | | |
 * | | | Thread | |   | | | | Thread | |   | | | | Thread | |   | | |
 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
 * | | +--------------+ | | +--------------+ | | +--------------+ | |
 * | | |  L2 Cache    | | | |  L2 Cache    | | | |  L2 Cache    | | |
 * | | +--------------+ | | +--------------+ | | +--------------+ | |
 * | +------------------+ +------------------+ +------------------+ |
 * | +------------------------------------------------------------+ |
 * | |                      Shared L3 Cache                       | |
 * | +------------------------------------------------------------+ |
 * | +------------------------------------------------------------+ |
 * | |                     Memory Controller                      | |
 * | +------------------------------------------------------------+ |
 * +----------------------------------------------------------------+
 *
 * A side effect of this current architecture is that what we care about from
 * a scheduling and topology perspective is simplified. In general we care
 * about understanding which logical CPUs are part of the same core and
 * socket.
 *
 * To determine the relationship between threads and cores, Intel initially
 * used the identifier in the advanced programmable interrupt controller
 * (APIC). They also added cpuid leaf 4 to give additional information about
 * the number of threads and CPUs in the processor. With the addition of
 * x2apic (which increased the number of addressable logical CPUs from 8 bits
 * to 32 bits), an additional cpuid topology leaf 0xB was added.
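 *
 * As a reference for the leaf 0xB mechanics (using the cpuid() sketch from
 * earlier in this comment; the field positions below come from the vendor
 * documentation and are an assumption of this sketch, not something this file
 * defines), the SMT and core shifts can be read by walking the sub-leaves
 * until the level type in bits 15:8 of %ecx reads back as invalid:
 *
 *     uint32_t eax, ebx, ecx, edx, level, type;
 *     uint32_t smt_shift = 0, core_shift = 0, x2apic_id = 0;
 *
 *     for (level = 0; ; level++) {
 *             cpuid(0xb, level, &eax, &ebx, &ecx, &edx);
 *             type = (ecx >> 8) & 0xff;
 *             if (type == 0)                  // invalid: no more levels
 *                     break;
 *             if (type == 1)                  // SMT level
 *                     smt_shift = eax & 0x1f;
 *             else if (type == 2)             // core level
 *                     core_shift = eax & 0x1f;
 *             x2apic_id = edx;                // full x2APIC ID of this CPU
 *     }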
 *
 * AMD Topology
 * ------------
 *
 * When discussing AMD topology, we want to break this into three distinct
 * generations of topology. There's the basic topology that has been used in
 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
 * with family 0x15 (Bulldozer), and there's the topology that was introduced
 * with family 0x17 (Zen). AMD also has some additional terminology that's
 * worth talking about.
 *
 * Until the introduction of family 0x17 (Zen), AMD did not implement
 * something that they considered SMT. Whether or not the AMD processors have
 * SMT influences many things including scheduling and reliability,
 * availability, and serviceability (RAS) features.
 *
 * NODE
 *
 *     AMD uses the term node to refer to a die that contains a number of
 *     cores and I/O resources. Depending on the processor family and model,
 *     more than one node can be present in the package. When there is more
 *     than one node this indicates a multi-chip module. Usually each node has
 *     its own access to memory and I/O devices. This is important and
 *     generally different from the corresponding Intel Nehalem-Skylake+
 *     processors. As a result, we track this relationship in the operating
 *     system.
 *
 *     In processors with an L3 cache, the L3 cache is generally shared across
 *     the entire node, though the way this is carved up varies from
 *     generation to generation.
 *
 * BULLDOZER
 *
 *     Starting with the Bulldozer family (0x15) and continuing until the
 *     introduction of the Zen microarchitecture, AMD introduced the idea of a
 *     compute unit. In a compute unit, two traditional cores share a number
 *     of hardware resources. Critically, they share the FPU, L1 instruction
 *     cache, and the L2 cache. Several compute units were then combined
 *     inside of a single node. Because the integer execution units, L1 data
 *     cache, and some other resources were not shared between the cores, AMD
 *     never considered this to be SMT.
 *
 * ZEN
 *
 *     The Zen family (0x17) uses a multi-chip module (MCM) design; the module
 *     is called Zeppelin. These modules are similar to the idea of nodes used
 *     previously. Each of these nodes has two DRAM channels which all of the
 *     cores in the node can access uniformly. These nodes are linked together
 *     in the package, creating a NUMA environment.
 *
 *     The Zeppelin die itself contains two different 'core complexes'. Each
 *     core complex consists of four cores which each have two threads, for a
 *     total of 8 logical CPUs per complex. Unlike other generations, where
 *     all the logical CPUs in a given node share the L3 cache, here each core
 *     complex has its own shared L3 cache.
 *
 *     A further thing that we need to consider is that in some
 *     configurations, particularly with the Threadripper line of processors,
 *     not every die actually has its memory controllers wired up to actual
 *     memory channels. This means that some cores have memory attached to
 *     them and others don't.
 *
 * To put Zen in perspective, consider the following images:
 *
 * +---------------------------------------------------+
 * | Core Complex                                      |
 * | +-------------------+ +-------------------+ +---+ |
 * | | Core       +----+ | | Core       +----+ | |   | |
 * | | +--------+ | L2 | | | +--------+ | L2 | | |   | |
 * | | | Thread | +----+ | | | Thread | +----+ | |   | |
 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
 * | |   | Thread | |L1| | |   | Thread | |L1| | | 3 | |
 * | |   +--------+ +--+ | |   +--------+ +--+ | |   | |
 * | +-------------------+ +-------------------+ | C | |
 * | +-------------------+ +-------------------+ | a | |
 * | | Core       +----+ | | Core       +----+ | | c | |
 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |   | |
 * | |   | Thread | |L1| | |   | Thread | |L1| | |   | |
 * | |   +--------+ +--+ | |   +--------+ +--+ | |   | |
 * | +-------------------+ +-------------------+ +---+ |
 * |                                                   |
 * +---------------------------------------------------+
 *
 * This first image represents a single Zen core complex that consists of four
 * cores.
 *
 *
 * +--------------------------------------------------------+
 * | Zeppelin Die                                           |
 * |  +--------------------------------------------------+  |
 * |  |        I/O Units (PCIe, SATA, USB, etc.)         |  |
 * |  +--------------------------------------------------+  |
 * |                           HH                           |
 * |          +-----------+    HH    +-----------+          |
 * |          |           |    HH    |           |          |
 * |          |    Core   |==========|    Core   |          |
 * |          |  Complex  |==========|  Complex  |          |
 * |          |           |    HH    |           |          |
 * |          +-----------+    HH    +-----------+          |
 * |                           HH                           |
 * |  +--------------------------------------------------+  |
 * |  |                Memory Controller                 |  |
 * |  +--------------------------------------------------+  |
 * |                                                        |
 * +--------------------------------------------------------+
 *
 * This image represents a single Zeppelin Die. Note how both core complexes
 * are connected to the same memory controller and I/O units. While each core
 * complex has its own L3 cache as seen in the first image, they both have
 * uniform access to memory.
 *
 *
 *                      PP                     PP
 *                      PP                     PP
 *           +----------PP---------------------PP-----+
 *           |          PP                     PP     |
 *           |  +-----------+          +-----------+  |
 *           |  |           |          |           |  |
 *    MMMMMMMMM| Zeppelin  |==========| Zeppelin  |MMMMMMMMM
 *    MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 *           |  |           |          |           |  |
 *           |  +-----------+ooo    ...+-----------+  |
 *           |      HH          ooo ...    HH         |
 *           |      HH            oo..     HH         |
 *           |      HH            ..oo     HH         |
 *           |      HH          ... ooo    HH         |
 *           |  +-----------+...    ooo+-----------+  |
 *           |  |           |          |           |  |
 *    MMMMMMMMM| Zeppelin  |==========| Zeppelin  |MMMMMMMMM
 *    MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 *           |  |           |          |           |  |
 *           |  +-----------+          +-----------+  |
 *           |          PP                     PP     |
 *           +----------PP---------------------PP-----+
 *                      PP                     PP
 *                      PP                     PP
 *
 * This image represents a single Zen package. In this example, it has four
 * Zeppelin dies, though some configurations only have a single one. In this
 * example, each die is directly connected to the next. Also, each die is
 * represented as being connected to memory by the 'M' character and connected
 * to PCIe devices and other I/O by the 'P' character. Because each Zeppelin
 * die is made up of two core complexes, we have multiple different NUMA
 * domains that we care about for these systems.
 *
 * CPUID LEAVES
 *
 * There are a few different CPUID leaves that we can use to try and
 * understand the actual state of the world. As part of the introduction of
 * family 0xf, AMD added CPUID leaf 0x80000008. This leaf tells us the number
 * of logical processors that are in the system. Because families before Zen
 * didn't have SMT, this was always the number of cores that were in the
 * system. However, it should always be thought of as the number of logical
 * threads to be consistent between generations. In addition, we also get the
 * size of the APIC ID that is used to represent the number of logical
 * processors. This is important for deriving topology information.
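 *
 * As an illustration (again with the cpuid() sketch from earlier; the bit
 * positions are taken from AMD's documentation and should be treated as an
 * assumption of this sketch), both of those values live in %ecx of leaf
 * 0x80000008:
 *
 *     uint32_t eax, ebx, ecx, edx, nthreads, apic_bits;
 *
 *     cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 *     if (eax >= 0x80000008) {
 *             cpuid(0x80000008, 0, &eax, &ebx, &ecx, &edx);
 *             nthreads = (ecx & 0xff) + 1;    // logical CPUs in the package
 *             apic_bits = (ecx >> 12) & 0xf;  // APIC ID bits used for them
 *     }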
 *
 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies
 * a bit between Bulldozer and later families, but it is quite useful in
 * determining the topology information. Because this information has changed
 * across family generations, it's worth calling out what these mean
 * explicitly. The registers have the following meanings:
 *
 *     %eax    The APIC ID. The entire register is defined to have a 32-bit
 *             APIC ID, even though on systems without x2apic support, it will
 *             be limited to 8 bits.
 *
 *     %ebx    On Bulldozer-era systems this contains information about the
 *             number of cores that are in a compute unit (cores that share
 *             resources). It also contains a per-package compute unit ID that
 *             identifies which compute unit the logical CPU is a part of.
 *
 *             On Zen-era systems this instead contains the number of threads
 *             per core and the ID of the core that the logical CPU is a part
 *             of. Note, this ID is unique only to the package, it is not
 *             globally unique across the entire system.
 *
 *     %ecx    This contains the number of nodes that exist in the package. It
 *             also contains an ID that identifies which node the logical CPU
 *             is a part of.
 *
 * Finally, we also use cpuid leaf 0x8000001D to determine information about
 * the cache layout to determine which logical CPUs are sharing which caches.
 *
 * illumos Topology
 * ----------------
 *
 * Based on the above, we synthesize the information into several different
 * variables that we store in the 'struct cpuid_info'. We'll go into the
 * details of what each member is supposed to represent and their uniqueness.
 * In general, there are two levels of uniqueness that we care about. We care
 * about an ID that is globally unique. That means that it will be unique
 * across all entities in the system. For example, the default logical CPU ID
 * is globally unique. On the other hand, there is some information that we
 * only care about being unique within the context of a single package /
 * socket. Here are the variables that we keep track of and their meaning.
 *
 * Several of the values that are asking for an identifier, with the exception
 * of cpi_apicid, are allowed to be synthetic.
 *
 * cpi_apicid
 *
 *     This is the value of the CPU's APIC ID. This should be the full 32-bit
 *     ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
 *     APIC ID. This value is globally unique between all logical CPUs across
 *     all packages. This is usually required by the APIC.
 *
 * cpi_chipid
 *
 *     This value indicates the ID of the package that the logical CPU is a
 *     part of. This value is allowed to be synthetic. It is usually derived
 *     by taking the CPU's APIC ID and determining how many bits are used to
 *     represent CPU cores in the package. All logical CPUs that are part of
 *     the same package must have the same value.
 *
 * cpi_coreid
 *
 *     This represents the ID of a CPU core. Two logical CPUs should only have
 *     the same cpi_coreid value if they are part of the same core. These
 *     values may be synthetic. On systems that support SMT, this value is
 *     usually derived from the APIC ID, otherwise it is often synthetic and
 *     just set to the value of the cpu_id in the cpu_t.
 *
 * cpi_pkgcoreid
 *
 *     This is similar to the cpi_coreid in that logical CPUs that are part of
 *     the same core should have the same ID. The main difference is that
 *     these values are only required to be unique to a given socket.
 *
 * cpi_clogid
 *
 *     This represents the logical ID of a logical CPU. This value should be
 *     unique within a given socket for each logical CPU. This is allowed to
 *     be synthetic, though it is usually based off of the CPU's APIC ID. The
 *     broader system expects that logical CPUs that are part of the same core
 *     have contiguous numbers. For example, if there were two threads per
 *     core, then the IDs of the two CPUs divided by two should be the same,
 *     and the first modulo two should be zero while the second should be one.
 *     For example, IDs 4 and 5 indicate two logical CPUs that are part of the
 *     same core. But IDs 5 and 6 represent two logical CPUs that are part of
 *     different cores.
 *
 *     While it is common for the cpi_coreid and the cpi_clogid to be derived
 *     from the same source, strictly speaking, they don't have to be and the
 *     two values should be considered logically independent. One should not
 *     try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
 *     some kind of relationship. While this is tempting, we've seen cases on
 *     AMD family 0xf where the system's cpu id is not related to its APIC ID.
 *
 * cpi_ncpu_per_chip
 *
 *     This value indicates the total number of logical CPUs that exist in the
 *     physical package. Critically, this is not the number of logical CPUs
 *     that exist for just the single core.
 *
 *     This value should be the same for all logical CPUs in the same package.
 *
 * cpi_ncore_per_chip
 *
 *     This value indicates the total number of physical CPU cores that exist
 *     in the package. The system compares this value with cpi_ncpu_per_chip
 *     to determine if simultaneous multi-threading (SMT) is enabled. When
 *     cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
 *     the X86FSET_HTT feature is not set. If this value is greater than one,
 *     then we consider the processor to have the feature X86FSET_CMP, to
 *     indicate that there is support for more than one core.
 *
 *     This value should be the same for all logical CPUs in the same package.
 *
 * cpi_procnodes_per_pkg
 *
 *     This value indicates the number of 'nodes' that exist in the package.
 *     When processors are actually a multi-chip module, this represents the
 *     number of such modules that exist in the package. Currently, on Intel
 *     based systems this member is always set to 1.
 *
 *     This value should be the same for all logical CPUs in the same package.
 *
 * cpi_procnodeid
 *
 *     This value indicates the ID of the node that the logical CPU is a part
 *     of. All logical CPUs that are in the same node must have the same value
 *     here. This value must be unique across all of the packages in the
 *     system. On Intel based systems, this is currently set to the value in
 *     cpi_chipid because there is only one node.
 *
 * cpi_cores_per_compunit
 *
 *     This value indicates the number of cores that are part of a compute
 *     unit. See the AMD topology section for this. This member only has real
 *     meaning currently for AMD Bulldozer family processors. For all other
 *     processors, this should currently be set to 1.
 *
 * cpi_compunitid
 *
 *     This indicates the compute unit that the logical CPU belongs to. For
 *     processors without AMD Bulldozer-style compute units this should be set
 *     to the value of cpi_coreid.
 *
 * cpi_ncpu_shr_last_cache
 *
 *     This indicates the number of logical CPUs that are sharing the same
 *     last level cache. This value should be the same for all CPUs that are
 *     sharing that cache. The last cache refers to the cache that is closest
 *     to memory and furthest away from the CPU.
 *
 * cpi_last_lvl_cacheid
 *
 *     This indicates the ID of the last cache that the logical CPU uses. This
 *     cache is often shared between multiple logical CPUs and is the cache
 *     that is closest to memory and furthest away from the CPU.
 *     This value should be the same for a group of logical CPUs only if they
 *     actually share the same last level cache. IDs should not overlap
 *     between packages.
 *
 * cpi_ncore_bits
 *
 *     This indicates the number of bits that are required to represent all of
 *     the cores in the system. As cores are derived based on their APIC IDs,
 *     we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
 *     this value to be larger than the actual number of IDs that are present
 *     in the system. This is used to size tables by the CMI framework. It is
 *     only filled in for Intel and AMD CPUs.
 *
 * cpi_nthread_bits
 *
 *     This indicates the number of bits required to represent all of the IDs
 *     that cover the logical CPUs that exist on a given core. It's OK for
 *     this value to be larger than the actual number of IDs that are present
 *     in the system. This is used to size tables by the CMI framework. It is
 *     only filled in for Intel and AMD CPUs.
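 *
 * To tie several of these members together, here is a small illustrative
 * sketch with made-up values for a package with four cores and two threads
 * per core (this is not kernel code; the real derivations appear later in
 * this file and differ per vendor):
 *
 *     uint32_t ncpu_per_chip = 8;     // would be cpi_ncpu_per_chip
 *     uint32_t ncore_per_chip = 4;    // would be cpi_ncore_per_chip
 *     uint32_t threads_per_core = ncpu_per_chip / ncore_per_chip;
 *
 *     // SMT is present when there are more logical CPUs than cores.
 *     int has_smt = (ncpu_per_chip > ncore_per_chip);
 *
 *     // Package-local logical IDs (cpi_clogid) 4 and 5 collapse to the
 *     // same core once the thread bits are stripped; 5 and 6 do not.
 *     int same_core_4_5 = (4 / threads_per_core) == (5 / threads_per_core);
 *     int same_core_5_6 = (5 / threads_per_core) == (6 / threads_per_core);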
 *
 * -----------
 * Hypervisors
 * -----------
 *
 * If trying to manage the differences between vendors wasn't bad enough, it
 * can get worse thanks to our friend hardware virtualization. Hypervisors are
 * given the ability to interpose on all cpuid instructions and change them to
 * suit their purposes. In general, this is necessary as the hypervisor wants
 * to be able to present a more uniform set of features or not necessarily
 * give the guest operating system kernel knowledge of all features so it can
 * be more easily migrated between systems.
 *
 * When it comes to trying to determine topology information, this can be a
 * double edged sword. When a hypervisor doesn't actually implement a cpuid
 * leaf, it'll often return all zeros. Because of that, you'll often see
 * various checks scattered about, verifying that fields are non-zero before
 * we assume we can use them.
 *
 * When it comes to topology information, the hypervisor is often incentivized
 * to lie to you about topology. This is because it doesn't always actually
 * guarantee that topology at all. The topology path we take in the system
 * depends on how the CPU advertises itself. If it advertises itself as an
 * Intel or AMD CPU, then we basically do our normal path. However, when they
 * don't use an actual vendor, then that usually turns into multiple one-core
 * CPUs that we enumerate that are often on different sockets. The actual
 * behavior depends greatly on what the hypervisor actually exposes to us.
 *
 * --------------------
 * Exposing Information
 * --------------------
 *
 * We expose CPUID information in three different forms in the system.
 *
 * The first is through the x86_featureset variable. This is used in
 * conjunction with the is_x86_feature() function. This is queried by
 * x86-specific functions to determine which features are or aren't present
 * in the system and to make decisions based upon them. For example, users of
 * this include everything from parts of the system dedicated to reliability,
 * availability, and serviceability (RAS), to making decisions about how to
 * handle security mitigations, to various x86-specific drivers. General
 * purpose or architecture independent drivers should never be calling this
 * function.
 *
 * The second means is through the auxiliary vector. The auxiliary vector is a
 * series of tagged data that the kernel passes down to a user program when it
 * begins executing. This information is used to indicate to programs what
 * instruction set extensions are present. For example, information about the
 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
 * since user programs cannot make use of it. However, things like the AVX
 * instruction sets are. Programs use this information to make run-time
 * decisions about what features they should use. As an example, the run-time
 * link-editor (rtld) can relocate different functions depending on the
 * hardware support available.
 *
 * The final form is through a series of accessor functions that all have the
 * form cpuid_get*. This is used by a number of different subsystems in the
 * kernel to determine more detailed information about what we're running on,
 * topology information, etc. Some of these subsystems include processor
 * groups (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c),
 * ACPI, microcode, and performance monitoring. These functions all ASSERT
 * that the CPU they're being called on has reached a certain cpuid pass. If
 * the passes are rearranged, then this needs to be adjusted.
 *
 * -----------------------------------------------
 * Speculative Execution CPU Side Channel Security
 * -----------------------------------------------
 *
 * With the advent of the Spectre and Meltdown attacks, which exploit
 * speculative execution in the CPU to create side channels, there have been a
 * number of different attacks and corresponding issues that the operating
 * system needs to mitigate against. The following list covers some of the
 * common, but not exhaustive, issues that we know about and for which we have
 * done some work, or still need to do more work, to mitigate:
 *
 * - Spectre v1
 * - swapgs (Spectre v1 variant)
 * - Spectre v2
 * - Meltdown (Spectre v3)
 * - Rogue Register Read (Spectre v3a)
 * - Speculative Store Bypass (Spectre v4)
 * - ret2spec, SpectreRSB
 * - L1 Terminal Fault (L1TF)
 * - Microarchitectural Data Sampling (MDS)
 *
 * Each of these requires different sets of mitigations and has different
 * attack surfaces. For the most part, this discussion is about protecting the
 * kernel from non-kernel executing environments such as user processes and
 * hardware virtual machines. Unfortunately, there are a number of user vs.
 * user scenarios that exist with these. The rest of this section will
 * describe the overall approach that the system has taken to address these as
 * well as their shortcomings. Unfortunately, not all of the above have been
 * handled today.
 *
 * SPECTRE v2, ret2spec, SpectreRSB
 *
 * The second variant of the spectre attack focuses on performing branch
 * target injection. This generally impacts indirect call instructions in the
 * system. There are three different ways to mitigate this issue that are
 * commonly described today:
 *
 * 1. Using Indirect Branch Restricted Speculation (IBRS).
 * 2. Using Retpolines and RSB Stuffing
 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
 *
 * IBRS uses a feature added to microcode to restrict speculation, among other
 * things.
 * This form of mitigation has not been used as it has been generally seen as
 * too expensive and requires reactivation upon various transitions in the
 * system.
 *
 * As a less impactful alternative to IBRS, retpolines were developed by
 * Google. These basically require one to replace indirect calls with a
 * specific trampoline that will cause speculation to fail and break the
 * attack. Retpolines require compiler support. We always build with
 * retpolines in the external thunk mode. This means that a traditional
 * indirect call is replaced with a call to one of the
 * __x86_indirect_thunk_<reg> functions. A side effect of this is that all
 * indirect function calls are performed through a register.
 *
 * We have to use a common external location of the thunk and not inline it
 * into the callsite so that we can have a single place to patch these
 * functions. As it turns out, we actually have three different forms of
 * retpolines that exist in the system:
 *
 * 1. A full retpoline
 * 2. An AMD-specific optimized retpoline
 * 3. A no-op version
 *
 * The first one is used in the general case. The second one is used if we can
 * determine that we're on an AMD system and we can successfully toggle the
 * lfence serializing MSR that exists on the platform. Basically with this
 * present, an lfence is sufficient and we don't need to do anywhere near as
 * complicated a dance to successfully use retpolines.
 *
 * The third form described above is the most curious. It turns out that the
 * way that retpolines are implemented is that they rely on how speculation is
 * performed on a 'ret' instruction. Intel has continued to optimize this
 * process (which is partly why we need to have return stack buffer stuffing,
 * but more on that in a bit) and in processors starting with Cascade Lake on
 * the server side, it's dangerous to rely on retpolines. Instead, a new
 * mechanism has been introduced called Enhanced IBRS (EIBRS).
 *
 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
 * physical core. However, if this is the case, we don't want to use
 * retpolines any more. Therefore if EIBRS is present, we end up turning each
 * retpoline function (called a thunk) into a jmp instruction. This means that
 * we're still paying the cost of an extra jump to the external thunk, but it
 * gives us flexibility and the ability to have a single kernel image that
 * works across a wide variety of systems and hardware features.
 *
 * Unfortunately, this alone is insufficient. First, Skylake systems have
 * additional speculation for the Return Stack Buffer (RSB) which is used to
 * return from call instructions which retpolines take advantage of. However,
 * this problem is not just limited to Skylake and is actually more
 * pernicious. The SpectreRSB paper introduces several more problems that can
 * arise when dealing with this. The RSB can be poisoned just like the
 * indirect branch predictor. This means that one needs to clear the RSB when
 * transitioning between two different privilege domains. Some examples
 * include:
 *
 * - Switching between two different user processes
 * - Going between user land and the kernel
 * - Returning to the kernel from a hardware virtual machine
 *
 * Mitigating this involves combining a couple of different things.
 * The first is SMEP (supervisor mode execution protection) which was
 * introduced in Ivy Bridge. When an RSB entry refers to a user address and
 * we're executing in the kernel, speculation through it will be stopped when
 * SMEP is enabled. This protects against a number of the different cases that
 * we would normally be worried about such as when we enter the kernel from
 * user land.
 *
 * To protect against additional manipulation of the RSB from other contexts,
 * such as a non-root VMX context attacking the kernel, we first look to
 * enhanced IBRS. When EIBRS is present and enabled, then there is nothing
 * else that we need to do to protect the kernel at this time.
 *
 * On CPUs without EIBRS we need to manually overwrite the contents of the
 * return stack buffer. We do this through the x86_rsb_stuff() function.
 * Currently this is employed on context switch. The x86_rsb_stuff() function
 * is disabled when enhanced IBRS is present because Intel claims on such
 * systems it will be ineffective. Stuffing the RSB in context switch helps
 * prevent user to user attacks via the RSB.
 *
 * If SMEP is not present, then we would have to stuff the RSB every time we
 * transitioned from user mode to the kernel, which isn't very practical right
 * now.
 *
 * To fully protect against user to user and vmx to vmx attacks from these
 * classes of issues, we would also need to allow them to opt into performing
 * an Indirect Branch Prediction Barrier (IBPB) on switch. This is not
 * currently wired up.
 *
 * By default, the system will enable RSB stuffing and the required variant of
 * retpolines and store that information in the x86_spectrev2_mitigation
 * value. This will be evaluated after a microcode update as well, though it
 * is expected that microcode updates will not take away features. This may
 * mean that a late loaded microcode may not end up in the optimal
 * configuration (though this should be rare).
 *
 * Currently we do not build kmdb with retpolines or perform any additional
 * side channel security mitigations for it. One complication with kmdb is
 * that it requires its own retpoline thunks and it would need to adjust
 * itself based on what the kernel does. The threat model of kmdb is more
 * limited and therefore it may make more sense to investigate using
 * prediction barriers as the whole system is only executing a single
 * instruction at a time while in kmdb.
 *
 * SPECTRE v1, v4
 *
 * The v1 and v4 variants of spectre are not currently mitigated in the system
 * and require other classes of changes to occur in the code.
 *
 * SPECTRE v1 (SWAPGS VARIANT)
 *
 * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
 * can generally affect any branch-dependent code. The swapgs issue is one
 * variant of this. If we are coming in from userspace, we can have code like
 * this:
 *
 *     cmpw    $KCS_SEL, REGOFF_CS(%rsp)
 *     je      1f
 *     movq    $0, REGOFF_SAVFP(%rsp)
 *     swapgs
 *     1:
 *     movq    %gs:CPU_THREAD, %rax
 *
 * If an attacker can cause a mis-speculation of the branch here, we could
 * skip the needed swapgs, and use the /user/ %gsbase as the base of the
 * %gs-based load. If subsequent code can act as the usual Spectre cache
 * gadget, this would potentially allow KPTI bypass.
 * To fix this, we need an lfence prior to any use of the %gs override.
 *
 * The other case is also an issue: if we're coming into a trap from kernel
 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
 * using it. AMD systems are not vulnerable to this version, as a swapgs is
 * serializing with respect to subsequent uses. But as AMD /does/ need the
 * other case, and the fix is the same in both cases (an lfence at the branch
 * target 1: in this example), we'll just do it unconditionally.
 *
 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
 * harder for user-space to actually set a useful %gsbase value: although it's
 * not clear, it might still be feasible via lwp_setprivate(), so we mitigate
 * anyway.
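 *
 * For illustration only, the mitigated form of the snippet shown above simply
 * places the serializing lfence at the join point, before the first
 * %gs-relative access:
 *
 *     cmpw    $KCS_SEL, REGOFF_CS(%rsp)
 *     je      1f
 *     movq    $0, REGOFF_SAVFP(%rsp)
 *     swapgs
 *     1:
 *     lfence
 *     movq    %gs:CPU_THREAD, %rax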
While 1118 * other systems try to put extra metadata in the address and reserved bits, 1119 * which led to this being problematic in those cases, we do not. 1120 * 1121 * For hardware virtual machines things are more complicated. Because they can 1122 * construct their own page tables, it isn't hard for them to perform this 1123 * attack against any physical address. The one wrinkle is that this physical 1124 * address must be in the L1 data cache. Thus Intel added an MSR that we can use 1125 * to flush the L1 data cache. We wrap this up in the function 1126 * spec_uarch_flush(). This function is also used in the mitigation of 1127 * microarchitectural data sampling (MDS) discussed later on. Kernel-based 1128 * hypervisors such as KVM or bhyve are responsible for performing this before 1129 * entering the guest. 1130 * 1131 * Because this attack takes place in the L1 cache, there's another wrinkle 1132 * here. The L1 cache is shared between all logical CPUs in a core in most Intel 1133 * designs. This means that when a thread enters a hardware virtualized context 1134 * and flushes the L1 data cache, the other thread on the processor may then go 1135 * ahead and put new data in it that can be potentially attacked. While one 1136 * solution is to disable SMT on the system, another option that is available is 1137 * to use a feature for hardware virtualization called 'SMT exclusion'. This 1138 * goes through and makes sure that if an HVM is being scheduled on one thread, 1139 * then the thing on the other thread is from the same hardware virtual machine. 1140 * If an interrupt comes in or the guest exits to the broader system, then the 1141 * other SMT thread will be kicked out. 1142 * 1143 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the 1144 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not 1145 * perform L1TF related mitigations. 1146 * 1147 * MICROARCHITECTURAL DATA SAMPLING 1148 * 1149 * Microarchitectural data sampling (MDS) is a combination of four discrete 1150 * but similar vulnerabilities affecting various parts of the CPU's 1151 * microarchitectural implementation around load, store, and fill buffers. 1152 * Specifically it is made up of the following subcomponents: 1153 * 1154 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS) 1155 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS) 1156 * 3. Microarchitectural Load Port Data Sampling (MLPDS) 1157 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM) 1158 * 1159 * To begin addressing these, Intel has introduced another feature in microcode 1160 * called MD_CLEAR. This changes the verw instruction to operate in a different 1161 * way. This allows us to execute the verw instruction in a particular way to 1162 * flush the state of the affected parts. The L1TF L1D flush mechanism is also 1163 * updated when this microcode is present to flush this state. 1164 * 1165 * Primarily we need to flush this state whenever we transition from the kernel 1166 * to a less privileged context such as user mode or an HVM guest. MSBDS is a 1167 * little bit different. Here the structures are statically sized when a logical 1168 * CPU is in use and resized when it goes to sleep. Therefore, we also need to 1169 * flush the microarchitectural state before the CPU goes idle by calling hlt, 1170 * mwait, or another ACPI method. To perform these flushes, we call 1171 * x86_md_clear() at all of these transition points.
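 *
 * As a rough sketch of what the verw-based flush amounts to (this is an
 * illustration of the technique, not the actual x86_md_clear()
 * implementation, which is a patchable assembly routine; KDS_SEL is used
 * here only as an example of a writable data segment selector), the
 * selector is spilled to memory and verw is issued against it:
 *
 *	subq	$8, %rsp
 *	movw	$KDS_SEL, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp
 *
 * On processors without the MD_CLEAR microcode the same sequence simply
 * performs the architectural segment check and flushes nothing extra.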
1172 * 1173 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF, 1174 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If 1175 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes 1176 * a no-op. 1177 * 1178 * Unfortunately, with this issue hyperthreading rears its ugly head. In 1179 * particular, everything we've discussed above is only valid for a single 1180 * thread executing on a core. In the case where you have hyper-threading 1181 * present, this attack can be performed between threads. The theoretical fix 1182 * for this is to ensure that both threads are always in the same security 1183 * domain. This means that they are executing in the same ring and mutually 1184 * trust each other. Practically speaking, this would mean that a system call 1185 * would have to issue an inter-processor interrupt (IPI) to the other thread. 1186 * Rather than implement this, we recommend that one disables hyper-threading 1187 * through the use of psradm -aS. 1188 * 1189 * TSX ASYNCHRONOUS ABORT 1190 * 1191 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that 1192 * behaves like MDS, but leverages Intel's transactional instructions as another 1193 * vector. Effectively, when a transaction hits one of these cases (unmapped 1194 * page, various cache snoop activity, etc.) then the same data can be exposed 1195 * as in the case of MDS. This means that you can attack your twin. 1196 * 1197 * Intel has described that there are two different ways that we can mitigate 1198 * this problem on affected processors: 1199 * 1200 * 1) We can use the same techniques used to deal with MDS. Flushing the 1201 * microarchitectural buffers and disabling hyperthreading will mitigate 1202 * this in the same way. 1203 * 1204 * 2) Using microcode to disable TSX. 1205 * 1206 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in 1207 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX. 1208 * That's OK as we're already doing all such mitigations. On the other hand, 1209 * processors with MDS_NO are all supposed to receive microcode updates that 1210 * enumerate support for disabling TSX. In general, we'd rather use this method 1211 * when available as it doesn't require disabling hyperthreading to be 1212 * effective. Currently we basically are relying on microcode for processors 1213 * that enumerate MDS_NO. 1214 * 1215 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES. 1216 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two 1217 * different powers. The first allows us to cause all transactions to 1218 * immediately abort. The second gives us a means of disabling TSX completely, 1219 * which includes removing it from cpuid. If we have support for this in 1220 * microcode during the first cpuid pass, then we'll disable TSX completely such 1221 * that user land never has a chance to observe the bit. However, if we are late 1222 * loading the microcode, then we must use the functionality to cause 1223 * transactions to automatically abort. This is necessary for user land's sake. 1224 * Once a program sees a cpuid bit, it must not be taken away. 1225 * 1226 * We track whether or not we should do this based on what cpuid pass we're in. 1227 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass 1228 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this 1229 * should happen twice. 
Once in the normal cpuid_pass1() code and then a second 1230 * time after we do the initial microcode update. 1231 * 1232 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES 1233 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an 1234 * unfortunate feature in a number of ways, and taking the opportunity to 1235 * finally be able to turn it off is likely to be of benefit in the future. 1236 * 1237 * SUMMARY 1238 * 1239 * The following table attempts to summarize the mitigations for various issues 1240 * and what's done in various places: 1241 * 1242 * - Spectre v1: Not currently mitigated 1243 * - swapgs: lfences after swapgs paths 1244 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support 1245 * - Meltdown: Kernel Page Table Isolation 1246 * - Spectre v3a: Updated CPU microcode 1247 * - Spectre v4: Not currently mitigated 1248 * - SpectreRSB: SMEP and RSB Stuffing 1249 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode 1250 * - MDS: x86_md_clear, requires microcode, disabling SMT 1251 * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX 1252 * 1253 * The following table indicates the x86 feature set bits that indicate that a 1254 * given problem has been solved or a notable feature is present: 1255 * 1256 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS 1257 * - MDS_NO: All forms of MDS 1258 * - TAA_NO: TAA 1259 */ 1260 1261 #include <sys/types.h> 1262 #include <sys/archsystm.h> 1263 #include <sys/x86_archext.h> 1264 #include <sys/kmem.h> 1265 #include <sys/systm.h> 1266 #include <sys/cmn_err.h> 1267 #include <sys/sunddi.h> 1268 #include <sys/sunndi.h> 1269 #include <sys/cpuvar.h> 1270 #include <sys/processor.h> 1271 #include <sys/sysmacros.h> 1272 #include <sys/pg.h> 1273 #include <sys/fp.h> 1274 #include <sys/controlregs.h> 1275 #include <sys/bitmap.h> 1276 #include <sys/auxv_386.h> 1277 #include <sys/memnode.h> 1278 #include <sys/pci_cfgspace.h> 1279 #include <sys/comm_page.h> 1280 #include <sys/mach_mmu.h> 1281 #include <sys/ucode.h> 1282 #include <sys/tsc.h> 1283 #include <sys/kobj.h> 1284 #include <sys/asm_misc.h> 1285 1286 #ifdef __xpv 1287 #include <sys/hypervisor.h> 1288 #else 1289 #include <sys/ontrap.h> 1290 #endif 1291 1292 uint_t x86_vendor = X86_VENDOR_IntelClone; 1293 uint_t x86_type = X86_TYPE_OTHER; 1294 uint_t x86_clflush_size = 0; 1295 1296 #if defined(__xpv) 1297 int x86_use_pcid = 0; 1298 int x86_use_invpcid = 0; 1299 #else 1300 int x86_use_pcid = -1; 1301 int x86_use_invpcid = -1; 1302 #endif 1303 1304 typedef enum { 1305 X86_SPECTREV2_RETPOLINE, 1306 X86_SPECTREV2_RETPOLINE_AMD, 1307 X86_SPECTREV2_ENHANCED_IBRS, 1308 X86_SPECTREV2_DISABLED 1309 } x86_spectrev2_mitigation_t; 1310 1311 uint_t x86_disable_spectrev2 = 0; 1312 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation = 1313 X86_SPECTREV2_RETPOLINE; 1314 1315 /* 1316 * The mitigation status for TAA: 1317 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels 1318 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa 1319 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA 1320 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort 1321 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID 1322 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable 1323 */ 1324 typedef enum { 1325 X86_TAA_NOTHING, 1326 X86_TAA_DISABLED, 1327 X86_TAA_MD_CLEAR, 1328 X86_TAA_TSX_FORCE_ABORT, 1329 X86_TAA_TSX_DISABLE, 1330 X86_TAA_HW_MITIGATED 1331 } 
x86_taa_mitigation_t; 1332 1333 uint_t x86_disable_taa = 0; 1334 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING; 1335 1336 uint_t pentiumpro_bug4046376; 1337 1338 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)]; 1339 1340 static char *x86_feature_names[NUM_X86_FEATURES] = { 1341 "lgpg", 1342 "tsc", 1343 "msr", 1344 "mtrr", 1345 "pge", 1346 "de", 1347 "cmov", 1348 "mmx", 1349 "mca", 1350 "pae", 1351 "cv8", 1352 "pat", 1353 "sep", 1354 "sse", 1355 "sse2", 1356 "htt", 1357 "asysc", 1358 "nx", 1359 "sse3", 1360 "cx16", 1361 "cmp", 1362 "tscp", 1363 "mwait", 1364 "sse4a", 1365 "cpuid", 1366 "ssse3", 1367 "sse4_1", 1368 "sse4_2", 1369 "1gpg", 1370 "clfsh", 1371 "64", 1372 "aes", 1373 "pclmulqdq", 1374 "xsave", 1375 "avx", 1376 "vmx", 1377 "svm", 1378 "topoext", 1379 "f16c", 1380 "rdrand", 1381 "x2apic", 1382 "avx2", 1383 "bmi1", 1384 "bmi2", 1385 "fma", 1386 "smep", 1387 "smap", 1388 "adx", 1389 "rdseed", 1390 "mpx", 1391 "avx512f", 1392 "avx512dq", 1393 "avx512pf", 1394 "avx512er", 1395 "avx512cd", 1396 "avx512bw", 1397 "avx512vl", 1398 "avx512fma", 1399 "avx512vbmi", 1400 "avx512_vpopcntdq", 1401 "avx512_4vnniw", 1402 "avx512_4fmaps", 1403 "xsaveopt", 1404 "xsavec", 1405 "xsaves", 1406 "sha", 1407 "umip", 1408 "pku", 1409 "ospke", 1410 "pcid", 1411 "invpcid", 1412 "ibrs", 1413 "ibpb", 1414 "stibp", 1415 "ssbd", 1416 "ssbd_virt", 1417 "rdcl_no", 1418 "ibrs_all", 1419 "rsba", 1420 "ssb_no", 1421 "stibp_all", 1422 "flush_cmd", 1423 "l1d_vmentry_no", 1424 "fsgsbase", 1425 "clflushopt", 1426 "clwb", 1427 "monitorx", 1428 "clzero", 1429 "xop", 1430 "fma4", 1431 "tbm", 1432 "avx512_vnni", 1433 "amd_pcec", 1434 "mb_clear", 1435 "mds_no", 1436 "core_thermal", 1437 "pkg_thermal", 1438 "tsx_ctrl", 1439 "taa_no" 1440 }; 1441 1442 boolean_t 1443 is_x86_feature(void *featureset, uint_t feature) 1444 { 1445 ASSERT(feature < NUM_X86_FEATURES); 1446 return (BT_TEST((ulong_t *)featureset, feature)); 1447 } 1448 1449 void 1450 add_x86_feature(void *featureset, uint_t feature) 1451 { 1452 ASSERT(feature < NUM_X86_FEATURES); 1453 BT_SET((ulong_t *)featureset, feature); 1454 } 1455 1456 void 1457 remove_x86_feature(void *featureset, uint_t feature) 1458 { 1459 ASSERT(feature < NUM_X86_FEATURES); 1460 BT_CLEAR((ulong_t *)featureset, feature); 1461 } 1462 1463 boolean_t 1464 compare_x86_featureset(void *setA, void *setB) 1465 { 1466 /* 1467 * We assume that the unused bits of the bitmap are always zero. 1468 */ 1469 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) { 1470 return (B_TRUE); 1471 } else { 1472 return (B_FALSE); 1473 } 1474 } 1475 1476 void 1477 print_x86_featureset(void *featureset) 1478 { 1479 uint_t i; 1480 1481 for (i = 0; i < NUM_X86_FEATURES; i++) { 1482 if (is_x86_feature(featureset, i)) { 1483 cmn_err(CE_CONT, "?x86_feature: %s\n", 1484 x86_feature_names[i]); 1485 } 1486 } 1487 } 1488 1489 /* Note: This is the maximum size for the CPU, not the size of the structure. */ 1490 static size_t xsave_state_size = 0; 1491 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE); 1492 boolean_t xsave_force_disable = B_FALSE; 1493 extern int disable_smap; 1494 1495 /* 1496 * This is set to platform type we are running on. 1497 */ 1498 static int platform_type = -1; 1499 1500 #if !defined(__xpv) 1501 /* 1502 * Variable to patch if hypervisor platform detection needs to be 1503 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0). 1504 */ 1505 int enable_platform_detection = 1; 1506 #endif 1507 1508 /* 1509 * monitor/mwait info. 
1510 * 1511 * size_actual and buf_actual are the real address and size allocated to get 1512 * proper mwait_buf alignment. buf_actual and size_actual should be passed 1513 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use 1514 * processor cache-line alignment, but this is not guaranteed in the future. 1515 */ 1516 struct mwait_info { 1517 size_t mon_min; /* min size to avoid missed wakeups */ 1518 size_t mon_max; /* size to avoid false wakeups */ 1519 size_t size_actual; /* size actually allocated */ 1520 void *buf_actual; /* memory actually allocated */ 1521 uint32_t support; /* processor support of monitor/mwait */ 1522 }; 1523 1524 /* 1525 * xsave/xrestor info. 1526 * 1527 * This structure contains HW feature bits and the size of the xsave save area. 1528 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure 1529 * (xsave_state) to describe the xsave layout. However, at runtime the 1530 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The 1531 * xsave_state structure simply represents the legacy layout of the beginning 1532 * of the xsave area. 1533 */ 1534 struct xsave_info { 1535 uint32_t xsav_hw_features_low; /* Supported HW features */ 1536 uint32_t xsav_hw_features_high; /* Supported HW features */ 1537 size_t xsav_max_size; /* max size save area for HW features */ 1538 size_t ymm_size; /* AVX: size of ymm save area */ 1539 size_t ymm_offset; /* AVX: offset for ymm save area */ 1540 size_t bndregs_size; /* MPX: size of bndregs save area */ 1541 size_t bndregs_offset; /* MPX: offset for bndregs save area */ 1542 size_t bndcsr_size; /* MPX: size of bndcsr save area */ 1543 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */ 1544 size_t opmask_size; /* AVX512: size of opmask save */ 1545 size_t opmask_offset; /* AVX512: offset for opmask save */ 1546 size_t zmmlo_size; /* AVX512: size of zmm 256 save */ 1547 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */ 1548 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */ 1549 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */ 1550 }; 1551 1552 1553 /* 1554 * These constants determine how many of the elements of the 1555 * cpuid we cache in the cpuid_info data structure; the 1556 * remaining elements are accessible via the cpuid instruction. 1557 */ 1558 1559 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */ 1560 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */ 1561 1562 /* 1563 * See the big theory statement for a more detailed explanation of what some of 1564 * these members mean.
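 *
 * As a hypothetical example of how the identification members get filled
 * in: an Intel part reporting base family 0x6, extended model 0x5 and base
 * model 0x5 in leaf 1 would end up with cpi_family == 0x6 and
 * cpi_model == 0x55 (the extended model is folded into the high nibble),
 * while the stepping is stored unchanged in cpi_step.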
1565 */ 1566 struct cpuid_info { 1567 uint_t cpi_pass; /* last pass completed */ 1568 /* 1569 * standard function information 1570 */ 1571 uint_t cpi_maxeax; /* fn 0: %eax */ 1572 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */ 1573 uint_t cpi_vendor; /* enum of cpi_vendorstr */ 1574 1575 uint_t cpi_family; /* fn 1: extended family */ 1576 uint_t cpi_model; /* fn 1: extended model */ 1577 uint_t cpi_step; /* fn 1: stepping */ 1578 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */ 1579 /* AMD: package/socket # */ 1580 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */ 1581 int cpi_clogid; /* fn 1: %ebx: thread # */ 1582 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */ 1583 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */ 1584 uint_t cpi_ncache; /* fn 2: number of elements */ 1585 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */ 1586 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */ 1587 uint_t cpi_cache_leaf_size; /* Number of cache elements */ 1588 /* Intel fn: 4, AMD fn: 8000001d */ 1589 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */ 1590 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */ 1591 /* 1592 * extended function information 1593 */ 1594 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */ 1595 char cpi_brandstr[49]; /* fn 0x8000000[234] */ 1596 uint8_t cpi_pabits; /* fn 0x80000008: %eax */ 1597 uint8_t cpi_vabits; /* fn 0x80000008: %eax */ 1598 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */ 1599 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */ 1600 1601 id_t cpi_coreid; /* same coreid => strands share core */ 1602 int cpi_pkgcoreid; /* core number within single package */ 1603 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */ 1604 /* Intel: fn 4: %eax[31-26] */ 1605 1606 /* 1607 * These values represent the number of bits that are required to store 1608 * information about the number of cores and threads. 1609 */ 1610 uint_t cpi_ncore_bits; 1611 uint_t cpi_nthread_bits; 1612 /* 1613 * supported feature information 1614 */ 1615 uint32_t cpi_support[6]; 1616 #define STD_EDX_FEATURES 0 1617 #define AMD_EDX_FEATURES 1 1618 #define TM_EDX_FEATURES 2 1619 #define STD_ECX_FEATURES 3 1620 #define AMD_ECX_FEATURES 4 1621 #define STD_EBX_FEATURES 5 1622 /* 1623 * Synthesized information, where known.
1624 */ 1625 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */ 1626 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */ 1627 uint32_t cpi_socket; /* Chip package/socket type */ 1628 1629 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */ 1630 uint32_t cpi_apicid; 1631 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */ 1632 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */ 1633 /* Intel: 1 */ 1634 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */ 1635 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */ 1636 1637 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */ 1638 }; 1639 1640 1641 static struct cpuid_info cpuid_info0; 1642 1643 /* 1644 * These bit fields are defined by the Intel Application Note AP-485 1645 * "Intel Processor Identification and the CPUID Instruction" 1646 */ 1647 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20) 1648 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16) 1649 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12) 1650 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8) 1651 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0) 1652 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4) 1653 1654 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx) 1655 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx) 1656 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx) 1657 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx) 1658 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx) 1659 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx) 1660 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx) 1661 1662 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0) 1663 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7) 1664 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16) 1665 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24) 1666 1667 #define CPI_MAXEAX_MAX 0x100 /* sanity control */ 1668 #define CPI_XMAXEAX_MAX 0x80000100 1669 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */ 1670 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */ 1671 1672 /* 1673 * Function 4 (Deterministic Cache Parameters) macros 1674 * Defined by Intel Application Note AP-485 1675 */ 1676 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26) 1677 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14) 1678 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9) 1679 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8) 1680 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5) 1681 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0) 1682 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8) 1683 1684 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22) 1685 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12) 1686 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0) 1687 1688 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0) 1689 1690 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0) 1691 1692 1693 /* 1694 * A couple of shorthand macros to identify "later" P6-family chips 1695 * like the Pentium M and Core. 
First, the "older" P6-based stuff 1696 * (loosely defined as "pre-Pentium-4"): 1697 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon 1698 */ 1699 #define IS_LEGACY_P6(cpi) ( \ 1700 cpi->cpi_family == 6 && \ 1701 (cpi->cpi_model == 1 || \ 1702 cpi->cpi_model == 3 || \ 1703 cpi->cpi_model == 5 || \ 1704 cpi->cpi_model == 6 || \ 1705 cpi->cpi_model == 7 || \ 1706 cpi->cpi_model == 8 || \ 1707 cpi->cpi_model == 0xA || \ 1708 cpi->cpi_model == 0xB) \ 1709 ) 1710 1711 /* A "new F6" is everything with family 6 that's not the above */ 1712 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi)) 1713 1714 /* Extended family/model support */ 1715 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \ 1716 cpi->cpi_family >= 0xf) 1717 1718 /* 1719 * Info for monitor/mwait idle loop. 1720 * 1721 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's 1722 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November 1723 * 2006. 1724 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual 1725 * Documentation Updates" #33633, Rev 2.05, December 2006. 1726 */ 1727 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */ 1728 #define MWAIT_EXTENSIONS (0x00000002) /* extensions supported */ 1729 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */ 1730 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON) 1731 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2) 1732 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1) 1733 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0) 1734 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0) 1735 /* 1736 * Number of sub-cstates for a given c-state. 1737 */ 1738 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \ 1739 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state) 1740 1741 /* 1742 * XSAVE leaf 0xD enumeration 1743 */ 1744 #define CPUID_LEAFD_2_YMM_OFFSET 576 1745 #define CPUID_LEAFD_2_YMM_SIZE 256 1746 1747 /* 1748 * Common extended leaf names to cut down on typos. 1749 */ 1750 #define CPUID_LEAF_EXT_0 0x80000000 1751 #define CPUID_LEAF_EXT_8 0x80000008 1752 #define CPUID_LEAF_EXT_1d 0x8000001d 1753 #define CPUID_LEAF_EXT_1e 0x8000001e 1754 1755 /* 1756 * Functions we consume from cpuid_subr.c; don't publish these in a header 1757 * file to try and keep people using the expected cpuid_* interfaces. 1758 */ 1759 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t); 1760 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t); 1761 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t); 1762 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t); 1763 extern uint_t _cpuid_vendorstr_to_vendorcode(char *); 1764 1765 /* 1766 * Apply various platform-dependent restrictions where the 1767 * underlying platform restrictions mean the CPU can be marked 1768 * as less capable than its cpuid instruction would imply. 1769 */ 1770 #if defined(__xpv) 1771 static void 1772 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp) 1773 { 1774 switch (eax) { 1775 case 1: { 1776 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1777 0 : CPUID_INTC_EDX_MCA; 1778 cp->cp_edx &= 1779 ~(mcamask | 1780 CPUID_INTC_EDX_PSE | 1781 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1782 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR | 1783 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT | 1784 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1785 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT); 1786 break; 1787 } 1788 1789 case 0x80000001: 1790 cp->cp_edx &= 1791 ~(CPUID_AMD_EDX_PSE | 1792 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1793 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE | 1794 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 | 1795 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1796 CPUID_AMD_EDX_TSCP); 1797 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY; 1798 break; 1799 default: 1800 break; 1801 } 1802 1803 switch (vendor) { 1804 case X86_VENDOR_Intel: 1805 switch (eax) { 1806 case 4: 1807 /* 1808 * Zero out the (ncores-per-chip - 1) field 1809 */ 1810 cp->cp_eax &= 0x03fffffff; 1811 break; 1812 default: 1813 break; 1814 } 1815 break; 1816 case X86_VENDOR_AMD: 1817 switch (eax) { 1818 1819 case 0x80000001: 1820 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D; 1821 break; 1822 1823 case CPUID_LEAF_EXT_8: 1824 /* 1825 * Zero out the (ncores-per-chip - 1) field 1826 */ 1827 cp->cp_ecx &= 0xffffff00; 1828 break; 1829 default: 1830 break; 1831 } 1832 break; 1833 default: 1834 break; 1835 } 1836 } 1837 #else 1838 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */ 1839 #endif 1840 1841 /* 1842 * Some undocumented ways of patching the results of the cpuid 1843 * instruction to permit running Solaris 10 on future cpus that 1844 * we don't currently support. Could be set to non-zero values 1845 * via settings in eeprom. 1846 */ 1847 1848 uint32_t cpuid_feature_ecx_include; 1849 uint32_t cpuid_feature_ecx_exclude; 1850 uint32_t cpuid_feature_edx_include; 1851 uint32_t cpuid_feature_edx_exclude; 1852 1853 /* 1854 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs. 1855 */ 1856 void 1857 cpuid_alloc_space(cpu_t *cpu) 1858 { 1859 /* 1860 * By convention, cpu0 is the boot cpu, which is set up 1861 * before memory allocation is available. All other cpus get 1862 * their cpuid_info struct allocated here. 1863 */ 1864 ASSERT(cpu->cpu_id != 0); 1865 ASSERT(cpu->cpu_m.mcpu_cpi == NULL); 1866 cpu->cpu_m.mcpu_cpi = 1867 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP); 1868 } 1869 1870 void 1871 cpuid_free_space(cpu_t *cpu) 1872 { 1873 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 1874 int i; 1875 1876 ASSERT(cpi != NULL); 1877 ASSERT(cpi != &cpuid_info0); 1878 1879 /* 1880 * Free up any cache leaf related dynamic storage. The first entry was 1881 * cached from the standard cpuid storage, so we should not free it. 1882 */ 1883 for (i = 1; i < cpi->cpi_cache_leaf_size; i++) 1884 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs)); 1885 if (cpi->cpi_cache_leaf_size > 0) 1886 kmem_free(cpi->cpi_cache_leaves, 1887 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *)); 1888 1889 kmem_free(cpi, sizeof (*cpi)); 1890 cpu->cpu_m.mcpu_cpi = NULL; 1891 } 1892 1893 #if !defined(__xpv) 1894 /* 1895 * Determine the type of the underlying platform. This is used to customize 1896 * initialization of various subsystems (e.g. TSC). determine_platform() must 1897 * only ever be called once to prevent two processors from seeing different 1898 * values of platform_type. Must be called before cpuid_pass1(), the earliest 1899 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv). 
1900 */ 1901 void 1902 determine_platform(void) 1903 { 1904 struct cpuid_regs cp; 1905 uint32_t base; 1906 uint32_t regs[4]; 1907 char *hvstr = (char *)regs; 1908 1909 ASSERT(platform_type == -1); 1910 1911 platform_type = HW_NATIVE; 1912 1913 if (!enable_platform_detection) 1914 return; 1915 1916 /* 1917 * If Hypervisor CPUID bit is set, try to determine hypervisor 1918 * vendor signature, and set platform type accordingly. 1919 * 1920 * References: 1921 * http://lkml.org/lkml/2008/10/1/246 1922 * http://kb.vmware.com/kb/1009458 1923 */ 1924 cp.cp_eax = 0x1; 1925 (void) __cpuid_insn(&cp); 1926 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) { 1927 cp.cp_eax = 0x40000000; 1928 (void) __cpuid_insn(&cp); 1929 regs[0] = cp.cp_ebx; 1930 regs[1] = cp.cp_ecx; 1931 regs[2] = cp.cp_edx; 1932 regs[3] = 0; 1933 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) { 1934 platform_type = HW_XEN_HVM; 1935 return; 1936 } 1937 if (strcmp(hvstr, HVSIG_VMWARE) == 0) { 1938 platform_type = HW_VMWARE; 1939 return; 1940 } 1941 if (strcmp(hvstr, HVSIG_KVM) == 0) { 1942 platform_type = HW_KVM; 1943 return; 1944 } 1945 if (strcmp(hvstr, HVSIG_BHYVE) == 0) { 1946 platform_type = HW_BHYVE; 1947 return; 1948 } 1949 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) 1950 platform_type = HW_MICROSOFT; 1951 } else { 1952 /* 1953 * Check older VMware hardware versions. VMware hypervisor is 1954 * detected by performing an IN operation to VMware hypervisor 1955 * port and checking that value returned in %ebx is VMware 1956 * hypervisor magic value. 1957 * 1958 * References: http://kb.vmware.com/kb/1009458 1959 */ 1960 vmware_port(VMWARE_HVCMD_GETVERSION, regs); 1961 if (regs[1] == VMWARE_HVMAGIC) { 1962 platform_type = HW_VMWARE; 1963 return; 1964 } 1965 } 1966 1967 /* 1968 * Check Xen hypervisor. In a fully virtualized domain, 1969 * Xen's pseudo-cpuid function returns a string representing the 1970 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum 1971 * supported cpuid function. We need at least a (base + 2) leaf value 1972 * to do what we want to do. Try different base values, since the 1973 * hypervisor might use a different one depending on whether Hyper-V 1974 * emulation is switched on by default or not. 1975 */ 1976 for (base = 0x40000000; base < 0x40010000; base += 0x100) { 1977 cp.cp_eax = base; 1978 (void) __cpuid_insn(&cp); 1979 regs[0] = cp.cp_ebx; 1980 regs[1] = cp.cp_ecx; 1981 regs[2] = cp.cp_edx; 1982 regs[3] = 0; 1983 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 && 1984 cp.cp_eax >= (base + 2)) { 1985 platform_type &= ~HW_NATIVE; 1986 platform_type |= HW_XEN_HVM; 1987 return; 1988 } 1989 } 1990 } 1991 1992 int 1993 get_hwenv(void) 1994 { 1995 ASSERT(platform_type != -1); 1996 return (platform_type); 1997 } 1998 1999 int 2000 is_controldom(void) 2001 { 2002 return (0); 2003 } 2004 2005 #else 2006 2007 int 2008 get_hwenv(void) 2009 { 2010 return (HW_XEN_PV); 2011 } 2012 2013 int 2014 is_controldom(void) 2015 { 2016 return (DOMAIN_IS_INITDOMAIN(xen_info)); 2017 } 2018 2019 #endif /* __xpv */ 2020 2021 /* 2022 * Make sure that we have gathered all of the CPUID leaves that we might need to 2023 * determine topology. We assume that the standard leaf 1 has already been done 2024 * and that xmaxeax has already been calculated. 
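 *
 * The leaves gathered here are consumed further down: leaf 0x80000008 by
 * cpuid_amd_ncores() (%ecx bits 7:0, the enabled core/thread count minus
 * one) and cpuid_amd_getids() (%ecx bits 15:12, the APIC core id size),
 * and leaf 0x8000001e by cpuid_gather_apicid(), cpuid_amd_ncores(),
 * cpuid_amd_get_coreid() and cpuid_amd_getids().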
2025 */ 2026 static void 2027 cpuid_gather_amd_topology_leaves(cpu_t *cpu) 2028 { 2029 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2030 2031 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2032 struct cpuid_regs *cp; 2033 2034 cp = &cpi->cpi_extd[8]; 2035 cp->cp_eax = CPUID_LEAF_EXT_8; 2036 (void) __cpuid_insn(cp); 2037 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp); 2038 } 2039 2040 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2041 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2042 struct cpuid_regs *cp; 2043 2044 cp = &cpi->cpi_extd[0x1e]; 2045 cp->cp_eax = CPUID_LEAF_EXT_1e; 2046 (void) __cpuid_insn(cp); 2047 } 2048 } 2049 2050 /* 2051 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer 2052 * it to everything else. If not, and we're on an AMD system where 8000001e is 2053 * valid, then we use that. Otherwise, we fall back to the default value for the 2054 * APIC ID in leaf 1. 2055 */ 2056 static uint32_t 2057 cpuid_gather_apicid(struct cpuid_info *cpi) 2058 { 2059 /* 2060 * Leaf B changes based on the arguments to it. Because we don't cache 2061 * it, we need to gather it again. 2062 */ 2063 if (cpi->cpi_maxeax >= 0xB) { 2064 struct cpuid_regs regs; 2065 struct cpuid_regs *cp; 2066 2067 cp = &regs; 2068 cp->cp_eax = 0xB; 2069 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 2070 (void) __cpuid_insn(cp); 2071 2072 if (cp->cp_ebx != 0) { 2073 return (cp->cp_edx); 2074 } 2075 } 2076 2077 if (cpi->cpi_vendor == X86_VENDOR_AMD && 2078 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2079 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2080 return (cpi->cpi_extd[0x1e].cp_eax); 2081 } 2082 2083 return (CPI_APIC_ID(cpi)); 2084 } 2085 2086 /* 2087 * For AMD processors, attempt to calculate the number of chips and cores that 2088 * exist. The way that we do this varies based on the generation, because the 2089 * generations themselves have changed dramatically. 2090 * 2091 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores. 2092 * However, with the advent of family 17h (Zen) it actually tells us the number 2093 * of threads, so we need to look at leaf 0x8000001e if available to determine 2094 * its value. Otherwise, for all prior families, the number of enabled cores is 2095 * the same as threads. 2096 * 2097 * If we do not have leaf 0x80000008, then we assume that this processor does 2098 * not have anything. AMD's older CPUID specification says there's no reason to 2099 * fall back to leaf 1. 2100 * 2101 * In some virtualization cases we will not have leaf 8000001e or it will be 2102 * zero. When that happens we assume the number of threads is one. 2103 */ 2104 static void 2105 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2106 { 2107 uint_t nthreads, nthread_per_core; 2108 2109 nthreads = nthread_per_core = 1; 2110 2111 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2112 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1; 2113 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2114 nthreads = CPI_CPU_COUNT(cpi); 2115 } 2116 2117 /* 2118 * For us to have threads, and know about it, we have to be at least at 2119 * family 17h and have the cpuid bit that says we have extended 2120 * topology.
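 *
 * As a concrete example of the encoding: leaf 0x8000001e reports
 * ThreadsPerCore - 1 in bits 15:8 of %ebx, so a two-thread-per-core Zen
 * part reports 1 there and the BITX() + 1 below yields two threads per
 * core.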
2121 */ 2122 if (cpi->cpi_family >= 0x17 && 2123 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2124 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2125 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2126 } 2127 2128 *ncpus = nthreads; 2129 *ncores = nthreads / nthread_per_core; 2130 } 2131 2132 /* 2133 * Seed the initial values for the cores and threads for an Intel-based 2134 * processor. These values will be overwritten if we detect that the processor 2135 * supports CPUID leaf 0xb. 2136 */ 2137 static void 2138 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2139 { 2140 /* 2141 * Only seed the number of physical cores from the first level leaf 4 2142 * information. The number of threads there indicates how many share the 2143 * L1 cache, which may or may not have anything to do with the number of 2144 * logical CPUs per core. 2145 */ 2146 if (cpi->cpi_maxeax >= 4) { 2147 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1; 2148 } else { 2149 *ncores = 1; 2150 } 2151 2152 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2153 *ncpus = CPI_CPU_COUNT(cpi); 2154 } else { 2155 *ncpus = *ncores; 2156 } 2157 } 2158 2159 static boolean_t 2160 cpuid_leafB_getids(cpu_t *cpu) 2161 { 2162 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2163 struct cpuid_regs regs; 2164 struct cpuid_regs *cp; 2165 2166 if (cpi->cpi_maxeax < 0xB) 2167 return (B_FALSE); 2168 2169 cp = &regs; 2170 cp->cp_eax = 0xB; 2171 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 2172 2173 (void) __cpuid_insn(cp); 2174 2175 /* 2176 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which 2177 * indicates that the extended topology enumeration leaf is 2178 * available. 2179 */ 2180 if (cp->cp_ebx != 0) { 2181 uint32_t x2apic_id = 0; 2182 uint_t coreid_shift = 0; 2183 uint_t ncpu_per_core = 1; 2184 uint_t chipid_shift = 0; 2185 uint_t ncpu_per_chip = 1; 2186 uint_t i; 2187 uint_t level; 2188 2189 for (i = 0; i < CPI_FNB_ECX_MAX; i++) { 2190 cp->cp_eax = 0xB; 2191 cp->cp_ecx = i; 2192 2193 (void) __cpuid_insn(cp); 2194 level = CPI_CPU_LEVEL_TYPE(cp); 2195 2196 if (level == 1) { 2197 x2apic_id = cp->cp_edx; 2198 coreid_shift = BITX(cp->cp_eax, 4, 0); 2199 ncpu_per_core = BITX(cp->cp_ebx, 15, 0); 2200 } else if (level == 2) { 2201 x2apic_id = cp->cp_edx; 2202 chipid_shift = BITX(cp->cp_eax, 4, 0); 2203 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0); 2204 } 2205 } 2206 2207 /* 2208 * cpi_apicid is taken care of in cpuid_gather_apicid. 2209 */ 2210 cpi->cpi_ncpu_per_chip = ncpu_per_chip; 2211 cpi->cpi_ncore_per_chip = ncpu_per_chip / 2212 ncpu_per_core; 2213 cpi->cpi_chipid = x2apic_id >> chipid_shift; 2214 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1); 2215 cpi->cpi_coreid = x2apic_id >> coreid_shift; 2216 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2217 cpi->cpi_procnodeid = cpi->cpi_chipid; 2218 cpi->cpi_compunitid = cpi->cpi_coreid; 2219 2220 if (coreid_shift > 0 && chipid_shift > coreid_shift) { 2221 cpi->cpi_nthread_bits = coreid_shift; 2222 cpi->cpi_ncore_bits = chipid_shift - coreid_shift; 2223 } 2224 2225 return (B_TRUE); 2226 } else { 2227 return (B_FALSE); 2228 } 2229 } 2230 2231 static void 2232 cpuid_intel_getids(cpu_t *cpu, void *feature) 2233 { 2234 uint_t i; 2235 uint_t chipid_shift = 0; 2236 uint_t coreid_shift = 0; 2237 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2238 2239 /* 2240 * There are no compute units or processor nodes currently on Intel. 2241 * Always set these to one.
2242 */ 2243 cpi->cpi_procnodes_per_pkg = 1; 2244 cpi->cpi_cores_per_compunit = 1; 2245 2246 /* 2247 * If cpuid Leaf B is present, use that to try and get this information. 2248 * It will be the most accurate for Intel CPUs. 2249 */ 2250 if (cpuid_leafB_getids(cpu)) 2251 return; 2252 2253 /* 2254 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip 2255 * and ncore_per_chip. These represent the largest power of two values 2256 * that we need to cover all of the IDs in the system. Therefore, we use 2257 * those values to seed the number of bits needed to cover information 2258 * in the case when leaf B is not available. These values will probably 2259 * be larger than required, but that's OK. 2260 */ 2261 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip); 2262 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip); 2263 2264 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1) 2265 chipid_shift++; 2266 2267 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift; 2268 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1); 2269 2270 if (is_x86_feature(feature, X86FSET_CMP)) { 2271 /* 2272 * Multi-core (and possibly multi-threaded) 2273 * processors. 2274 */ 2275 uint_t ncpu_per_core; 2276 if (cpi->cpi_ncore_per_chip == 1) 2277 ncpu_per_core = cpi->cpi_ncpu_per_chip; 2278 else if (cpi->cpi_ncore_per_chip > 1) 2279 ncpu_per_core = cpi->cpi_ncpu_per_chip / 2280 cpi->cpi_ncore_per_chip; 2281 /* 2282 * 8bit APIC IDs on dual core Pentiums 2283 * look like this: 2284 * 2285 * +-----------------------+------+------+ 2286 * | Physical Package ID | MC | HT | 2287 * +-----------------------+------+------+ 2288 * <------- chipid --------> 2289 * <------- coreid ---------------> 2290 * <--- clogid --> 2291 * <------> 2292 * pkgcoreid 2293 * 2294 * Where the number of bits necessary to 2295 * represent MC and HT fields together equals 2296 * to the minimum number of bits necessary to 2297 * store the value of cpi->cpi_ncpu_per_chip. 2298 * Of those bits, the MC part uses the number 2299 * of bits necessary to store the value of 2300 * cpi->cpi_ncore_per_chip. 2301 */ 2302 for (i = 1; i < ncpu_per_core; i <<= 1) 2303 coreid_shift++; 2304 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift; 2305 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2306 } else if (is_x86_feature(feature, X86FSET_HTT)) { 2307 /* 2308 * Single-core multi-threaded processors. 2309 */ 2310 cpi->cpi_coreid = cpi->cpi_chipid; 2311 cpi->cpi_pkgcoreid = 0; 2312 } else { 2313 /* 2314 * Single-core single-thread processors. 2315 */ 2316 cpi->cpi_coreid = cpu->cpu_id; 2317 cpi->cpi_pkgcoreid = 0; 2318 } 2319 cpi->cpi_procnodeid = cpi->cpi_chipid; 2320 cpi->cpi_compunitid = cpi->cpi_coreid; 2321 } 2322 2323 /* 2324 * Historically, AMD has had CMP chips with only a single thread per core. 2325 * However, starting in family 17h (Zen), this has changed and they now have 2326 * multiple threads. Our internal core id needs to be a unique value. 2327 * 2328 * To determine the core id of an AMD system, if we're from a family before 17h, 2329 * then we just use the cpu id, as that gives us a good value that will be 2330 * unique for each core. If instead, we're on family 17h or later, then we need 2331 * to do something more complicated. CPUID leaf 0x8000001e can tell us 2332 * how many threads are in the system. Based on that, we'll shift the APIC ID. 2333 * We can't use the normal core id in that leaf as it's only unique within the 2334 * socket, which is perfect for cpi_pkgcoreid, but not us. 
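 *
 * As a worked, hypothetical example: on a two-thread-per-core family 17h
 * part, the two threads of one core might have APIC IDs 4 and 5; shifting
 * off the single thread bit gives both of them core id 2, while the next
 * core's threads (APIC IDs 6 and 7) both end up with core id 3.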
2335 */ 2336 static id_t 2337 cpuid_amd_get_coreid(cpu_t *cpu) 2338 { 2339 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2340 2341 if (cpi->cpi_family >= 0x17 && 2342 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2343 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2344 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2345 if (nthreads > 1) { 2346 VERIFY3U(nthreads, ==, 2); 2347 return (cpi->cpi_apicid >> 1); 2348 } 2349 } 2350 2351 return (cpu->cpu_id); 2352 } 2353 2354 /* 2355 * IDs on AMD is a more challenging task. This is notable because of the 2356 * following two facts: 2357 * 2358 * 1. Before family 0x17 (Zen), there was no support for SMT and there was 2359 * also no way to get an actual unique core id from the system. As such, we 2360 * synthesize this case by using cpu->cpu_id. This scheme does not, 2361 * however, guarantee that sibling cores of a chip will have sequential 2362 * coreids starting at a multiple of the number of cores per chip - that is 2363 * usually the case, but if the ACPI MADT table is presented in a different 2364 * order then we need to perform a few more gymnastics for the pkgcoreid. 2365 * 2366 * 2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups 2367 * called compute units. These compute units share the L1I cache, L2 cache, 2368 * and the FPU. To deal with this, a new topology leaf was added in 2369 * 0x8000001e. However, parts of this leaf have different meanings 2370 * once we get to family 0x17. 2371 */ 2372 2373 static void 2374 cpuid_amd_getids(cpu_t *cpu, uchar_t *features) 2375 { 2376 int i, first_half, coreidsz; 2377 uint32_t nb_caps_reg; 2378 uint_t node2_1; 2379 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2380 struct cpuid_regs *cp; 2381 2382 /* 2383 * Calculate the core id (this comes from hardware in family 0x17 if it 2384 * hasn't been stripped by virtualization). We always set the compute 2385 * unit id to the same value. Also, initialize the default number of 2386 * cores per compute unit and nodes per package. This will be 2387 * overwritten when we know information about a particular family. 2388 */ 2389 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu); 2390 cpi->cpi_compunitid = cpi->cpi_coreid; 2391 cpi->cpi_cores_per_compunit = 1; 2392 cpi->cpi_procnodes_per_pkg = 1; 2393 2394 /* 2395 * To construct the logical ID, we need to determine how many APIC IDs 2396 * are dedicated to the cores and threads. This is provided for us in 2397 * 0x80000008. However, if it's not present (say due to virtualization), 2398 * then we assume it's one. This should be present on all 64-bit AMD 2399 * processors. It was added in family 0xf (Hammer). 2400 */ 2401 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2402 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12); 2403 2404 /* 2405 * In AMD parlance chip is really a node while illumos 2406 * uses chip as equivalent to socket/package. 2407 */ 2408 if (coreidsz == 0) { 2409 /* Use legacy method */ 2410 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1) 2411 coreidsz++; 2412 if (coreidsz == 0) 2413 coreidsz = 1; 2414 } 2415 } else { 2416 /* Assume single-core part */ 2417 coreidsz = 1; 2418 } 2419 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1); 2420 2421 /* 2422 * The package core ID varies depending on the family. While it may be 2423 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately, 2424 * this value is the core id in the given node. 
For non-virtualized 2425 * family 17h, we need to take the logical core id and shift off the 2426 * threads like we do when getting the core id. Otherwise, we can use 2427 * the clogid as is. When family 17h is virtualized, the clogid should 2428 * be sufficient as if we don't have valid data in the leaf, then we 2429 * won't think we have SMT, in which case the cpi_clogid should be 2430 * sufficient. 2431 */ 2432 if (cpi->cpi_family >= 0x17 && 2433 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2434 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e && 2435 cpi->cpi_extd[0x1e].cp_ebx != 0) { 2436 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2437 if (nthreads > 1) { 2438 VERIFY3U(nthreads, ==, 2); 2439 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1; 2440 } else { 2441 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2442 } 2443 } else { 2444 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2445 } 2446 2447 /* 2448 * Obtain the node ID and compute unit IDs. If we're on family 0x15 2449 * (bulldozer) or newer, then we can derive all of this from leaf 2450 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family. 2451 */ 2452 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2453 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2454 cp = &cpi->cpi_extd[0x1e]; 2455 2456 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1; 2457 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0); 2458 2459 /* 2460 * For Bulldozer-era CPUs, recalculate the compute unit 2461 * information. 2462 */ 2463 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) { 2464 cpi->cpi_cores_per_compunit = 2465 BITX(cp->cp_ebx, 15, 8) + 1; 2466 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) + 2467 (cpi->cpi_ncore_per_chip / 2468 cpi->cpi_cores_per_compunit) * 2469 (cpi->cpi_procnodeid / 2470 cpi->cpi_procnodes_per_pkg); 2471 } 2472 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) { 2473 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7; 2474 } else if (cpi->cpi_family == 0x10) { 2475 /* 2476 * See if we are a multi-node processor. 2477 * All processors in the system have the same number of nodes 2478 */ 2479 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8); 2480 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) { 2481 /* Single-node */ 2482 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5, 2483 coreidsz); 2484 } else { 2485 2486 /* 2487 * Multi-node revision D (2 nodes per package 2488 * are supported) 2489 */ 2490 cpi->cpi_procnodes_per_pkg = 2; 2491 2492 first_half = (cpi->cpi_pkgcoreid <= 2493 (cpi->cpi_ncore_per_chip/2 - 1)); 2494 2495 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) { 2496 /* We are BSP */ 2497 cpi->cpi_procnodeid = (first_half ? 
0 : 1); 2498 } else { 2499 2500 /* We are AP */ 2501 /* NodeId[2:1] bits to use for reading F3xe8 */ 2502 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1; 2503 2504 nb_caps_reg = 2505 pci_getl_func(0, 24 + node2_1, 3, 0xe8); 2506 2507 /* 2508 * Check IntNodeNum bit (31:30, but bit 31 is 2509 * always 0 on dual-node processors) 2510 */ 2511 if (BITX(nb_caps_reg, 30, 30) == 0) 2512 cpi->cpi_procnodeid = node2_1 + 2513 !first_half; 2514 else 2515 cpi->cpi_procnodeid = node2_1 + 2516 first_half; 2517 } 2518 } 2519 } else { 2520 cpi->cpi_procnodeid = 0; 2521 } 2522 2523 cpi->cpi_chipid = 2524 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg; 2525 2526 cpi->cpi_ncore_bits = coreidsz; 2527 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip / 2528 cpi->cpi_ncore_per_chip); 2529 } 2530 2531 static void 2532 spec_uarch_flush_noop(void) 2533 { 2534 } 2535 2536 /* 2537 * When microcode is present that mitigates MDS, this wrmsr will also flush the 2538 * MDS-related micro-architectural state that would normally happen by calling 2539 * x86_md_clear(). 2540 */ 2541 static void 2542 spec_uarch_flush_msr(void) 2543 { 2544 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); 2545 } 2546 2547 /* 2548 * This function points to a function that will flush certain 2549 * micro-architectural state on the processor. This flush is used to mitigate 2550 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This 2551 * function can point to one of three functions: 2552 * 2553 * - A noop which is done because we either are vulnerable, but do not have 2554 * microcode available to help deal with a fix, or because we aren't 2555 * vulnerable. 2556 * 2557 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to 2558 * mitigate MDS is present, also perform the equivalent of the MDS flush; 2559 * however, it only flushes the MDS related micro-architectural state on the 2560 * current hyperthread, it does not do anything for the twin. 2561 * 2562 * - x86_md_clear which will flush the MDS related state. This is done when we 2563 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF 2564 * (RDCL_NO is set). 2565 */ 2566 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop; 2567 2568 static void 2569 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset) 2570 { 2571 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2572 2573 /* 2574 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS) 2575 * has been fixed in hardware, it doesn't cover everything related to 2576 * MDS. Therefore we can only rely on MDS_NO to determine that we don't 2577 * need to mitigate this. 2578 */ 2579 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2580 is_x86_feature(featureset, X86FSET_MDS_NO)) { 2581 return; 2582 } 2583 2584 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2585 const uint8_t nop = NOP_INSTR; 2586 uint8_t *md = (uint8_t *)x86_md_clear; 2587 2588 *md = nop; 2589 } 2590 2591 membar_producer(); 2592 } 2593 2594 static void 2595 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset) 2596 { 2597 boolean_t need_l1d, need_mds; 2598 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2599 2600 /* 2601 * If we're not on Intel or we've mitigated both RDCL and MDS in 2602 * hardware, then there's nothing left for us to do for enabling the 2603 * flush. We can also go ahead and say that SMT exclusion is 2604 * unnecessary. 
2605 */ 2606 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2607 (is_x86_feature(featureset, X86FSET_RDCL_NO) && 2608 is_x86_feature(featureset, X86FSET_MDS_NO))) { 2609 extern int smt_exclusion; 2610 smt_exclusion = 0; 2611 spec_uarch_flush = spec_uarch_flush_noop; 2612 membar_producer(); 2613 return; 2614 } 2615 2616 /* 2617 * The locations where we need to perform an L1D flush are required both 2618 * for mitigating L1TF and MDS. When verw support is present in 2619 * microcode, then the L1D flush will take care of doing that as well. 2620 * However, if we have a system where RDCL_NO is present, but we don't 2621 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full 2622 * L1D flush. 2623 */ 2624 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) && 2625 is_x86_feature(featureset, X86FSET_FLUSH_CMD) && 2626 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) { 2627 need_l1d = B_TRUE; 2628 } else { 2629 need_l1d = B_FALSE; 2630 } 2631 2632 if (!is_x86_feature(featureset, X86FSET_MDS_NO) && 2633 is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2634 need_mds = B_TRUE; 2635 } else { 2636 need_mds = B_FALSE; 2637 } 2638 2639 if (need_l1d) { 2640 spec_uarch_flush = spec_uarch_flush_msr; 2641 } else if (need_mds) { 2642 spec_uarch_flush = x86_md_clear; 2643 } else { 2644 /* 2645 * We have no hardware mitigations available to us. 2646 */ 2647 spec_uarch_flush = spec_uarch_flush_noop; 2648 } 2649 membar_producer(); 2650 } 2651 2652 /* 2653 * We default to enabling RSB mitigations. 2654 */ 2655 static void 2656 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit) 2657 { 2658 const uint8_t ret = RET_INSTR; 2659 uint8_t *stuff = (uint8_t *)x86_rsb_stuff; 2660 2661 switch (mit) { 2662 case X86_SPECTREV2_ENHANCED_IBRS: 2663 case X86_SPECTREV2_DISABLED: 2664 *stuff = ret; 2665 break; 2666 default: 2667 break; 2668 } 2669 } 2670 2671 static void 2672 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit) 2673 { 2674 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi", 2675 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13", 2676 "_r14", "_r15" }; 2677 const uint_t nthunks = ARRAY_SIZE(thunks); 2678 const char *type; 2679 uint_t i; 2680 2681 if (mit == x86_spectrev2_mitigation) 2682 return; 2683 2684 switch (mit) { 2685 case X86_SPECTREV2_RETPOLINE: 2686 type = "gen"; 2687 break; 2688 case X86_SPECTREV2_RETPOLINE_AMD: 2689 type = "amd"; 2690 break; 2691 case X86_SPECTREV2_ENHANCED_IBRS: 2692 case X86_SPECTREV2_DISABLED: 2693 type = "jmp"; 2694 break; 2695 default: 2696 panic("asked to updated retpoline state with unknown state!"); 2697 } 2698 2699 for (i = 0; i < nthunks; i++) { 2700 uintptr_t source, dest; 2701 int ssize, dsize; 2702 char sourcebuf[64], destbuf[64]; 2703 size_t len; 2704 2705 (void) snprintf(destbuf, sizeof (destbuf), 2706 "__x86_indirect_thunk%s", thunks[i]); 2707 (void) snprintf(sourcebuf, sizeof (sourcebuf), 2708 "__x86_indirect_thunk_%s%s", type, thunks[i]); 2709 2710 source = kobj_getelfsym(sourcebuf, NULL, &ssize); 2711 dest = kobj_getelfsym(destbuf, NULL, &dsize); 2712 VERIFY3U(source, !=, 0); 2713 VERIFY3U(dest, !=, 0); 2714 VERIFY3S(dsize, >=, ssize); 2715 bcopy((void *)source, (void *)dest, ssize); 2716 } 2717 } 2718 2719 static void 2720 cpuid_enable_enhanced_ibrs(void) 2721 { 2722 uint64_t val; 2723 2724 val = rdmsr(MSR_IA32_SPEC_CTRL); 2725 val |= IA32_SPEC_CTRL_IBRS; 2726 wrmsr(MSR_IA32_SPEC_CTRL, val); 2727 } 2728 2729 #ifndef __xpv 2730 /* 2731 * Determine whether or not we can use the AMD optimized retpoline 2732 * functionality. 
We use this when we know we're on an AMD system and we can 2733 * successfully verify that lfence is dispatch serializing. 2734 */ 2735 static boolean_t 2736 cpuid_use_amd_retpoline(struct cpuid_info *cpi) 2737 { 2738 uint64_t val; 2739 on_trap_data_t otd; 2740 2741 if (cpi->cpi_vendor != X86_VENDOR_AMD) 2742 return (B_FALSE); 2743 2744 /* 2745 * We need to determine whether or not lfence is serializing. It always 2746 * is on families 0xf and 0x11. On others, it's controlled by 2747 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a 2748 * crazy old family, don't try and do anything. 2749 */ 2750 if (cpi->cpi_family < 0xf) 2751 return (B_FALSE); 2752 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) 2753 return (B_TRUE); 2754 2755 /* 2756 * While it may be tempting to use get_hwenv(), there are no promises 2757 * that a hypervisor will actually declare themselves to be so in a 2758 * friendly way. As such, try to read and set the MSR. If we can then 2759 * read back the value we set (it wasn't just set to zero), then we go 2760 * for it. 2761 */ 2762 if (!on_trap(&otd, OT_DATA_ACCESS)) { 2763 val = rdmsr(MSR_AMD_DECODE_CONFIG); 2764 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH; 2765 wrmsr(MSR_AMD_DECODE_CONFIG, val); 2766 val = rdmsr(MSR_AMD_DECODE_CONFIG); 2767 } else { 2768 val = 0; 2769 } 2770 no_trap(); 2771 2772 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0) 2773 return (B_TRUE); 2774 return (B_FALSE); 2775 } 2776 #endif /* !__xpv */ 2777 2778 /* 2779 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if 2780 * we can disable TSX, we do so. 2781 * 2782 * This determination is done only on the boot CPU, potentially after loading 2783 * updated microcode. 2784 */ 2785 static void 2786 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset) 2787 { 2788 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2789 2790 VERIFY(cpu->cpu_id == 0); 2791 2792 if (cpi->cpi_vendor != X86_VENDOR_Intel) { 2793 x86_taa_mitigation = X86_TAA_HW_MITIGATED; 2794 return; 2795 } 2796 2797 if (x86_disable_taa) { 2798 x86_taa_mitigation = X86_TAA_DISABLED; 2799 return; 2800 } 2801 2802 /* 2803 * If we do not have the ability to disable TSX, then our only 2804 * mitigation options are in hardware (TAA_NO), or by using our existing 2805 * MDS mitigation as described above. The latter relies upon us having 2806 * configured MDS mitigations correctly! This includes disabling SMT if 2807 * we want to cross-CPU-thread protection. 2808 */ 2809 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) { 2810 /* 2811 * It's not clear whether any parts will enumerate TAA_NO 2812 * *without* TSX_CTRL, but let's mark it as such if we see this. 2813 */ 2814 if (is_x86_feature(featureset, X86FSET_TAA_NO)) { 2815 x86_taa_mitigation = X86_TAA_HW_MITIGATED; 2816 return; 2817 } 2818 2819 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) && 2820 !is_x86_feature(featureset, X86FSET_MDS_NO)) { 2821 x86_taa_mitigation = X86_TAA_MD_CLEAR; 2822 } else { 2823 x86_taa_mitigation = X86_TAA_NOTHING; 2824 } 2825 return; 2826 } 2827 2828 /* 2829 * We have TSX_CTRL, but we can only fully disable TSX if we're early 2830 * enough in boot. 2831 * 2832 * Otherwise, we'll fall back to causing transactions to abort as our 2833 * mitigation. TSX-using code will always take the fallback path. 
2834 */ 2835 if (cpi->cpi_pass < 4) { 2836 x86_taa_mitigation = X86_TAA_TSX_DISABLE; 2837 } else { 2838 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT; 2839 } 2840 } 2841 2842 static void 2843 cpuid_apply_tsx(x86_taa_mitigation_t taa) 2844 { 2845 uint64_t val; 2846 2847 switch (taa) { 2848 case X86_TAA_TSX_DISABLE: 2849 val = rdmsr(MSR_IA32_TSX_CTRL); 2850 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE; 2851 wrmsr(MSR_IA32_TSX_CTRL, val); 2852 break; 2853 case X86_TAA_TSX_FORCE_ABORT: 2854 val = rdmsr(MSR_IA32_TSX_CTRL); 2855 val |= IA32_TSX_CTRL_RTM_DISABLE; 2856 wrmsr(MSR_IA32_TSX_CTRL, val); 2857 break; 2858 case X86_TAA_HW_MITIGATED: 2859 case X86_TAA_MD_CLEAR: 2860 case X86_TAA_DISABLED: 2861 case X86_TAA_NOTHING: 2862 break; 2863 } 2864 } 2865 2866 static void 2867 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) 2868 { 2869 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2870 x86_spectrev2_mitigation_t v2mit; 2871 2872 if (cpi->cpi_vendor == X86_VENDOR_AMD && 2873 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2874 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB) 2875 add_x86_feature(featureset, X86FSET_IBPB); 2876 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS) 2877 add_x86_feature(featureset, X86FSET_IBRS); 2878 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP) 2879 add_x86_feature(featureset, X86FSET_STIBP); 2880 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL) 2881 add_x86_feature(featureset, X86FSET_STIBP_ALL); 2882 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD) 2883 add_x86_feature(featureset, X86FSET_SSBD); 2884 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD) 2885 add_x86_feature(featureset, X86FSET_SSBD_VIRT); 2886 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO) 2887 add_x86_feature(featureset, X86FSET_SSB_NO); 2888 /* 2889 * Don't enable enhanced IBRS unless we're told that we should 2890 * prefer it and it has the same semantics as Intel. This is 2891 * split into two bits rather than a single one. 2892 */ 2893 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) && 2894 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) { 2895 add_x86_feature(featureset, X86FSET_IBRS_ALL); 2896 } 2897 2898 } else if (cpi->cpi_vendor == X86_VENDOR_Intel && 2899 cpi->cpi_maxeax >= 7) { 2900 struct cpuid_regs *ecp; 2901 ecp = &cpi->cpi_std[7]; 2902 2903 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) { 2904 add_x86_feature(featureset, X86FSET_MD_CLEAR); 2905 } 2906 2907 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) { 2908 add_x86_feature(featureset, X86FSET_IBRS); 2909 add_x86_feature(featureset, X86FSET_IBPB); 2910 } 2911 2912 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) { 2913 add_x86_feature(featureset, X86FSET_STIBP); 2914 } 2915 2916 /* 2917 * Don't read the arch caps MSR on xpv where we lack the 2918 * on_trap(). 2919 */ 2920 #ifndef __xpv 2921 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) { 2922 on_trap_data_t otd; 2923 2924 /* 2925 * Be paranoid and assume we'll get a #GP. 
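 *
 * The guarded-read pattern used here (and for MSR_AMD_DECODE_CONFIG in
 * cpuid_use_amd_retpoline() above) is, in sketch form:
 *
 *	on_trap_data_t otd;
 *	uint64_t reg = 0;
 *
 *	if (!on_trap(&otd, OT_DATA_ACCESS))
 *		reg = rdmsr(MSR_OF_INTEREST);	(MSR name illustrative)
 *	no_trap();
 *
 * If the rdmsr raises #GP, control resumes as though on_trap() had
 * returned non-zero and we simply treat the MSR as absent.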
2926 */ 2927 if (!on_trap(&otd, OT_DATA_ACCESS)) { 2928 uint64_t reg; 2929 2930 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES); 2931 if (reg & IA32_ARCH_CAP_RDCL_NO) { 2932 add_x86_feature(featureset, 2933 X86FSET_RDCL_NO); 2934 } 2935 if (reg & IA32_ARCH_CAP_IBRS_ALL) { 2936 add_x86_feature(featureset, 2937 X86FSET_IBRS_ALL); 2938 } 2939 if (reg & IA32_ARCH_CAP_RSBA) { 2940 add_x86_feature(featureset, 2941 X86FSET_RSBA); 2942 } 2943 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) { 2944 add_x86_feature(featureset, 2945 X86FSET_L1D_VM_NO); 2946 } 2947 if (reg & IA32_ARCH_CAP_SSB_NO) { 2948 add_x86_feature(featureset, 2949 X86FSET_SSB_NO); 2950 } 2951 if (reg & IA32_ARCH_CAP_MDS_NO) { 2952 add_x86_feature(featureset, 2953 X86FSET_MDS_NO); 2954 } 2955 if (reg & IA32_ARCH_CAP_TSX_CTRL) { 2956 add_x86_feature(featureset, 2957 X86FSET_TSX_CTRL); 2958 } 2959 if (reg & IA32_ARCH_CAP_TAA_NO) { 2960 add_x86_feature(featureset, 2961 X86FSET_TAA_NO); 2962 } 2963 } 2964 no_trap(); 2965 } 2966 #endif /* !__xpv */ 2967 2968 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD) 2969 add_x86_feature(featureset, X86FSET_SSBD); 2970 2971 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD) 2972 add_x86_feature(featureset, X86FSET_FLUSH_CMD); 2973 } 2974 2975 /* 2976 * Take care of certain mitigations on the non-boot CPU. The boot CPU 2977 * will have already run this function and determined what we need to 2978 * do. This gives us a hook for per-HW thread mitigations such as 2979 * enhanced IBRS, or disabling TSX. For TSX disabling, we need to be 2980 * careful that we've had a chance to load ucode that enables the new 2981 * MSRs. 2982 */ 2983 if (cpu->cpu_id != 0) { 2984 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) { 2985 cpuid_enable_enhanced_ibrs(); 2986 } 2987 2988 if (cpi->cpi_pass >= 1) 2989 cpuid_apply_tsx(x86_taa_mitigation); 2990 return; 2991 } 2992 2993 /* 2994 * Go through and initialize various security mechanisms that we should 2995 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and 2996 * TAA. 2997 */ 2998 2999 /* 3000 * By default we've come in with retpolines enabled. Check whether we 3001 * should disable them or enable enhanced IBRS. RSB stuffing is enabled 3002 * by default, but disabled if we are using enhanced IBRS. 3003 */ 3004 if (x86_disable_spectrev2 != 0) { 3005 v2mit = X86_SPECTREV2_DISABLED; 3006 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) { 3007 cpuid_enable_enhanced_ibrs(); 3008 v2mit = X86_SPECTREV2_ENHANCED_IBRS; 3009 #ifndef __xpv 3010 } else if (cpuid_use_amd_retpoline(cpi)) { 3011 v2mit = X86_SPECTREV2_RETPOLINE_AMD; 3012 #endif /* !__xpv */ 3013 } else { 3014 v2mit = X86_SPECTREV2_RETPOLINE; 3015 } 3016 3017 cpuid_patch_retpolines(v2mit); 3018 cpuid_patch_rsb(v2mit); 3019 x86_spectrev2_mitigation = v2mit; 3020 membar_producer(); 3021 3022 /* 3023 * We need to determine what changes are required for mitigating L1TF 3024 * and MDS. If the CPU suffers from either of them, then SMT exclusion 3025 * is required. 3026 * 3027 * If any of these are present, then we need to flush u-arch state at 3028 * various points. For MDS, we need to do so whenever we change to a 3029 * lesser privilege level or we are halting the CPU. For L1TF we need to 3030 * flush the L1D cache at VM entry. When we have microcode that handles 3031 * MDS, the L1D flush also clears the other u-arch state that the 3032 * md_clear does. 3033 */ 3034 3035 /* 3036 * Update whether or not we need to be taking explicit action against 3037 * MDS. 
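 *
 * A rough summary of where spec_uarch_flush ends up pointed once the
 * calls below have run (hedged; cpuid_update_l1d_flush() earlier in
 * this file is authoritative, and non-Intel parts always take the
 * first, no-op case):
 *
 *	RDCL_NO and MDS_NO both present		-> spec_uarch_flush_noop
 *	no RDCL_NO, FLUSH_CMD, no L1D_VM_NO	-> spec_uarch_flush_msr
 *	otherwise, no MDS_NO and MD_CLEAR	-> x86_md_clear (verw)
 *	otherwise				-> spec_uarch_flush_noop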
3038 */ 3039 cpuid_update_md_clear(cpu, featureset); 3040 3041 /* 3042 * Determine whether SMT exclusion is required and whether or not we 3043 * need to perform an l1d flush. 3044 */ 3045 cpuid_update_l1d_flush(cpu, featureset); 3046 3047 /* 3048 * Determine what our mitigation strategy should be for TAA and then 3049 * also apply TAA mitigations. 3050 */ 3051 cpuid_update_tsx(cpu, featureset); 3052 cpuid_apply_tsx(x86_taa_mitigation); 3053 } 3054 3055 /* 3056 * Setup XFeature_Enabled_Mask register. Required by xsave feature. 3057 */ 3058 void 3059 setup_xfem(void) 3060 { 3061 uint64_t flags = XFEATURE_LEGACY_FP; 3062 3063 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE)); 3064 3065 if (is_x86_feature(x86_featureset, X86FSET_SSE)) 3066 flags |= XFEATURE_SSE; 3067 3068 if (is_x86_feature(x86_featureset, X86FSET_AVX)) 3069 flags |= XFEATURE_AVX; 3070 3071 if (is_x86_feature(x86_featureset, X86FSET_AVX512F)) 3072 flags |= XFEATURE_AVX512; 3073 3074 set_xcr(XFEATURE_ENABLED_MASK, flags); 3075 3076 xsave_bv_all = flags; 3077 } 3078 3079 static void 3080 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset) 3081 { 3082 struct cpuid_info *cpi; 3083 3084 cpi = cpu->cpu_m.mcpu_cpi; 3085 3086 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 3087 cpuid_gather_amd_topology_leaves(cpu); 3088 } 3089 3090 cpi->cpi_apicid = cpuid_gather_apicid(cpi); 3091 3092 /* 3093 * Before we can calculate the IDs that we should assign to this 3094 * processor, we need to understand how many cores and threads it has. 3095 */ 3096 switch (cpi->cpi_vendor) { 3097 case X86_VENDOR_Intel: 3098 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip, 3099 &cpi->cpi_ncore_per_chip); 3100 break; 3101 case X86_VENDOR_AMD: 3102 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip, 3103 &cpi->cpi_ncore_per_chip); 3104 break; 3105 default: 3106 /* 3107 * If we have some other x86 compatible chip, it's not clear how 3108 * they would behave. The most common case is virtualization 3109 * today, though there are also 64-bit VIA chips. Assume that 3110 * all we can get is the basic Leaf 1 HTT information. 3111 */ 3112 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 3113 cpi->cpi_ncore_per_chip = 1; 3114 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi); 3115 } 3116 break; 3117 } 3118 3119 /* 3120 * Based on the calculated number of threads and cores, potentially 3121 * assign the HTT and CMT features. 3122 */ 3123 if (cpi->cpi_ncore_per_chip > 1) { 3124 add_x86_feature(featureset, X86FSET_CMP); 3125 } 3126 3127 if (cpi->cpi_ncpu_per_chip > 1 && 3128 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) { 3129 add_x86_feature(featureset, X86FSET_HTT); 3130 } 3131 3132 /* 3133 * Now that has been set up, we need to go through and calculate all of 3134 * the rest of the parameters that exist. If we think the CPU doesn't 3135 * have either SMT (HTT) or CMP, then we basically go through and fake 3136 * up information in some way. The most likely case for this is 3137 * virtualization where we have a lot of partial topology information. 3138 */ 3139 if (!is_x86_feature(featureset, X86FSET_HTT) && 3140 !is_x86_feature(featureset, X86FSET_CMP)) { 3141 /* 3142 * This is a single core, single-threaded processor. 
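 *
 * (A concrete, purely hypothetical illustration of the classification
 * above: a part reporting 2 cores and 4 logical CPUs per chip gets both
 * CMP and HTT; 2 cores and 2 CPUs gets only CMP; 1 core and 2 CPUs gets
 * only HTT; 1 core and 1 CPU gets neither and lands here.)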
3143 */ 3144 cpi->cpi_procnodes_per_pkg = 1; 3145 cpi->cpi_cores_per_compunit = 1; 3146 cpi->cpi_compunitid = 0; 3147 cpi->cpi_chipid = -1; 3148 cpi->cpi_clogid = 0; 3149 cpi->cpi_coreid = cpu->cpu_id; 3150 cpi->cpi_pkgcoreid = 0; 3151 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 3152 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0); 3153 } else { 3154 cpi->cpi_procnodeid = cpi->cpi_chipid; 3155 } 3156 } else { 3157 switch (cpi->cpi_vendor) { 3158 case X86_VENDOR_Intel: 3159 cpuid_intel_getids(cpu, featureset); 3160 break; 3161 case X86_VENDOR_AMD: 3162 cpuid_amd_getids(cpu, featureset); 3163 break; 3164 default: 3165 /* 3166 * In this case, it's hard to say what we should do. 3167 * We're going to model them to the OS as single core 3168 * threads. We don't have a good identifier for them, so 3169 * we're just going to use the cpu id all on a single 3170 * chip. 3171 * 3172 * This case has historically been different from the 3173 * case above where we don't have HTT or CMP. While they 3174 * could be combined, we've opted to keep it separate to 3175 * minimize the risk of topology changes in weird cases. 3176 */ 3177 cpi->cpi_procnodes_per_pkg = 1; 3178 cpi->cpi_cores_per_compunit = 1; 3179 cpi->cpi_chipid = 0; 3180 cpi->cpi_coreid = cpu->cpu_id; 3181 cpi->cpi_clogid = cpu->cpu_id; 3182 cpi->cpi_pkgcoreid = cpu->cpu_id; 3183 cpi->cpi_procnodeid = cpi->cpi_chipid; 3184 cpi->cpi_compunitid = cpi->cpi_coreid; 3185 break; 3186 } 3187 } 3188 } 3189 3190 /* 3191 * Gather relevant CPU features from leaf 6 which covers thermal information. We 3192 * always gather leaf 6 if it's supported; however, we only look for features on 3193 * Intel systems as AMD does not currently define any of the features we look 3194 * for below. 3195 */ 3196 static void 3197 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset) 3198 { 3199 struct cpuid_regs *cp; 3200 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 3201 3202 if (cpi->cpi_maxeax < 6) { 3203 return; 3204 } 3205 3206 cp = &cpi->cpi_std[6]; 3207 cp->cp_eax = 6; 3208 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0; 3209 (void) __cpuid_insn(cp); 3210 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp); 3211 3212 if (cpi->cpi_vendor != X86_VENDOR_Intel) { 3213 return; 3214 } 3215 3216 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) { 3217 add_x86_feature(featureset, X86FSET_CORE_THERMAL); 3218 } 3219 3220 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) { 3221 add_x86_feature(featureset, X86FSET_PKG_THERMAL); 3222 } 3223 } 3224 3225 void 3226 cpuid_pass1(cpu_t *cpu, uchar_t *featureset) 3227 { 3228 uint32_t mask_ecx, mask_edx; 3229 struct cpuid_info *cpi; 3230 struct cpuid_regs *cp; 3231 int xcpuid; 3232 #if !defined(__xpv) 3233 extern int idle_cpu_prefer_mwait; 3234 #endif 3235 3236 /* 3237 * Space statically allocated for BSP, ensure pointer is set 3238 */ 3239 if (cpu->cpu_id == 0) { 3240 if (cpu->cpu_m.mcpu_cpi == NULL) 3241 cpu->cpu_m.mcpu_cpi = &cpuid_info0; 3242 } 3243 3244 add_x86_feature(featureset, X86FSET_CPUID); 3245 3246 cpi = cpu->cpu_m.mcpu_cpi; 3247 ASSERT(cpi != NULL); 3248 cp = &cpi->cpi_std[0]; 3249 cp->cp_eax = 0; 3250 cpi->cpi_maxeax = __cpuid_insn(cp); 3251 { 3252 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr; 3253 *iptr++ = cp->cp_ebx; 3254 *iptr++ = cp->cp_edx; 3255 *iptr++ = cp->cp_ecx; 3256 *(char *)&cpi->cpi_vendorstr[12] = '\0'; 3257 } 3258 3259 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr); 3260 x86_vendor = cpi->cpi_vendor; /* for compatibility */ 3261 3262 /* 3263 * Limit the range in case of weird hardware 3264 */ 3265 if 
(cpi->cpi_maxeax > CPI_MAXEAX_MAX) 3266 cpi->cpi_maxeax = CPI_MAXEAX_MAX; 3267 if (cpi->cpi_maxeax < 1) 3268 goto pass1_done; 3269 3270 cp = &cpi->cpi_std[1]; 3271 cp->cp_eax = 1; 3272 (void) __cpuid_insn(cp); 3273 3274 /* 3275 * Extract identifying constants for easy access. 3276 */ 3277 cpi->cpi_model = CPI_MODEL(cpi); 3278 cpi->cpi_family = CPI_FAMILY(cpi); 3279 3280 if (cpi->cpi_family == 0xf) 3281 cpi->cpi_family += CPI_FAMILY_XTD(cpi); 3282 3283 /* 3284 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf. 3285 * Intel, and presumably everyone else, uses model == 0xf, as 3286 * one would expect (max value means possible overflow). Sigh. 3287 */ 3288 3289 switch (cpi->cpi_vendor) { 3290 case X86_VENDOR_Intel: 3291 if (IS_EXTENDED_MODEL_INTEL(cpi)) 3292 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3293 break; 3294 case X86_VENDOR_AMD: 3295 if (CPI_FAMILY(cpi) == 0xf) 3296 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3297 break; 3298 default: 3299 if (cpi->cpi_model == 0xf) 3300 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3301 break; 3302 } 3303 3304 cpi->cpi_step = CPI_STEP(cpi); 3305 cpi->cpi_brandid = CPI_BRANDID(cpi); 3306 3307 /* 3308 * *default* assumptions: 3309 * - believe %edx feature word 3310 * - ignore %ecx feature word 3311 * - 32-bit virtual and physical addressing 3312 */ 3313 mask_edx = 0xffffffff; 3314 mask_ecx = 0; 3315 3316 cpi->cpi_pabits = cpi->cpi_vabits = 32; 3317 3318 switch (cpi->cpi_vendor) { 3319 case X86_VENDOR_Intel: 3320 if (cpi->cpi_family == 5) 3321 x86_type = X86_TYPE_P5; 3322 else if (IS_LEGACY_P6(cpi)) { 3323 x86_type = X86_TYPE_P6; 3324 pentiumpro_bug4046376 = 1; 3325 /* 3326 * Clear the SEP bit when it was set erroneously 3327 */ 3328 if (cpi->cpi_model < 3 && cpi->cpi_step < 3) 3329 cp->cp_edx &= ~CPUID_INTC_EDX_SEP; 3330 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) { 3331 x86_type = X86_TYPE_P4; 3332 /* 3333 * We don't currently depend on any of the %ecx 3334 * features until Prescott, so we'll only check 3335 * this from P4 onwards. We might want to revisit 3336 * that idea later. 3337 */ 3338 mask_ecx = 0xffffffff; 3339 } else if (cpi->cpi_family > 0xf) 3340 mask_ecx = 0xffffffff; 3341 /* 3342 * We don't support MONITOR/MWAIT if leaf 5 is not available 3343 * to obtain the monitor linesize. 3344 */ 3345 if (cpi->cpi_maxeax < 5) 3346 mask_ecx &= ~CPUID_INTC_ECX_MON; 3347 break; 3348 case X86_VENDOR_IntelClone: 3349 default: 3350 break; 3351 case X86_VENDOR_AMD: 3352 #if defined(OPTERON_ERRATUM_108) 3353 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) { 3354 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0; 3355 cpi->cpi_model = 0xc; 3356 } else 3357 #endif 3358 if (cpi->cpi_family == 5) { 3359 /* 3360 * AMD K5 and K6 3361 * 3362 * These CPUs have an incomplete implementation 3363 * of MCA/MCE which we mask away. 3364 */ 3365 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA); 3366 3367 /* 3368 * Model 0 uses the wrong (APIC) bit 3369 * to indicate PGE. Fix it here. 3370 */ 3371 if (cpi->cpi_model == 0) { 3372 if (cp->cp_edx & 0x200) { 3373 cp->cp_edx &= ~0x200; 3374 cp->cp_edx |= CPUID_INTC_EDX_PGE; 3375 } 3376 } 3377 3378 /* 3379 * Early models had problems w/ MMX; disable. 3380 */ 3381 if (cpi->cpi_model < 6) 3382 mask_edx &= ~CPUID_INTC_EDX_MMX; 3383 } 3384 3385 /* 3386 * For newer families, SSE3 and CX16, at least, are valid; 3387 * enable all 3388 */ 3389 if (cpi->cpi_family >= 0xf) 3390 mask_ecx = 0xffffffff; 3391 /* 3392 * We don't support MONITOR/MWAIT if leaf 5 is not available 3393 * to obtain the monitor linesize. 
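 *
 * (Leaf 5 itself is parsed later, in cpuid_pass2(). Architecturally it
 * reports the smallest and largest monitor line sizes in the low 16
 * bits of %eax and %ebx, roughly:
 *
 *	cp->cp_eax = 5;
 *	cp->cp_ecx = 0;
 *	(void) __cpuid_insn(cp);
 *	smallest = cp->cp_eax & 0xffff;
 *	largest = cp->cp_ebx & 0xffff;
 *
 * which is presumably what the MWAIT_SIZE_MIN/MWAIT_SIZE_MAX macros
 * used there wrap.)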
3394 */ 3395 if (cpi->cpi_maxeax < 5) 3396 mask_ecx &= ~CPUID_INTC_ECX_MON; 3397 3398 #if !defined(__xpv) 3399 /* 3400 * AMD has not historically used MWAIT in the CPU's idle loop. 3401 * Pre-family-10h Opterons do not have the MWAIT instruction. We 3402 * know for certain that in at least family 17h, per AMD, mwait 3403 * is preferred. Families in-between are less certain. 3404 */ 3405 if (cpi->cpi_family < 0x17) { 3406 idle_cpu_prefer_mwait = 0; 3407 } 3408 #endif 3409 3410 break; 3411 case X86_VENDOR_TM: 3412 /* 3413 * workaround the NT workaround in CMS 4.1 3414 */ 3415 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 && 3416 (cpi->cpi_step == 2 || cpi->cpi_step == 3)) 3417 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3418 break; 3419 case X86_VENDOR_Centaur: 3420 /* 3421 * workaround the NT workarounds again 3422 */ 3423 if (cpi->cpi_family == 6) 3424 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3425 break; 3426 case X86_VENDOR_Cyrix: 3427 /* 3428 * We rely heavily on the probing in locore 3429 * to actually figure out what parts, if any, 3430 * of the Cyrix cpuid instruction to believe. 3431 */ 3432 switch (x86_type) { 3433 case X86_TYPE_CYRIX_486: 3434 mask_edx = 0; 3435 break; 3436 case X86_TYPE_CYRIX_6x86: 3437 mask_edx = 0; 3438 break; 3439 case X86_TYPE_CYRIX_6x86L: 3440 mask_edx = 3441 CPUID_INTC_EDX_DE | 3442 CPUID_INTC_EDX_CX8; 3443 break; 3444 case X86_TYPE_CYRIX_6x86MX: 3445 mask_edx = 3446 CPUID_INTC_EDX_DE | 3447 CPUID_INTC_EDX_MSR | 3448 CPUID_INTC_EDX_CX8 | 3449 CPUID_INTC_EDX_PGE | 3450 CPUID_INTC_EDX_CMOV | 3451 CPUID_INTC_EDX_MMX; 3452 break; 3453 case X86_TYPE_CYRIX_GXm: 3454 mask_edx = 3455 CPUID_INTC_EDX_MSR | 3456 CPUID_INTC_EDX_CX8 | 3457 CPUID_INTC_EDX_CMOV | 3458 CPUID_INTC_EDX_MMX; 3459 break; 3460 case X86_TYPE_CYRIX_MediaGX: 3461 break; 3462 case X86_TYPE_CYRIX_MII: 3463 case X86_TYPE_VIA_CYRIX_III: 3464 mask_edx = 3465 CPUID_INTC_EDX_DE | 3466 CPUID_INTC_EDX_TSC | 3467 CPUID_INTC_EDX_MSR | 3468 CPUID_INTC_EDX_CX8 | 3469 CPUID_INTC_EDX_PGE | 3470 CPUID_INTC_EDX_CMOV | 3471 CPUID_INTC_EDX_MMX; 3472 break; 3473 default: 3474 break; 3475 } 3476 break; 3477 } 3478 3479 #if defined(__xpv) 3480 /* 3481 * Do not support MONITOR/MWAIT under a hypervisor 3482 */ 3483 mask_ecx &= ~CPUID_INTC_ECX_MON; 3484 /* 3485 * Do not support XSAVE under a hypervisor for now 3486 */ 3487 xsave_force_disable = B_TRUE; 3488 3489 #endif /* __xpv */ 3490 3491 if (xsave_force_disable) { 3492 mask_ecx &= ~CPUID_INTC_ECX_XSAVE; 3493 mask_ecx &= ~CPUID_INTC_ECX_AVX; 3494 mask_ecx &= ~CPUID_INTC_ECX_F16C; 3495 mask_ecx &= ~CPUID_INTC_ECX_FMA; 3496 } 3497 3498 /* 3499 * Now we've figured out the masks that determine 3500 * which bits we choose to believe, apply the masks 3501 * to the feature words, then map the kernel's view 3502 * of these feature words into its feature word. 3503 */ 3504 cp->cp_edx &= mask_edx; 3505 cp->cp_ecx &= mask_ecx; 3506 3507 /* 3508 * apply any platform restrictions (we don't call this 3509 * immediately after __cpuid_insn here, because we need the 3510 * workarounds applied above first) 3511 */ 3512 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp); 3513 3514 /* 3515 * In addition to ecx and edx, Intel and AMD are storing a bunch of 3516 * instruction set extensions in leaf 7's ebx, ecx, and edx. 3517 */ 3518 if (cpi->cpi_maxeax >= 7) { 3519 struct cpuid_regs *ecp; 3520 ecp = &cpi->cpi_std[7]; 3521 ecp->cp_eax = 7; 3522 ecp->cp_ecx = 0; 3523 (void) __cpuid_insn(ecp); 3524 3525 /* 3526 * If XSAVE has been disabled, just ignore all of the 3527 * extended-save-area dependent flags here. 
3528 */ 3529 if (xsave_force_disable) { 3530 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 3531 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 3532 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 3533 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX; 3534 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512; 3535 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512; 3536 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512; 3537 } 3538 3539 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP) 3540 add_x86_feature(featureset, X86FSET_SMEP); 3541 3542 /* 3543 * We check disable_smap here in addition to in startup_smap() 3544 * to ensure CPUs that aren't the boot CPU don't accidentally 3545 * include it in the feature set and thus generate a mismatched 3546 * x86 feature set across CPUs. 3547 */ 3548 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP && 3549 disable_smap == 0) 3550 add_x86_feature(featureset, X86FSET_SMAP); 3551 3552 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED) 3553 add_x86_feature(featureset, X86FSET_RDSEED); 3554 3555 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX) 3556 add_x86_feature(featureset, X86FSET_ADX); 3557 3558 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 3559 add_x86_feature(featureset, X86FSET_FSGSBASE); 3560 3561 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 3562 add_x86_feature(featureset, X86FSET_CLFLUSHOPT); 3563 3564 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 3565 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID) 3566 add_x86_feature(featureset, X86FSET_INVPCID); 3567 3568 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX) 3569 add_x86_feature(featureset, X86FSET_MPX); 3570 3571 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB) 3572 add_x86_feature(featureset, X86FSET_CLWB); 3573 } 3574 } 3575 3576 /* 3577 * fold in overrides from the "eeprom" mechanism 3578 */ 3579 cp->cp_edx |= cpuid_feature_edx_include; 3580 cp->cp_edx &= ~cpuid_feature_edx_exclude; 3581 3582 cp->cp_ecx |= cpuid_feature_ecx_include; 3583 cp->cp_ecx &= ~cpuid_feature_ecx_exclude; 3584 3585 if (cp->cp_edx & CPUID_INTC_EDX_PSE) { 3586 add_x86_feature(featureset, X86FSET_LARGEPAGE); 3587 } 3588 if (cp->cp_edx & CPUID_INTC_EDX_TSC) { 3589 add_x86_feature(featureset, X86FSET_TSC); 3590 } 3591 if (cp->cp_edx & CPUID_INTC_EDX_MSR) { 3592 add_x86_feature(featureset, X86FSET_MSR); 3593 } 3594 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) { 3595 add_x86_feature(featureset, X86FSET_MTRR); 3596 } 3597 if (cp->cp_edx & CPUID_INTC_EDX_PGE) { 3598 add_x86_feature(featureset, X86FSET_PGE); 3599 } 3600 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) { 3601 add_x86_feature(featureset, X86FSET_CMOV); 3602 } 3603 if (cp->cp_edx & CPUID_INTC_EDX_MMX) { 3604 add_x86_feature(featureset, X86FSET_MMX); 3605 } 3606 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 && 3607 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) { 3608 add_x86_feature(featureset, X86FSET_MCA); 3609 } 3610 if (cp->cp_edx & CPUID_INTC_EDX_PAE) { 3611 add_x86_feature(featureset, X86FSET_PAE); 3612 } 3613 if (cp->cp_edx & CPUID_INTC_EDX_CX8) { 3614 add_x86_feature(featureset, X86FSET_CX8); 3615 } 3616 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) { 3617 add_x86_feature(featureset, X86FSET_CX16); 3618 } 3619 if (cp->cp_edx & CPUID_INTC_EDX_PAT) { 3620 add_x86_feature(featureset, X86FSET_PAT); 3621 } 3622 if (cp->cp_edx & CPUID_INTC_EDX_SEP) { 3623 add_x86_feature(featureset, X86FSET_SEP); 3624 } 3625 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) { 3626 /* 3627 * In our implementation, fxsave/fxrstor 3628 * are prerequisites before we'll even 3629 * try and do SSE things. 
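 *
 * A rough sketch of the gating hierarchy applied in the block below
 * (the code is authoritative; this only summarizes the nesting):
 *
 *	FXSR
 *	  +- SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AES, PCLMULQDQ, ...
 *	  +- XSAVE
 *	       +- AVX
 *	       |    +- F16C, FMA, BMI1, BMI2, AVX2
 *	       +- AVX512F (Intel only)
 *	            +- AVX512DQ, IFMA, PF, ER, CD, BW, VL, VBMI, VNNI, ...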
3630 */ 3631 if (cp->cp_edx & CPUID_INTC_EDX_SSE) { 3632 add_x86_feature(featureset, X86FSET_SSE); 3633 } 3634 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) { 3635 add_x86_feature(featureset, X86FSET_SSE2); 3636 } 3637 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) { 3638 add_x86_feature(featureset, X86FSET_SSE3); 3639 } 3640 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) { 3641 add_x86_feature(featureset, X86FSET_SSSE3); 3642 } 3643 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) { 3644 add_x86_feature(featureset, X86FSET_SSE4_1); 3645 } 3646 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) { 3647 add_x86_feature(featureset, X86FSET_SSE4_2); 3648 } 3649 if (cp->cp_ecx & CPUID_INTC_ECX_AES) { 3650 add_x86_feature(featureset, X86FSET_AES); 3651 } 3652 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) { 3653 add_x86_feature(featureset, X86FSET_PCLMULQDQ); 3654 } 3655 3656 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA) 3657 add_x86_feature(featureset, X86FSET_SHA); 3658 3659 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP) 3660 add_x86_feature(featureset, X86FSET_UMIP); 3661 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU) 3662 add_x86_feature(featureset, X86FSET_PKU); 3663 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE) 3664 add_x86_feature(featureset, X86FSET_OSPKE); 3665 3666 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) { 3667 add_x86_feature(featureset, X86FSET_XSAVE); 3668 3669 /* We only test AVX & AVX512 when there is XSAVE */ 3670 3671 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) { 3672 add_x86_feature(featureset, 3673 X86FSET_AVX); 3674 3675 /* 3676 * Intel says we can't check these without also 3677 * checking AVX. 3678 */ 3679 if (cp->cp_ecx & CPUID_INTC_ECX_F16C) 3680 add_x86_feature(featureset, 3681 X86FSET_F16C); 3682 3683 if (cp->cp_ecx & CPUID_INTC_ECX_FMA) 3684 add_x86_feature(featureset, 3685 X86FSET_FMA); 3686 3687 if (cpi->cpi_std[7].cp_ebx & 3688 CPUID_INTC_EBX_7_0_BMI1) 3689 add_x86_feature(featureset, 3690 X86FSET_BMI1); 3691 3692 if (cpi->cpi_std[7].cp_ebx & 3693 CPUID_INTC_EBX_7_0_BMI2) 3694 add_x86_feature(featureset, 3695 X86FSET_BMI2); 3696 3697 if (cpi->cpi_std[7].cp_ebx & 3698 CPUID_INTC_EBX_7_0_AVX2) 3699 add_x86_feature(featureset, 3700 X86FSET_AVX2); 3701 } 3702 3703 if (cpi->cpi_vendor == X86_VENDOR_Intel && 3704 (cpi->cpi_std[7].cp_ebx & 3705 CPUID_INTC_EBX_7_0_AVX512F) != 0) { 3706 add_x86_feature(featureset, X86FSET_AVX512F); 3707 3708 if (cpi->cpi_std[7].cp_ebx & 3709 CPUID_INTC_EBX_7_0_AVX512DQ) 3710 add_x86_feature(featureset, 3711 X86FSET_AVX512DQ); 3712 if (cpi->cpi_std[7].cp_ebx & 3713 CPUID_INTC_EBX_7_0_AVX512IFMA) 3714 add_x86_feature(featureset, 3715 X86FSET_AVX512FMA); 3716 if (cpi->cpi_std[7].cp_ebx & 3717 CPUID_INTC_EBX_7_0_AVX512PF) 3718 add_x86_feature(featureset, 3719 X86FSET_AVX512PF); 3720 if (cpi->cpi_std[7].cp_ebx & 3721 CPUID_INTC_EBX_7_0_AVX512ER) 3722 add_x86_feature(featureset, 3723 X86FSET_AVX512ER); 3724 if (cpi->cpi_std[7].cp_ebx & 3725 CPUID_INTC_EBX_7_0_AVX512CD) 3726 add_x86_feature(featureset, 3727 X86FSET_AVX512CD); 3728 if (cpi->cpi_std[7].cp_ebx & 3729 CPUID_INTC_EBX_7_0_AVX512BW) 3730 add_x86_feature(featureset, 3731 X86FSET_AVX512BW); 3732 if (cpi->cpi_std[7].cp_ebx & 3733 CPUID_INTC_EBX_7_0_AVX512VL) 3734 add_x86_feature(featureset, 3735 X86FSET_AVX512VL); 3736 3737 if (cpi->cpi_std[7].cp_ecx & 3738 CPUID_INTC_ECX_7_0_AVX512VBMI) 3739 add_x86_feature(featureset, 3740 X86FSET_AVX512VBMI); 3741 if (cpi->cpi_std[7].cp_ecx & 3742 CPUID_INTC_ECX_7_0_AVX512VNNI) 3743 add_x86_feature(featureset, 3744 X86FSET_AVX512VNNI); 3745 if (cpi->cpi_std[7].cp_ecx & 3746 
CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 3747 add_x86_feature(featureset, 3748 X86FSET_AVX512VPOPCDQ); 3749 3750 if (cpi->cpi_std[7].cp_edx & 3751 CPUID_INTC_EDX_7_0_AVX5124NNIW) 3752 add_x86_feature(featureset, 3753 X86FSET_AVX512NNIW); 3754 if (cpi->cpi_std[7].cp_edx & 3755 CPUID_INTC_EDX_7_0_AVX5124FMAPS) 3756 add_x86_feature(featureset, 3757 X86FSET_AVX512FMAPS); 3758 } 3759 } 3760 } 3761 3762 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 3763 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) { 3764 add_x86_feature(featureset, X86FSET_PCID); 3765 } 3766 } 3767 3768 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) { 3769 add_x86_feature(featureset, X86FSET_X2APIC); 3770 } 3771 if (cp->cp_edx & CPUID_INTC_EDX_DE) { 3772 add_x86_feature(featureset, X86FSET_DE); 3773 } 3774 #if !defined(__xpv) 3775 if (cp->cp_ecx & CPUID_INTC_ECX_MON) { 3776 3777 /* 3778 * We require the CLFLUSH instruction for erratum workaround 3779 * to use MONITOR/MWAIT. 3780 */ 3781 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { 3782 cpi->cpi_mwait.support |= MWAIT_SUPPORT; 3783 add_x86_feature(featureset, X86FSET_MWAIT); 3784 } else { 3785 extern int idle_cpu_assert_cflush_monitor; 3786 3787 /* 3788 * All processors we are aware of which have 3789 * MONITOR/MWAIT also have CLFLUSH. 3790 */ 3791 if (idle_cpu_assert_cflush_monitor) { 3792 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) && 3793 (cp->cp_edx & CPUID_INTC_EDX_CLFSH)); 3794 } 3795 } 3796 } 3797 #endif /* __xpv */ 3798 3799 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) { 3800 add_x86_feature(featureset, X86FSET_VMX); 3801 } 3802 3803 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND) 3804 add_x86_feature(featureset, X86FSET_RDRAND); 3805 3806 /* 3807 * Only need it first time, rest of the cpus would follow suit. 3808 * we only capture this for the bootcpu. 3809 */ 3810 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { 3811 add_x86_feature(featureset, X86FSET_CLFSH); 3812 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8); 3813 } 3814 if (is_x86_feature(featureset, X86FSET_PAE)) 3815 cpi->cpi_pabits = 36; 3816 3817 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) { 3818 struct cpuid_regs r, *ecp; 3819 3820 ecp = &r; 3821 ecp->cp_eax = 0xD; 3822 ecp->cp_ecx = 1; 3823 ecp->cp_edx = ecp->cp_ebx = 0; 3824 (void) __cpuid_insn(ecp); 3825 3826 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT) 3827 add_x86_feature(featureset, X86FSET_XSAVEOPT); 3828 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC) 3829 add_x86_feature(featureset, X86FSET_XSAVEC); 3830 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES) 3831 add_x86_feature(featureset, X86FSET_XSAVES); 3832 } 3833 3834 /* 3835 * Work on the "extended" feature information, doing 3836 * some basic initialization for cpuid_pass2() 3837 */ 3838 xcpuid = 0; 3839 switch (cpi->cpi_vendor) { 3840 case X86_VENDOR_Intel: 3841 /* 3842 * On KVM we know we will have proper support for extended 3843 * cpuid. 3844 */ 3845 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf || 3846 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 && 3847 (cpi->cpi_model == 6 || cpi->cpi_model == 2))) 3848 xcpuid++; 3849 break; 3850 case X86_VENDOR_AMD: 3851 if (cpi->cpi_family > 5 || 3852 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 3853 xcpuid++; 3854 break; 3855 case X86_VENDOR_Cyrix: 3856 /* 3857 * Only these Cyrix CPUs are -known- to support 3858 * extended cpuid operations. 
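 *
 * (When the probe below is attempted, leaf CPUID_LEAF_EXT_0
 * (0x80000000) returns the maximum supported extended leaf in %eax. A
 * CPU with no extended leaves echoes back basic-leaf data instead, so
 * the value will not have bit 31 set; that is why the code that follows
 * tests cpi_xmaxeax & CPUID_LEAF_EXT_0 before believing any of it.)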
3859 */ 3860 if (x86_type == X86_TYPE_VIA_CYRIX_III || 3861 x86_type == X86_TYPE_CYRIX_GXm) 3862 xcpuid++; 3863 break; 3864 case X86_VENDOR_Centaur: 3865 case X86_VENDOR_TM: 3866 default: 3867 xcpuid++; 3868 break; 3869 } 3870 3871 if (xcpuid) { 3872 cp = &cpi->cpi_extd[0]; 3873 cp->cp_eax = CPUID_LEAF_EXT_0; 3874 cpi->cpi_xmaxeax = __cpuid_insn(cp); 3875 } 3876 3877 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) { 3878 3879 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX) 3880 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX; 3881 3882 switch (cpi->cpi_vendor) { 3883 case X86_VENDOR_Intel: 3884 case X86_VENDOR_AMD: 3885 if (cpi->cpi_xmaxeax < 0x80000001) 3886 break; 3887 cp = &cpi->cpi_extd[1]; 3888 cp->cp_eax = 0x80000001; 3889 (void) __cpuid_insn(cp); 3890 3891 if (cpi->cpi_vendor == X86_VENDOR_AMD && 3892 cpi->cpi_family == 5 && 3893 cpi->cpi_model == 6 && 3894 cpi->cpi_step == 6) { 3895 /* 3896 * K6 model 6 uses bit 10 to indicate SYSC 3897 * Later models use bit 11. Fix it here. 3898 */ 3899 if (cp->cp_edx & 0x400) { 3900 cp->cp_edx &= ~0x400; 3901 cp->cp_edx |= CPUID_AMD_EDX_SYSC; 3902 } 3903 } 3904 3905 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp); 3906 3907 /* 3908 * Compute the additions to the kernel's feature word. 3909 */ 3910 if (cp->cp_edx & CPUID_AMD_EDX_NX) { 3911 add_x86_feature(featureset, X86FSET_NX); 3912 } 3913 3914 /* 3915 * Regardless whether or not we boot 64-bit, 3916 * we should have a way to identify whether 3917 * the CPU is capable of running 64-bit. 3918 */ 3919 if (cp->cp_edx & CPUID_AMD_EDX_LM) { 3920 add_x86_feature(featureset, X86FSET_64); 3921 } 3922 3923 /* 1 GB large page - enable only for 64 bit kernel */ 3924 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) { 3925 add_x86_feature(featureset, X86FSET_1GPG); 3926 } 3927 3928 if ((cpi->cpi_vendor == X86_VENDOR_AMD) && 3929 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) && 3930 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) { 3931 add_x86_feature(featureset, X86FSET_SSE4A); 3932 } 3933 3934 /* 3935 * It's really tricky to support syscall/sysret in 3936 * the i386 kernel; we rely on sysenter/sysexit 3937 * instead. In the amd64 kernel, things are -way- 3938 * better. 3939 */ 3940 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) { 3941 add_x86_feature(featureset, X86FSET_ASYSC); 3942 } 3943 3944 /* 3945 * While we're thinking about system calls, note 3946 * that AMD processors don't support sysenter 3947 * in long mode at all, so don't try to program them. 3948 */ 3949 if (x86_vendor == X86_VENDOR_AMD) { 3950 remove_x86_feature(featureset, X86FSET_SEP); 3951 } 3952 3953 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) { 3954 add_x86_feature(featureset, X86FSET_TSCP); 3955 } 3956 3957 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) { 3958 add_x86_feature(featureset, X86FSET_SVM); 3959 } 3960 3961 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) { 3962 add_x86_feature(featureset, X86FSET_TOPOEXT); 3963 } 3964 3965 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) { 3966 add_x86_feature(featureset, X86FSET_AMD_PCEC); 3967 } 3968 3969 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) { 3970 add_x86_feature(featureset, X86FSET_XOP); 3971 } 3972 3973 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) { 3974 add_x86_feature(featureset, X86FSET_FMA4); 3975 } 3976 3977 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) { 3978 add_x86_feature(featureset, X86FSET_TBM); 3979 } 3980 3981 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) { 3982 add_x86_feature(featureset, X86FSET_MONITORX); 3983 } 3984 break; 3985 default: 3986 break; 3987 } 3988 3989 /* 3990 * Get CPUID data about processor cores and hyperthreads. 
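 *
 * (The same leaf CPUID_LEAF_EXT_8 query below also supplies the
 * physical and virtual address widths. As a worked example of the
 * BITX() extraction done there: a hypothetical part returning
 * %eax == 0x3030 yields cpi_pabits = BITX(0x3030, 7, 0) = 0x30, i.e. 48
 * physical address bits, and cpi_vabits = BITX(0x3030, 15, 8) = 0x30,
 * i.e. 48 virtual address bits.)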
3991 */ 3992 switch (cpi->cpi_vendor) { 3993 case X86_VENDOR_Intel: 3994 if (cpi->cpi_maxeax >= 4) { 3995 cp = &cpi->cpi_std[4]; 3996 cp->cp_eax = 4; 3997 cp->cp_ecx = 0; 3998 (void) __cpuid_insn(cp); 3999 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp); 4000 } 4001 /*FALLTHROUGH*/ 4002 case X86_VENDOR_AMD: 4003 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) 4004 break; 4005 cp = &cpi->cpi_extd[8]; 4006 cp->cp_eax = CPUID_LEAF_EXT_8; 4007 (void) __cpuid_insn(cp); 4008 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, 4009 cp); 4010 4011 /* 4012 * AMD uses ebx for some extended functions. 4013 */ 4014 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 4015 /* 4016 * While we're here, check for the AMD "Error 4017 * Pointer Zero/Restore" feature. This can be 4018 * used to setup the FP save handlers 4019 * appropriately. 4020 */ 4021 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 4022 cpi->cpi_fp_amd_save = 0; 4023 } else { 4024 cpi->cpi_fp_amd_save = 1; 4025 } 4026 4027 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) { 4028 add_x86_feature(featureset, 4029 X86FSET_CLZERO); 4030 } 4031 } 4032 4033 /* 4034 * Virtual and physical address limits from 4035 * cpuid override previously guessed values. 4036 */ 4037 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0); 4038 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8); 4039 break; 4040 default: 4041 break; 4042 } 4043 4044 /* 4045 * Get CPUID data about TSC Invariance in Deep C-State. 4046 */ 4047 switch (cpi->cpi_vendor) { 4048 case X86_VENDOR_Intel: 4049 case X86_VENDOR_AMD: 4050 if (cpi->cpi_maxeax >= 7) { 4051 cp = &cpi->cpi_extd[7]; 4052 cp->cp_eax = 0x80000007; 4053 cp->cp_ecx = 0; 4054 (void) __cpuid_insn(cp); 4055 } 4056 break; 4057 default: 4058 break; 4059 } 4060 } 4061 4062 cpuid_pass1_topology(cpu, featureset); 4063 cpuid_pass1_thermal(cpu, featureset); 4064 4065 /* 4066 * Synthesize chip "revision" and socket type 4067 */ 4068 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family, 4069 cpi->cpi_model, cpi->cpi_step); 4070 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor, 4071 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step); 4072 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family, 4073 cpi->cpi_model, cpi->cpi_step); 4074 4075 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 4076 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 && 4077 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 4078 /* Special handling for AMD FP not necessary. */ 4079 cpi->cpi_fp_amd_save = 0; 4080 } else { 4081 cpi->cpi_fp_amd_save = 1; 4082 } 4083 } 4084 4085 /* 4086 * Check the processor leaves that are used for security features. 4087 */ 4088 cpuid_scan_security(cpu, featureset); 4089 4090 pass1_done: 4091 cpi->cpi_pass = 1; 4092 } 4093 4094 /* 4095 * Make copies of the cpuid table entries we depend on, in 4096 * part for ease of parsing now, in part so that we have only 4097 * one place to correct any of it, in part for ease of 4098 * later export to userland, and in part so we can look at 4099 * this stuff in a crash dump. 
4100 */ 4101 4102 /*ARGSUSED*/ 4103 void 4104 cpuid_pass2(cpu_t *cpu) 4105 { 4106 uint_t n, nmax; 4107 int i; 4108 struct cpuid_regs *cp; 4109 uint8_t *dp; 4110 uint32_t *iptr; 4111 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 4112 4113 ASSERT(cpi->cpi_pass == 1); 4114 4115 if (cpi->cpi_maxeax < 1) 4116 goto pass2_done; 4117 4118 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD) 4119 nmax = NMAX_CPI_STD; 4120 /* 4121 * (We already handled n == 0 and n == 1 in pass 1) 4122 */ 4123 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) { 4124 /* 4125 * leaves 6 and 7 were handled in pass 1 4126 */ 4127 if (n == 6 || n == 7) 4128 continue; 4129 4130 cp->cp_eax = n; 4131 4132 /* 4133 * CPUID function 4 expects %ecx to be initialized 4134 * with an index which indicates which cache to return 4135 * information about. The OS is expected to call function 4 4136 * with %ecx set to 0, 1, 2, ... until it returns with 4137 * EAX[4:0] set to 0, which indicates there are no more 4138 * caches. 4139 * 4140 * Here, populate cpi_std[4] with the information returned by 4141 * function 4 when %ecx == 0, and do the rest in cpuid_pass3() 4142 * when dynamic memory allocation becomes available. 4143 * 4144 * Note: we need to explicitly initialize %ecx here, since 4145 * function 4 may have been previously invoked. 4146 */ 4147 if (n == 4) 4148 cp->cp_ecx = 0; 4149 4150 (void) __cpuid_insn(cp); 4151 platform_cpuid_mangle(cpi->cpi_vendor, n, cp); 4152 switch (n) { 4153 case 2: 4154 /* 4155 * "the lower 8 bits of the %eax register 4156 * contain a value that identifies the number 4157 * of times the cpuid [instruction] has to be 4158 * executed to obtain a complete image of the 4159 * processor's caching systems." 4160 * 4161 * How *do* they make this stuff up? 4162 */ 4163 cpi->cpi_ncache = sizeof (*cp) * 4164 BITX(cp->cp_eax, 7, 0); 4165 if (cpi->cpi_ncache == 0) 4166 break; 4167 cpi->cpi_ncache--; /* skip count byte */ 4168 4169 /* 4170 * Well, for now, rather than attempt to implement 4171 * this slightly dubious algorithm, we just look 4172 * at the first 15 .. 4173 */ 4174 if (cpi->cpi_ncache > (sizeof (*cp) - 1)) 4175 cpi->cpi_ncache = sizeof (*cp) - 1; 4176 4177 dp = cpi->cpi_cacheinfo; 4178 if (BITX(cp->cp_eax, 31, 31) == 0) { 4179 uint8_t *p = (void *)&cp->cp_eax; 4180 for (i = 1; i < 4; i++) 4181 if (p[i] != 0) 4182 *dp++ = p[i]; 4183 } 4184 if (BITX(cp->cp_ebx, 31, 31) == 0) { 4185 uint8_t *p = (void *)&cp->cp_ebx; 4186 for (i = 0; i < 4; i++) 4187 if (p[i] != 0) 4188 *dp++ = p[i]; 4189 } 4190 if (BITX(cp->cp_ecx, 31, 31) == 0) { 4191 uint8_t *p = (void *)&cp->cp_ecx; 4192 for (i = 0; i < 4; i++) 4193 if (p[i] != 0) 4194 *dp++ = p[i]; 4195 } 4196 if (BITX(cp->cp_edx, 31, 31) == 0) { 4197 uint8_t *p = (void *)&cp->cp_edx; 4198 for (i = 0; i < 4; i++) 4199 if (p[i] != 0) 4200 *dp++ = p[i]; 4201 } 4202 break; 4203 4204 case 3: /* Processor serial number, if PSN supported */ 4205 break; 4206 4207 case 4: /* Deterministic cache parameters */ 4208 break; 4209 4210 case 5: /* Monitor/Mwait parameters */ 4211 { 4212 size_t mwait_size; 4213 4214 /* 4215 * check cpi_mwait.support which was set in cpuid_pass1 4216 */ 4217 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT)) 4218 break; 4219 4220 /* 4221 * Protect ourself from insane mwait line size. 4222 * Workaround for incomplete hardware emulator(s). 
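 *
 * (For example, a hypothetical emulator reporting a largest monitor
 * line size of 0 or of 48 bytes fails the "at least sizeof (uint32_t)
 * and a power of two" test below, and MWAIT support is simply left
 * unrecorded rather than trusted.)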
4223 */ 4224 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi); 4225 if (mwait_size < sizeof (uint32_t) || 4226 !ISP2(mwait_size)) { 4227 #if DEBUG 4228 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait " 4229 "size %ld", cpu->cpu_id, (long)mwait_size); 4230 #endif 4231 break; 4232 } 4233 4234 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi); 4235 cpi->cpi_mwait.mon_max = mwait_size; 4236 if (MWAIT_EXTENSION(cpi)) { 4237 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS; 4238 if (MWAIT_INT_ENABLE(cpi)) 4239 cpi->cpi_mwait.support |= 4240 MWAIT_ECX_INT_ENABLE; 4241 } 4242 break; 4243 } 4244 default: 4245 break; 4246 } 4247 } 4248 4249 /* 4250 * XSAVE enumeration 4251 */ 4252 if (cpi->cpi_maxeax >= 0xD) { 4253 struct cpuid_regs regs; 4254 boolean_t cpuid_d_valid = B_TRUE; 4255 4256 cp = &regs; 4257 cp->cp_eax = 0xD; 4258 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 4259 4260 (void) __cpuid_insn(cp); 4261 4262 /* 4263 * Sanity checks for debug 4264 */ 4265 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 || 4266 (cp->cp_eax & XFEATURE_SSE) == 0) { 4267 cpuid_d_valid = B_FALSE; 4268 } 4269 4270 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax; 4271 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx; 4272 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx; 4273 4274 /* 4275 * If the hw supports AVX, get the size and offset in the save 4276 * area for the ymm state. 4277 */ 4278 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) { 4279 cp->cp_eax = 0xD; 4280 cp->cp_ecx = 2; 4281 cp->cp_edx = cp->cp_ebx = 0; 4282 4283 (void) __cpuid_insn(cp); 4284 4285 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET || 4286 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) { 4287 cpuid_d_valid = B_FALSE; 4288 } 4289 4290 cpi->cpi_xsave.ymm_size = cp->cp_eax; 4291 cpi->cpi_xsave.ymm_offset = cp->cp_ebx; 4292 } 4293 4294 /* 4295 * If the hw supports MPX, get the size and offset in the 4296 * save area for BNDREGS and BNDCSR. 4297 */ 4298 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) { 4299 cp->cp_eax = 0xD; 4300 cp->cp_ecx = 3; 4301 cp->cp_edx = cp->cp_ebx = 0; 4302 4303 (void) __cpuid_insn(cp); 4304 4305 cpi->cpi_xsave.bndregs_size = cp->cp_eax; 4306 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx; 4307 4308 cp->cp_eax = 0xD; 4309 cp->cp_ecx = 4; 4310 cp->cp_edx = cp->cp_ebx = 0; 4311 4312 (void) __cpuid_insn(cp); 4313 4314 cpi->cpi_xsave.bndcsr_size = cp->cp_eax; 4315 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx; 4316 } 4317 4318 /* 4319 * If the hw supports AVX512, get the size and offset in the 4320 * save area for the opmask registers and zmm state.
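 *
 * The pattern repeated here and for the YMM and MPX components above
 * is, in sketch form (the component numbers follow the XFEATURE bit
 * positions):
 *
 *	cp->cp_eax = 0xD;
 *	cp->cp_ecx = component;		(2 = ymm, 3-4 = MPX, 5-7 = AVX512)
 *	cp->cp_edx = cp->cp_ebx = 0;
 *	(void) __cpuid_insn(cp);
 *	size = cp->cp_eax;		(bytes needed for the component)
 *	offset = cp->cp_ebx;		(offset of the component in the
 *					 XSAVE area)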
4321 */ 4322 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) { 4323 cp->cp_eax = 0xD; 4324 cp->cp_ecx = 5; 4325 cp->cp_edx = cp->cp_ebx = 0; 4326 4327 (void) __cpuid_insn(cp); 4328 4329 cpi->cpi_xsave.opmask_size = cp->cp_eax; 4330 cpi->cpi_xsave.opmask_offset = cp->cp_ebx; 4331 4332 cp->cp_eax = 0xD; 4333 cp->cp_ecx = 6; 4334 cp->cp_edx = cp->cp_ebx = 0; 4335 4336 (void) __cpuid_insn(cp); 4337 4338 cpi->cpi_xsave.zmmlo_size = cp->cp_eax; 4339 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx; 4340 4341 cp->cp_eax = 0xD; 4342 cp->cp_ecx = 7; 4343 cp->cp_edx = cp->cp_ebx = 0; 4344 4345 (void) __cpuid_insn(cp); 4346 4347 cpi->cpi_xsave.zmmhi_size = cp->cp_eax; 4348 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx; 4349 } 4350 4351 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) { 4352 xsave_state_size = 0; 4353 } else if (cpuid_d_valid) { 4354 xsave_state_size = cpi->cpi_xsave.xsav_max_size; 4355 } else { 4356 /* Broken CPUID 0xD, probably in HVM */ 4357 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid " 4358 "value: hw_low = %d, hw_high = %d, xsave_size = %d" 4359 ", ymm_size = %d, ymm_offset = %d\n", 4360 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low, 4361 cpi->cpi_xsave.xsav_hw_features_high, 4362 (int)cpi->cpi_xsave.xsav_max_size, 4363 (int)cpi->cpi_xsave.ymm_size, 4364 (int)cpi->cpi_xsave.ymm_offset); 4365 4366 if (xsave_state_size != 0) { 4367 /* 4368 * This must be a non-boot CPU. We cannot 4369 * continue, because boot cpu has already 4370 * enabled XSAVE. 4371 */ 4372 ASSERT(cpu->cpu_id != 0); 4373 cmn_err(CE_PANIC, "cpu%d: we have already " 4374 "enabled XSAVE on boot cpu, cannot " 4375 "continue.", cpu->cpu_id); 4376 } else { 4377 /* 4378 * If we reached here on the boot CPU, it's also 4379 * almost certain that we'll reach here on the 4380 * non-boot CPUs. When we're here on the boot CPU 4381 * we should disable the feature; on a non-boot 4382 * CPU we need to confirm that we already have.
4383 */ 4384 if (cpu->cpu_id == 0) { 4385 remove_x86_feature(x86_featureset, 4386 X86FSET_XSAVE); 4387 remove_x86_feature(x86_featureset, 4388 X86FSET_AVX); 4389 remove_x86_feature(x86_featureset, 4390 X86FSET_F16C); 4391 remove_x86_feature(x86_featureset, 4392 X86FSET_BMI1); 4393 remove_x86_feature(x86_featureset, 4394 X86FSET_BMI2); 4395 remove_x86_feature(x86_featureset, 4396 X86FSET_FMA); 4397 remove_x86_feature(x86_featureset, 4398 X86FSET_AVX2); 4399 remove_x86_feature(x86_featureset, 4400 X86FSET_MPX); 4401 remove_x86_feature(x86_featureset, 4402 X86FSET_AVX512F); 4403 remove_x86_feature(x86_featureset, 4404 X86FSET_AVX512DQ); 4405 remove_x86_feature(x86_featureset, 4406 X86FSET_AVX512PF); 4407 remove_x86_feature(x86_featureset, 4408 X86FSET_AVX512ER); 4409 remove_x86_feature(x86_featureset, 4410 X86FSET_AVX512CD); 4411 remove_x86_feature(x86_featureset, 4412 X86FSET_AVX512BW); 4413 remove_x86_feature(x86_featureset, 4414 X86FSET_AVX512VL); 4415 remove_x86_feature(x86_featureset, 4416 X86FSET_AVX512FMA); 4417 remove_x86_feature(x86_featureset, 4418 X86FSET_AVX512VBMI); 4419 remove_x86_feature(x86_featureset, 4420 X86FSET_AVX512VNNI); 4421 remove_x86_feature(x86_featureset, 4422 X86FSET_AVX512VPOPCDQ); 4423 remove_x86_feature(x86_featureset, 4424 X86FSET_AVX512NNIW); 4425 remove_x86_feature(x86_featureset, 4426 X86FSET_AVX512FMAPS); 4427 4428 CPI_FEATURES_ECX(cpi) &= 4429 ~CPUID_INTC_ECX_XSAVE; 4430 CPI_FEATURES_ECX(cpi) &= 4431 ~CPUID_INTC_ECX_AVX; 4432 CPI_FEATURES_ECX(cpi) &= 4433 ~CPUID_INTC_ECX_F16C; 4434 CPI_FEATURES_ECX(cpi) &= 4435 ~CPUID_INTC_ECX_FMA; 4436 CPI_FEATURES_7_0_EBX(cpi) &= 4437 ~CPUID_INTC_EBX_7_0_BMI1; 4438 CPI_FEATURES_7_0_EBX(cpi) &= 4439 ~CPUID_INTC_EBX_7_0_BMI2; 4440 CPI_FEATURES_7_0_EBX(cpi) &= 4441 ~CPUID_INTC_EBX_7_0_AVX2; 4442 CPI_FEATURES_7_0_EBX(cpi) &= 4443 ~CPUID_INTC_EBX_7_0_MPX; 4444 CPI_FEATURES_7_0_EBX(cpi) &= 4445 ~CPUID_INTC_EBX_7_0_ALL_AVX512; 4446 4447 CPI_FEATURES_7_0_ECX(cpi) &= 4448 ~CPUID_INTC_ECX_7_0_ALL_AVX512; 4449 4450 CPI_FEATURES_7_0_EDX(cpi) &= 4451 ~CPUID_INTC_EDX_7_0_ALL_AVX512; 4452 4453 xsave_force_disable = B_TRUE; 4454 } else { 4455 VERIFY(is_x86_feature(x86_featureset, 4456 X86FSET_XSAVE) == B_FALSE); 4457 } 4458 } 4459 } 4460 } 4461 4462 4463 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) 4464 goto pass2_done; 4465 4466 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD) 4467 nmax = NMAX_CPI_EXTD; 4468 /* 4469 * Copy the extended properties, fixing them as we go. 4470 * (We already handled n == 0 and n == 1 in pass 1) 4471 */ 4472 iptr = (void *)cpi->cpi_brandstr; 4473 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) { 4474 cp->cp_eax = CPUID_LEAF_EXT_0 + n; 4475 (void) __cpuid_insn(cp); 4476 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n, 4477 cp); 4478 switch (n) { 4479 case 2: 4480 case 3: 4481 case 4: 4482 /* 4483 * Extract the brand string 4484 */ 4485 *iptr++ = cp->cp_eax; 4486 *iptr++ = cp->cp_ebx; 4487 *iptr++ = cp->cp_ecx; 4488 *iptr++ = cp->cp_edx; 4489 break; 4490 case 5: 4491 switch (cpi->cpi_vendor) { 4492 case X86_VENDOR_AMD: 4493 /* 4494 * The Athlon and Duron were the first 4495 * parts to report the sizes of the 4496 * TLB for large pages. Before then, 4497 * we don't trust the data. 
4498 */ 4499 if (cpi->cpi_family < 6 || 4500 (cpi->cpi_family == 6 && 4501 cpi->cpi_model < 1)) 4502 cp->cp_eax = 0; 4503 break; 4504 default: 4505 break; 4506 } 4507 break; 4508 case 6: 4509 switch (cpi->cpi_vendor) { 4510 case X86_VENDOR_AMD: 4511 /* 4512 * The Athlon and Duron were the first 4513 * AMD parts with L2 TLB's. 4514 * Before then, don't trust the data. 4515 */ 4516 if (cpi->cpi_family < 6 || 4517 cpi->cpi_family == 6 && 4518 cpi->cpi_model < 1) 4519 cp->cp_eax = cp->cp_ebx = 0; 4520 /* 4521 * AMD Duron rev A0 reports L2 4522 * cache size incorrectly as 1K 4523 * when it is really 64K 4524 */ 4525 if (cpi->cpi_family == 6 && 4526 cpi->cpi_model == 3 && 4527 cpi->cpi_step == 0) { 4528 cp->cp_ecx &= 0xffff; 4529 cp->cp_ecx |= 0x400000; 4530 } 4531 break; 4532 case X86_VENDOR_Cyrix: /* VIA C3 */ 4533 /* 4534 * VIA C3 processors are a bit messed 4535 * up w.r.t. encoding cache sizes in %ecx 4536 */ 4537 if (cpi->cpi_family != 6) 4538 break; 4539 /* 4540 * model 7 and 8 were incorrectly encoded 4541 * 4542 * xxx is model 8 really broken? 4543 */ 4544 if (cpi->cpi_model == 7 || 4545 cpi->cpi_model == 8) 4546 cp->cp_ecx = 4547 BITX(cp->cp_ecx, 31, 24) << 16 | 4548 BITX(cp->cp_ecx, 23, 16) << 12 | 4549 BITX(cp->cp_ecx, 15, 8) << 8 | 4550 BITX(cp->cp_ecx, 7, 0); 4551 /* 4552 * model 9 stepping 1 has wrong associativity 4553 */ 4554 if (cpi->cpi_model == 9 && cpi->cpi_step == 1) 4555 cp->cp_ecx |= 8 << 12; 4556 break; 4557 case X86_VENDOR_Intel: 4558 /* 4559 * Extended L2 Cache features function. 4560 * First appeared on Prescott. 4561 */ 4562 default: 4563 break; 4564 } 4565 break; 4566 default: 4567 break; 4568 } 4569 } 4570 4571 pass2_done: 4572 cpi->cpi_pass = 2; 4573 } 4574 4575 static const char * 4576 intel_cpubrand(const struct cpuid_info *cpi) 4577 { 4578 int i; 4579 4580 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4581 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) 4582 return ("i486"); 4583 4584 switch (cpi->cpi_family) { 4585 case 5: 4586 return ("Intel Pentium(r)"); 4587 case 6: 4588 switch (cpi->cpi_model) { 4589 uint_t celeron, xeon; 4590 const struct cpuid_regs *cp; 4591 case 0: 4592 case 1: 4593 case 2: 4594 return ("Intel Pentium(r) Pro"); 4595 case 3: 4596 case 4: 4597 return ("Intel Pentium(r) II"); 4598 case 6: 4599 return ("Intel Celeron(r)"); 4600 case 5: 4601 case 7: 4602 celeron = xeon = 0; 4603 cp = &cpi->cpi_std[2]; /* cache info */ 4604 4605 for (i = 1; i < 4; i++) { 4606 uint_t tmp; 4607 4608 tmp = (cp->cp_eax >> (8 * i)) & 0xff; 4609 if (tmp == 0x40) 4610 celeron++; 4611 if (tmp >= 0x44 && tmp <= 0x45) 4612 xeon++; 4613 } 4614 4615 for (i = 0; i < 2; i++) { 4616 uint_t tmp; 4617 4618 tmp = (cp->cp_ebx >> (8 * i)) & 0xff; 4619 if (tmp == 0x40) 4620 celeron++; 4621 else if (tmp >= 0x44 && tmp <= 0x45) 4622 xeon++; 4623 } 4624 4625 for (i = 0; i < 4; i++) { 4626 uint_t tmp; 4627 4628 tmp = (cp->cp_ecx >> (8 * i)) & 0xff; 4629 if (tmp == 0x40) 4630 celeron++; 4631 else if (tmp >= 0x44 && tmp <= 0x45) 4632 xeon++; 4633 } 4634 4635 for (i = 0; i < 4; i++) { 4636 uint_t tmp; 4637 4638 tmp = (cp->cp_edx >> (8 * i)) & 0xff; 4639 if (tmp == 0x40) 4640 celeron++; 4641 else if (tmp >= 0x44 && tmp <= 0x45) 4642 xeon++; 4643 } 4644 4645 if (celeron) 4646 return ("Intel Celeron(r)"); 4647 if (xeon) 4648 return (cpi->cpi_model == 5 ? 4649 "Intel Pentium(r) II Xeon(tm)" : 4650 "Intel Pentium(r) III Xeon(tm)"); 4651 return (cpi->cpi_model == 5 ? 
4652 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" : 4653 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)"); 4654 default: 4655 break; 4656 } 4657 default: 4658 break; 4659 } 4660 4661 /* BrandID is present if the field is nonzero */ 4662 if (cpi->cpi_brandid != 0) { 4663 static const struct { 4664 uint_t bt_bid; 4665 const char *bt_str; 4666 } brand_tbl[] = { 4667 { 0x1, "Intel(r) Celeron(r)" }, 4668 { 0x2, "Intel(r) Pentium(r) III" }, 4669 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" }, 4670 { 0x4, "Intel(r) Pentium(r) III" }, 4671 { 0x6, "Mobile Intel(r) Pentium(r) III" }, 4672 { 0x7, "Mobile Intel(r) Celeron(r)" }, 4673 { 0x8, "Intel(r) Pentium(r) 4" }, 4674 { 0x9, "Intel(r) Pentium(r) 4" }, 4675 { 0xa, "Intel(r) Celeron(r)" }, 4676 { 0xb, "Intel(r) Xeon(tm)" }, 4677 { 0xc, "Intel(r) Xeon(tm) MP" }, 4678 { 0xe, "Mobile Intel(r) Pentium(r) 4" }, 4679 { 0xf, "Mobile Intel(r) Celeron(r)" }, 4680 { 0x11, "Mobile Genuine Intel(r)" }, 4681 { 0x12, "Intel(r) Celeron(r) M" }, 4682 { 0x13, "Mobile Intel(r) Celeron(r)" }, 4683 { 0x14, "Intel(r) Celeron(r)" }, 4684 { 0x15, "Mobile Genuine Intel(r)" }, 4685 { 0x16, "Intel(r) Pentium(r) M" }, 4686 { 0x17, "Mobile Intel(r) Celeron(r)" } 4687 }; 4688 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]); 4689 uint_t sgn; 4690 4691 sgn = (cpi->cpi_family << 8) | 4692 (cpi->cpi_model << 4) | cpi->cpi_step; 4693 4694 for (i = 0; i < btblmax; i++) 4695 if (brand_tbl[i].bt_bid == cpi->cpi_brandid) 4696 break; 4697 if (i < btblmax) { 4698 if (sgn == 0x6b1 && cpi->cpi_brandid == 3) 4699 return ("Intel(r) Celeron(r)"); 4700 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb) 4701 return ("Intel(r) Xeon(tm) MP"); 4702 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe) 4703 return ("Intel(r) Xeon(tm)"); 4704 return (brand_tbl[i].bt_str); 4705 } 4706 } 4707 4708 return (NULL); 4709 } 4710 4711 static const char * 4712 amd_cpubrand(const struct cpuid_info *cpi) 4713 { 4714 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4715 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) 4716 return ("i486 compatible"); 4717 4718 switch (cpi->cpi_family) { 4719 case 5: 4720 switch (cpi->cpi_model) { 4721 case 0: 4722 case 1: 4723 case 2: 4724 case 3: 4725 case 4: 4726 case 5: 4727 return ("AMD-K5(r)"); 4728 case 6: 4729 case 7: 4730 return ("AMD-K6(r)"); 4731 case 8: 4732 return ("AMD-K6(r)-2"); 4733 case 9: 4734 return ("AMD-K6(r)-III"); 4735 default: 4736 return ("AMD (family 5)"); 4737 } 4738 case 6: 4739 switch (cpi->cpi_model) { 4740 case 1: 4741 return ("AMD-K7(tm)"); 4742 case 0: 4743 case 2: 4744 case 4: 4745 return ("AMD Athlon(tm)"); 4746 case 3: 4747 case 7: 4748 return ("AMD Duron(tm)"); 4749 case 6: 4750 case 8: 4751 case 10: 4752 /* 4753 * Use the L2 cache size to distinguish 4754 */ 4755 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ? 
4756 "AMD Athlon(tm)" : "AMD Duron(tm)"); 4757 default: 4758 return ("AMD (family 6)"); 4759 } 4760 default: 4761 break; 4762 } 4763 4764 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 && 4765 cpi->cpi_brandid != 0) { 4766 switch (BITX(cpi->cpi_brandid, 7, 5)) { 4767 case 3: 4768 return ("AMD Opteron(tm) UP 1xx"); 4769 case 4: 4770 return ("AMD Opteron(tm) DP 2xx"); 4771 case 5: 4772 return ("AMD Opteron(tm) MP 8xx"); 4773 default: 4774 return ("AMD Opteron(tm)"); 4775 } 4776 } 4777 4778 return (NULL); 4779 } 4780 4781 static const char * 4782 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type) 4783 { 4784 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4785 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 || 4786 type == X86_TYPE_CYRIX_486) 4787 return ("i486 compatible"); 4788 4789 switch (type) { 4790 case X86_TYPE_CYRIX_6x86: 4791 return ("Cyrix 6x86"); 4792 case X86_TYPE_CYRIX_6x86L: 4793 return ("Cyrix 6x86L"); 4794 case X86_TYPE_CYRIX_6x86MX: 4795 return ("Cyrix 6x86MX"); 4796 case X86_TYPE_CYRIX_GXm: 4797 return ("Cyrix GXm"); 4798 case X86_TYPE_CYRIX_MediaGX: 4799 return ("Cyrix MediaGX"); 4800 case X86_TYPE_CYRIX_MII: 4801 return ("Cyrix M2"); 4802 case X86_TYPE_VIA_CYRIX_III: 4803 return ("VIA Cyrix M3"); 4804 default: 4805 /* 4806 * Have another wild guess .. 4807 */ 4808 if (cpi->cpi_family == 4 && cpi->cpi_model == 9) 4809 return ("Cyrix 5x86"); 4810 else if (cpi->cpi_family == 5) { 4811 switch (cpi->cpi_model) { 4812 case 2: 4813 return ("Cyrix 6x86"); /* Cyrix M1 */ 4814 case 4: 4815 return ("Cyrix MediaGX"); 4816 default: 4817 break; 4818 } 4819 } else if (cpi->cpi_family == 6) { 4820 switch (cpi->cpi_model) { 4821 case 0: 4822 return ("Cyrix 6x86MX"); /* Cyrix M2? */ 4823 case 5: 4824 case 6: 4825 case 7: 4826 case 8: 4827 case 9: 4828 return ("VIA C3"); 4829 default: 4830 break; 4831 } 4832 } 4833 break; 4834 } 4835 return (NULL); 4836 } 4837 4838 /* 4839 * This only gets called in the case that the CPU extended 4840 * feature brand string (0x80000002, 0x80000003, 0x80000004) 4841 * aren't available, or contain null bytes for some reason. 4842 */ 4843 static void 4844 fabricate_brandstr(struct cpuid_info *cpi) 4845 { 4846 const char *brand = NULL; 4847 4848 switch (cpi->cpi_vendor) { 4849 case X86_VENDOR_Intel: 4850 brand = intel_cpubrand(cpi); 4851 break; 4852 case X86_VENDOR_AMD: 4853 brand = amd_cpubrand(cpi); 4854 break; 4855 case X86_VENDOR_Cyrix: 4856 brand = cyrix_cpubrand(cpi, x86_type); 4857 break; 4858 case X86_VENDOR_NexGen: 4859 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 4860 brand = "NexGen Nx586"; 4861 break; 4862 case X86_VENDOR_Centaur: 4863 if (cpi->cpi_family == 5) 4864 switch (cpi->cpi_model) { 4865 case 4: 4866 brand = "Centaur C6"; 4867 break; 4868 case 8: 4869 brand = "Centaur C2"; 4870 break; 4871 case 9: 4872 brand = "Centaur C3"; 4873 break; 4874 default: 4875 break; 4876 } 4877 break; 4878 case X86_VENDOR_Rise: 4879 if (cpi->cpi_family == 5 && 4880 (cpi->cpi_model == 0 || cpi->cpi_model == 2)) 4881 brand = "Rise mP6"; 4882 break; 4883 case X86_VENDOR_SiS: 4884 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 4885 brand = "SiS 55x"; 4886 break; 4887 case X86_VENDOR_TM: 4888 if (cpi->cpi_family == 5 && cpi->cpi_model == 4) 4889 brand = "Transmeta Crusoe TM3x00 or TM5x00"; 4890 break; 4891 case X86_VENDOR_NSC: 4892 case X86_VENDOR_UMC: 4893 default: 4894 break; 4895 } 4896 if (brand) { 4897 (void) strcpy((char *)cpi->cpi_brandstr, brand); 4898 return; 4899 } 4900 4901 /* 4902 * If all else fails ... 
4903 */ 4904 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr), 4905 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family, 4906 cpi->cpi_model, cpi->cpi_step); 4907 } 4908 4909 /* 4910 * This routine is called just after kernel memory allocation 4911 * becomes available on cpu0, and as part of mp_startup() on 4912 * the other cpus. 4913 * 4914 * Fixup the brand string, and collect any information from cpuid 4915 * that requires dynamically allocated storage to represent. 4916 */ 4917 /*ARGSUSED*/ 4918 void 4919 cpuid_pass3(cpu_t *cpu) 4920 { 4921 int i, max, shft, level, size; 4922 struct cpuid_regs regs; 4923 struct cpuid_regs *cp; 4924 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 4925 4926 ASSERT(cpi->cpi_pass == 2); 4927 4928 /* 4929 * Deterministic cache parameters 4930 * 4931 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The 4932 * values that are present are currently defined to be the same. This 4933 * means we can use the same logic to parse it as long as we use the 4934 * appropriate leaf to get the data. If you're updating this, make sure 4935 * you're careful about which vendor supports which aspect. 4936 * 4937 * Take this opportunity to detect the number of threads sharing the 4938 * last level cache, and construct a corresponding cache id. The 4939 * respective cpuid_info members are initialized to the default case of 4940 * "no last level cache sharing". 4941 */ 4942 cpi->cpi_ncpu_shr_last_cache = 1; 4943 cpi->cpi_last_lvl_cacheid = cpu->cpu_id; 4944 4945 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) || 4946 (cpi->cpi_vendor == X86_VENDOR_AMD && 4947 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d && 4948 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) { 4949 uint32_t leaf; 4950 4951 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 4952 leaf = 4; 4953 } else { 4954 leaf = CPUID_LEAF_EXT_1d; 4955 } 4956 4957 /* 4958 * Find the # of elements (size) returned by the leaf and along 4959 * the way detect last level cache sharing details. 4960 */ 4961 bzero(®s, sizeof (regs)); 4962 cp = ®s; 4963 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) { 4964 cp->cp_eax = leaf; 4965 cp->cp_ecx = i; 4966 4967 (void) __cpuid_insn(cp); 4968 4969 if (CPI_CACHE_TYPE(cp) == 0) 4970 break; 4971 level = CPI_CACHE_LVL(cp); 4972 if (level > max) { 4973 max = level; 4974 cpi->cpi_ncpu_shr_last_cache = 4975 CPI_NTHR_SHR_CACHE(cp) + 1; 4976 } 4977 } 4978 cpi->cpi_cache_leaf_size = size = i; 4979 4980 /* 4981 * Allocate the cpi_cache_leaves array. The first element 4982 * references the regs for the corresponding leaf with %ecx set 4983 * to 0. This was gathered in cpuid_pass2(). 4984 */ 4985 if (size > 0) { 4986 cpi->cpi_cache_leaves = 4987 kmem_alloc(size * sizeof (cp), KM_SLEEP); 4988 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 4989 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4]; 4990 } else { 4991 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d]; 4992 } 4993 4994 /* 4995 * Allocate storage to hold the additional regs 4996 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size. 4997 * 4998 * The regs for the leaf, %ecx == 0 has already 4999 * been allocated as indicated above. 5000 */ 5001 for (i = 1; i < size; i++) { 5002 cp = cpi->cpi_cache_leaves[i] = 5003 kmem_zalloc(sizeof (regs), KM_SLEEP); 5004 cp->cp_eax = leaf; 5005 cp->cp_ecx = i; 5006 5007 (void) __cpuid_insn(cp); 5008 } 5009 } 5010 /* 5011 * Determine the number of bits needed to represent 5012 * the number of CPUs sharing the last level cache. 
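* (For example, with 8 CPUs sharing the last level cache, the loop below computes shft == 3; that count is only an illustrative value.)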
5013 * 5014 * Shift off that number of bits from the APIC id to 5015 * derive the cache id. 5016 */ 5017 shft = 0; 5018 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1) 5019 shft++; 5020 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft; 5021 } 5022 5023 /* 5024 * Now fixup the brand string 5025 */ 5026 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) { 5027 fabricate_brandstr(cpi); 5028 } else { 5029 5030 /* 5031 * If we successfully extracted a brand string from the cpuid 5032 * instruction, clean it up by removing leading spaces and 5033 * similar junk. 5034 */ 5035 if (cpi->cpi_brandstr[0]) { 5036 size_t maxlen = sizeof (cpi->cpi_brandstr); 5037 char *src, *dst; 5038 5039 dst = src = (char *)cpi->cpi_brandstr; 5040 src[maxlen - 1] = '\0'; 5041 /* 5042 * strip leading spaces 5043 */ 5044 while (*src == ' ') 5045 src++; 5046 /* 5047 * Remove any 'Genuine' or "Authentic" prefixes 5048 */ 5049 if (strncmp(src, "Genuine ", 8) == 0) 5050 src += 8; 5051 if (strncmp(src, "Authentic ", 10) == 0) 5052 src += 10; 5053 5054 /* 5055 * Now do an in-place copy. 5056 * Map (R) to (r) and (TM) to (tm). 5057 * The era of teletypes is long gone, and there's 5058 * -really- no need to shout. 5059 */ 5060 while (*src != '\0') { 5061 if (src[0] == '(') { 5062 if (strncmp(src + 1, "R)", 2) == 0) { 5063 (void) strncpy(dst, "(r)", 3); 5064 src += 3; 5065 dst += 3; 5066 continue; 5067 } 5068 if (strncmp(src + 1, "TM)", 3) == 0) { 5069 (void) strncpy(dst, "(tm)", 4); 5070 src += 4; 5071 dst += 4; 5072 continue; 5073 } 5074 } 5075 *dst++ = *src++; 5076 } 5077 *dst = '\0'; 5078 5079 /* 5080 * Finally, remove any trailing spaces 5081 */ 5082 while (--dst > cpi->cpi_brandstr) 5083 if (*dst == ' ') 5084 *dst = '\0'; 5085 else 5086 break; 5087 } else 5088 fabricate_brandstr(cpi); 5089 } 5090 cpi->cpi_pass = 3; 5091 } 5092 5093 /* 5094 * This routine is called out of bind_hwcap() much later in the life 5095 * of the kernel (post_startup()). The job of this routine is to resolve 5096 * the hardware feature support and kernel support for those features into 5097 * what we're actually going to tell applications via the aux vector. 
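* For example, AV_386_SSE2 only lands in hwcap_out[0] when the hardware reports SSE2 and the kernel support check below leaves the bit set.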
5098 */ 5099 void 5100 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out) 5101 { 5102 struct cpuid_info *cpi; 5103 uint_t hwcap_flags = 0, hwcap_flags_2 = 0; 5104 5105 if (cpu == NULL) 5106 cpu = CPU; 5107 cpi = cpu->cpu_m.mcpu_cpi; 5108 5109 ASSERT(cpi->cpi_pass == 3); 5110 5111 if (cpi->cpi_maxeax >= 1) { 5112 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES]; 5113 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES]; 5114 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES]; 5115 5116 *edx = CPI_FEATURES_EDX(cpi); 5117 *ecx = CPI_FEATURES_ECX(cpi); 5118 *ebx = CPI_FEATURES_7_0_EBX(cpi); 5119 5120 /* 5121 * [these require explicit kernel support] 5122 */ 5123 if (!is_x86_feature(x86_featureset, X86FSET_SEP)) 5124 *edx &= ~CPUID_INTC_EDX_SEP; 5125 5126 if (!is_x86_feature(x86_featureset, X86FSET_SSE)) 5127 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE); 5128 if (!is_x86_feature(x86_featureset, X86FSET_SSE2)) 5129 *edx &= ~CPUID_INTC_EDX_SSE2; 5130 5131 if (!is_x86_feature(x86_featureset, X86FSET_HTT)) 5132 *edx &= ~CPUID_INTC_EDX_HTT; 5133 5134 if (!is_x86_feature(x86_featureset, X86FSET_SSE3)) 5135 *ecx &= ~CPUID_INTC_ECX_SSE3; 5136 5137 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3)) 5138 *ecx &= ~CPUID_INTC_ECX_SSSE3; 5139 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1)) 5140 *ecx &= ~CPUID_INTC_ECX_SSE4_1; 5141 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2)) 5142 *ecx &= ~CPUID_INTC_ECX_SSE4_2; 5143 if (!is_x86_feature(x86_featureset, X86FSET_AES)) 5144 *ecx &= ~CPUID_INTC_ECX_AES; 5145 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ)) 5146 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ; 5147 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) 5148 *ecx &= ~(CPUID_INTC_ECX_XSAVE | 5149 CPUID_INTC_ECX_OSXSAVE); 5150 if (!is_x86_feature(x86_featureset, X86FSET_AVX)) 5151 *ecx &= ~CPUID_INTC_ECX_AVX; 5152 if (!is_x86_feature(x86_featureset, X86FSET_F16C)) 5153 *ecx &= ~CPUID_INTC_ECX_F16C; 5154 if (!is_x86_feature(x86_featureset, X86FSET_FMA)) 5155 *ecx &= ~CPUID_INTC_ECX_FMA; 5156 if (!is_x86_feature(x86_featureset, X86FSET_BMI1)) 5157 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 5158 if (!is_x86_feature(x86_featureset, X86FSET_BMI2)) 5159 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 5160 if (!is_x86_feature(x86_featureset, X86FSET_AVX2)) 5161 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 5162 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED)) 5163 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED; 5164 if (!is_x86_feature(x86_featureset, X86FSET_ADX)) 5165 *ebx &= ~CPUID_INTC_EBX_7_0_ADX; 5166 5167 /* 5168 * [no explicit support required beyond x87 fp context] 5169 */ 5170 if (!fpu_exists) 5171 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX); 5172 5173 /* 5174 * Now map the supported feature vector to things that we 5175 * think userland will care about. 
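* (Bits cleared above because the kernel lacks support therefore never show up in the aux vector.)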
5176 */ 5177 if (*edx & CPUID_INTC_EDX_SEP) 5178 hwcap_flags |= AV_386_SEP; 5179 if (*edx & CPUID_INTC_EDX_SSE) 5180 hwcap_flags |= AV_386_FXSR | AV_386_SSE; 5181 if (*edx & CPUID_INTC_EDX_SSE2) 5182 hwcap_flags |= AV_386_SSE2; 5183 if (*ecx & CPUID_INTC_ECX_SSE3) 5184 hwcap_flags |= AV_386_SSE3; 5185 if (*ecx & CPUID_INTC_ECX_SSSE3) 5186 hwcap_flags |= AV_386_SSSE3; 5187 if (*ecx & CPUID_INTC_ECX_SSE4_1) 5188 hwcap_flags |= AV_386_SSE4_1; 5189 if (*ecx & CPUID_INTC_ECX_SSE4_2) 5190 hwcap_flags |= AV_386_SSE4_2; 5191 if (*ecx & CPUID_INTC_ECX_MOVBE) 5192 hwcap_flags |= AV_386_MOVBE; 5193 if (*ecx & CPUID_INTC_ECX_AES) 5194 hwcap_flags |= AV_386_AES; 5195 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ) 5196 hwcap_flags |= AV_386_PCLMULQDQ; 5197 if ((*ecx & CPUID_INTC_ECX_XSAVE) && 5198 (*ecx & CPUID_INTC_ECX_OSXSAVE)) { 5199 hwcap_flags |= AV_386_XSAVE; 5200 5201 if (*ecx & CPUID_INTC_ECX_AVX) { 5202 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi); 5203 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi); 5204 5205 hwcap_flags |= AV_386_AVX; 5206 if (*ecx & CPUID_INTC_ECX_F16C) 5207 hwcap_flags_2 |= AV_386_2_F16C; 5208 if (*ecx & CPUID_INTC_ECX_FMA) 5209 hwcap_flags_2 |= AV_386_2_FMA; 5210 5211 if (*ebx & CPUID_INTC_EBX_7_0_BMI1) 5212 hwcap_flags_2 |= AV_386_2_BMI1; 5213 if (*ebx & CPUID_INTC_EBX_7_0_BMI2) 5214 hwcap_flags_2 |= AV_386_2_BMI2; 5215 if (*ebx & CPUID_INTC_EBX_7_0_AVX2) 5216 hwcap_flags_2 |= AV_386_2_AVX2; 5217 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F) 5218 hwcap_flags_2 |= AV_386_2_AVX512F; 5219 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ) 5220 hwcap_flags_2 |= AV_386_2_AVX512DQ; 5221 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA) 5222 hwcap_flags_2 |= AV_386_2_AVX512IFMA; 5223 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF) 5224 hwcap_flags_2 |= AV_386_2_AVX512PF; 5225 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER) 5226 hwcap_flags_2 |= AV_386_2_AVX512ER; 5227 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD) 5228 hwcap_flags_2 |= AV_386_2_AVX512CD; 5229 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW) 5230 hwcap_flags_2 |= AV_386_2_AVX512BW; 5231 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL) 5232 hwcap_flags_2 |= AV_386_2_AVX512VL; 5233 5234 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI) 5235 hwcap_flags_2 |= AV_386_2_AVX512VBMI; 5236 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI) 5237 hwcap_flags_2 |= AV_386_2_AVX512_VNNI; 5238 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 5239 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ; 5240 5241 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW) 5242 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW; 5243 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS) 5244 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS; 5245 } 5246 } 5247 if (*ecx & CPUID_INTC_ECX_VMX) 5248 hwcap_flags |= AV_386_VMX; 5249 if (*ecx & CPUID_INTC_ECX_POPCNT) 5250 hwcap_flags |= AV_386_POPCNT; 5251 if (*edx & CPUID_INTC_EDX_FPU) 5252 hwcap_flags |= AV_386_FPU; 5253 if (*edx & CPUID_INTC_EDX_MMX) 5254 hwcap_flags |= AV_386_MMX; 5255 5256 if (*edx & CPUID_INTC_EDX_TSC) 5257 hwcap_flags |= AV_386_TSC; 5258 if (*edx & CPUID_INTC_EDX_CX8) 5259 hwcap_flags |= AV_386_CX8; 5260 if (*edx & CPUID_INTC_EDX_CMOV) 5261 hwcap_flags |= AV_386_CMOV; 5262 if (*ecx & CPUID_INTC_ECX_CX16) 5263 hwcap_flags |= AV_386_CX16; 5264 5265 if (*ecx & CPUID_INTC_ECX_RDRAND) 5266 hwcap_flags_2 |= AV_386_2_RDRAND; 5267 if (*ebx & CPUID_INTC_EBX_7_0_ADX) 5268 hwcap_flags_2 |= AV_386_2_ADX; 5269 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED) 5270 hwcap_flags_2 |= AV_386_2_RDSEED; 5271 if (*ebx & CPUID_INTC_EBX_7_0_SHA) 5272 hwcap_flags_2 |= AV_386_2_SHA; 5273 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 5274 hwcap_flags_2 |= 
AV_386_2_FSGSBASE; 5275 if (*ebx & CPUID_INTC_EBX_7_0_CLWB) 5276 hwcap_flags_2 |= AV_386_2_CLWB; 5277 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 5278 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT; 5279 5280 } 5281 /* 5282 * Check a few miscellaneous features. 5283 */ 5284 if (is_x86_feature(x86_featureset, X86FSET_CLZERO)) 5285 hwcap_flags_2 |= AV_386_2_CLZERO; 5286 5287 if (cpi->cpi_xmaxeax < 0x80000001) 5288 goto pass4_done; 5289 5290 switch (cpi->cpi_vendor) { 5291 struct cpuid_regs cp; 5292 uint32_t *edx, *ecx; 5293 5294 case X86_VENDOR_Intel: 5295 /* 5296 * Seems like Intel duplicated what was necessary 5297 * here to make the initial crop of 64-bit OSes work. 5298 * Hopefully, those are the only "extended" bits 5299 * they'll add. 5300 */ 5301 /*FALLTHROUGH*/ 5302 5303 case X86_VENDOR_AMD: 5304 edx = &cpi->cpi_support[AMD_EDX_FEATURES]; 5305 ecx = &cpi->cpi_support[AMD_ECX_FEATURES]; 5306 5307 *edx = CPI_FEATURES_XTD_EDX(cpi); 5308 *ecx = CPI_FEATURES_XTD_ECX(cpi); 5309 5310 /* 5311 * [these features require explicit kernel support] 5312 */ 5313 switch (cpi->cpi_vendor) { 5314 case X86_VENDOR_Intel: 5315 if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) 5316 *edx &= ~CPUID_AMD_EDX_TSCP; 5317 break; 5318 5319 case X86_VENDOR_AMD: 5320 if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) 5321 *edx &= ~CPUID_AMD_EDX_TSCP; 5322 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A)) 5323 *ecx &= ~CPUID_AMD_ECX_SSE4A; 5324 break; 5325 5326 default: 5327 break; 5328 } 5329 5330 /* 5331 * [no explicit support required beyond 5332 * x87 fp context and exception handlers] 5333 */ 5334 if (!fpu_exists) 5335 *edx &= ~(CPUID_AMD_EDX_MMXamd | 5336 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx); 5337 5338 if (!is_x86_feature(x86_featureset, X86FSET_NX)) 5339 *edx &= ~CPUID_AMD_EDX_NX; 5340 #if !defined(__amd64) 5341 *edx &= ~CPUID_AMD_EDX_LM; 5342 #endif 5343 /* 5344 * Now map the supported feature vector to 5345 * things that we think userland will care about. 5346 */ 5347 #if defined(__amd64) 5348 if (*edx & CPUID_AMD_EDX_SYSC) 5349 hwcap_flags |= AV_386_AMD_SYSC; 5350 #endif 5351 if (*edx & CPUID_AMD_EDX_MMXamd) 5352 hwcap_flags |= AV_386_AMD_MMX; 5353 if (*edx & CPUID_AMD_EDX_3DNow) 5354 hwcap_flags |= AV_386_AMD_3DNow; 5355 if (*edx & CPUID_AMD_EDX_3DNowx) 5356 hwcap_flags |= AV_386_AMD_3DNowx; 5357 if (*ecx & CPUID_AMD_ECX_SVM) 5358 hwcap_flags |= AV_386_AMD_SVM; 5359 5360 switch (cpi->cpi_vendor) { 5361 case X86_VENDOR_AMD: 5362 if (*edx & CPUID_AMD_EDX_TSCP) 5363 hwcap_flags |= AV_386_TSCP; 5364 if (*ecx & CPUID_AMD_ECX_AHF64) 5365 hwcap_flags |= AV_386_AHF; 5366 if (*ecx & CPUID_AMD_ECX_SSE4A) 5367 hwcap_flags |= AV_386_AMD_SSE4A; 5368 if (*ecx & CPUID_AMD_ECX_LZCNT) 5369 hwcap_flags |= AV_386_AMD_LZCNT; 5370 if (*ecx & CPUID_AMD_ECX_MONITORX) 5371 hwcap_flags_2 |= AV_386_2_MONITORX; 5372 break; 5373 5374 case X86_VENDOR_Intel: 5375 if (*edx & CPUID_AMD_EDX_TSCP) 5376 hwcap_flags |= AV_386_TSCP; 5377 if (*ecx & CPUID_AMD_ECX_LZCNT) 5378 hwcap_flags |= AV_386_AMD_LZCNT; 5379 /* 5380 * Aarrgh. 5381 * Intel uses a different bit in the same word.
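* Either way the net effect is the same AV_386_AHF capability (i.e., LAHF/SAHF availability in 64-bit mode).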
5382 */ 5383 if (*ecx & CPUID_INTC_ECX_AHF64) 5384 hwcap_flags |= AV_386_AHF; 5385 break; 5386 5387 default: 5388 break; 5389 } 5390 break; 5391 5392 case X86_VENDOR_TM: 5393 cp.cp_eax = 0x80860001; 5394 (void) __cpuid_insn(&cp); 5395 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx; 5396 break; 5397 5398 default: 5399 break; 5400 } 5401 5402 pass4_done: 5403 cpi->cpi_pass = 4; 5404 if (hwcap_out != NULL) { 5405 hwcap_out[0] = hwcap_flags; 5406 hwcap_out[1] = hwcap_flags_2; 5407 } 5408 } 5409 5410 5411 /* 5412 * Simulate the cpuid instruction using the data we previously 5413 * captured about this CPU. We try our best to return the truth 5414 * about the hardware, independently of kernel support. 5415 */ 5416 uint32_t 5417 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp) 5418 { 5419 struct cpuid_info *cpi; 5420 struct cpuid_regs *xcp; 5421 5422 if (cpu == NULL) 5423 cpu = CPU; 5424 cpi = cpu->cpu_m.mcpu_cpi; 5425 5426 ASSERT(cpuid_checkpass(cpu, 3)); 5427 5428 /* 5429 * CPUID data is cached in two separate places: cpi_std for standard 5430 * CPUID leaves , and cpi_extd for extended CPUID leaves. 5431 */ 5432 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) { 5433 xcp = &cpi->cpi_std[cp->cp_eax]; 5434 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 && 5435 cp->cp_eax <= cpi->cpi_xmaxeax && 5436 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) { 5437 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0]; 5438 } else { 5439 /* 5440 * The caller is asking for data from an input parameter which 5441 * the kernel has not cached. In this case we go fetch from 5442 * the hardware and return the data directly to the user. 5443 */ 5444 return (__cpuid_insn(cp)); 5445 } 5446 5447 cp->cp_eax = xcp->cp_eax; 5448 cp->cp_ebx = xcp->cp_ebx; 5449 cp->cp_ecx = xcp->cp_ecx; 5450 cp->cp_edx = xcp->cp_edx; 5451 return (cp->cp_eax); 5452 } 5453 5454 int 5455 cpuid_checkpass(cpu_t *cpu, int pass) 5456 { 5457 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL && 5458 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass); 5459 } 5460 5461 int 5462 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n) 5463 { 5464 ASSERT(cpuid_checkpass(cpu, 3)); 5465 5466 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr)); 5467 } 5468 5469 int 5470 cpuid_is_cmt(cpu_t *cpu) 5471 { 5472 if (cpu == NULL) 5473 cpu = CPU; 5474 5475 ASSERT(cpuid_checkpass(cpu, 1)); 5476 5477 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0); 5478 } 5479 5480 /* 5481 * AMD and Intel both implement the 64-bit variant of the syscall 5482 * instruction (syscallq), so if there's -any- support for syscall, 5483 * cpuid currently says "yes, we support this". 5484 * 5485 * However, Intel decided to -not- implement the 32-bit variant of the 5486 * syscall instruction, so we provide a predicate to allow our caller 5487 * to test that subtlety here. 5488 * 5489 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor, 5490 * even in the case where the hardware would in fact support it. 5491 */ 5492 /*ARGSUSED*/ 5493 int 5494 cpuid_syscall32_insn(cpu_t *cpu) 5495 { 5496 ASSERT(cpuid_checkpass((cpu == NULL ? 
CPU : cpu), 1)); 5497 5498 #if !defined(__xpv) 5499 if (cpu == NULL) 5500 cpu = CPU; 5501 5502 /*CSTYLED*/ 5503 { 5504 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5505 5506 if (cpi->cpi_vendor == X86_VENDOR_AMD && 5507 cpi->cpi_xmaxeax >= 0x80000001 && 5508 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC)) 5509 return (1); 5510 } 5511 #endif 5512 return (0); 5513 } 5514 5515 int 5516 cpuid_getidstr(cpu_t *cpu, char *s, size_t n) 5517 { 5518 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5519 5520 static const char fmt[] = 5521 "x86 (%s %X family %d model %d step %d clock %d MHz)"; 5522 static const char fmt_ht[] = 5523 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)"; 5524 5525 ASSERT(cpuid_checkpass(cpu, 1)); 5526 5527 if (cpuid_is_cmt(cpu)) 5528 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid, 5529 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5530 cpi->cpi_family, cpi->cpi_model, 5531 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5532 return (snprintf(s, n, fmt, 5533 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5534 cpi->cpi_family, cpi->cpi_model, 5535 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5536 } 5537 5538 const char * 5539 cpuid_getvendorstr(cpu_t *cpu) 5540 { 5541 ASSERT(cpuid_checkpass(cpu, 1)); 5542 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr); 5543 } 5544 5545 uint_t 5546 cpuid_getvendor(cpu_t *cpu) 5547 { 5548 ASSERT(cpuid_checkpass(cpu, 1)); 5549 return (cpu->cpu_m.mcpu_cpi->cpi_vendor); 5550 } 5551 5552 uint_t 5553 cpuid_getfamily(cpu_t *cpu) 5554 { 5555 ASSERT(cpuid_checkpass(cpu, 1)); 5556 return (cpu->cpu_m.mcpu_cpi->cpi_family); 5557 } 5558 5559 uint_t 5560 cpuid_getmodel(cpu_t *cpu) 5561 { 5562 ASSERT(cpuid_checkpass(cpu, 1)); 5563 return (cpu->cpu_m.mcpu_cpi->cpi_model); 5564 } 5565 5566 uint_t 5567 cpuid_get_ncpu_per_chip(cpu_t *cpu) 5568 { 5569 ASSERT(cpuid_checkpass(cpu, 1)); 5570 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip); 5571 } 5572 5573 uint_t 5574 cpuid_get_ncore_per_chip(cpu_t *cpu) 5575 { 5576 ASSERT(cpuid_checkpass(cpu, 1)); 5577 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip); 5578 } 5579 5580 uint_t 5581 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu) 5582 { 5583 ASSERT(cpuid_checkpass(cpu, 2)); 5584 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache); 5585 } 5586 5587 id_t 5588 cpuid_get_last_lvl_cacheid(cpu_t *cpu) 5589 { 5590 ASSERT(cpuid_checkpass(cpu, 2)); 5591 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5592 } 5593 5594 uint_t 5595 cpuid_getstep(cpu_t *cpu) 5596 { 5597 ASSERT(cpuid_checkpass(cpu, 1)); 5598 return (cpu->cpu_m.mcpu_cpi->cpi_step); 5599 } 5600 5601 uint_t 5602 cpuid_getsig(struct cpu *cpu) 5603 { 5604 ASSERT(cpuid_checkpass(cpu, 1)); 5605 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax); 5606 } 5607 5608 uint32_t 5609 cpuid_getchiprev(struct cpu *cpu) 5610 { 5611 ASSERT(cpuid_checkpass(cpu, 1)); 5612 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev); 5613 } 5614 5615 const char * 5616 cpuid_getchiprevstr(struct cpu *cpu) 5617 { 5618 ASSERT(cpuid_checkpass(cpu, 1)); 5619 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr); 5620 } 5621 5622 uint32_t 5623 cpuid_getsockettype(struct cpu *cpu) 5624 { 5625 ASSERT(cpuid_checkpass(cpu, 1)); 5626 return (cpu->cpu_m.mcpu_cpi->cpi_socket); 5627 } 5628 5629 const char * 5630 cpuid_getsocketstr(cpu_t *cpu) 5631 { 5632 static const char *socketstr = NULL; 5633 struct cpuid_info *cpi; 5634 5635 ASSERT(cpuid_checkpass(cpu, 1)); 5636 cpi = cpu->cpu_m.mcpu_cpi; 5637 5638 /* Assume that socket types are the same across the system */ 5639 if (socketstr == NULL) 5640 socketstr 
= _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family, 5641 cpi->cpi_model, cpi->cpi_step); 5642 5643 5644 return (socketstr); 5645 } 5646 5647 int 5648 cpuid_get_chipid(cpu_t *cpu) 5649 { 5650 ASSERT(cpuid_checkpass(cpu, 1)); 5651 5652 if (cpuid_is_cmt(cpu)) 5653 return (cpu->cpu_m.mcpu_cpi->cpi_chipid); 5654 return (cpu->cpu_id); 5655 } 5656 5657 id_t 5658 cpuid_get_coreid(cpu_t *cpu) 5659 { 5660 ASSERT(cpuid_checkpass(cpu, 1)); 5661 return (cpu->cpu_m.mcpu_cpi->cpi_coreid); 5662 } 5663 5664 int 5665 cpuid_get_pkgcoreid(cpu_t *cpu) 5666 { 5667 ASSERT(cpuid_checkpass(cpu, 1)); 5668 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid); 5669 } 5670 5671 int 5672 cpuid_get_clogid(cpu_t *cpu) 5673 { 5674 ASSERT(cpuid_checkpass(cpu, 1)); 5675 return (cpu->cpu_m.mcpu_cpi->cpi_clogid); 5676 } 5677 5678 int 5679 cpuid_get_cacheid(cpu_t *cpu) 5680 { 5681 ASSERT(cpuid_checkpass(cpu, 1)); 5682 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5683 } 5684 5685 uint_t 5686 cpuid_get_procnodeid(cpu_t *cpu) 5687 { 5688 ASSERT(cpuid_checkpass(cpu, 1)); 5689 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid); 5690 } 5691 5692 uint_t 5693 cpuid_get_procnodes_per_pkg(cpu_t *cpu) 5694 { 5695 ASSERT(cpuid_checkpass(cpu, 1)); 5696 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg); 5697 } 5698 5699 uint_t 5700 cpuid_get_compunitid(cpu_t *cpu) 5701 { 5702 ASSERT(cpuid_checkpass(cpu, 1)); 5703 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid); 5704 } 5705 5706 uint_t 5707 cpuid_get_cores_per_compunit(cpu_t *cpu) 5708 { 5709 ASSERT(cpuid_checkpass(cpu, 1)); 5710 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit); 5711 } 5712 5713 /*ARGSUSED*/ 5714 int 5715 cpuid_have_cr8access(cpu_t *cpu) 5716 { 5717 #if defined(__amd64) 5718 return (1); 5719 #else 5720 struct cpuid_info *cpi; 5721 5722 ASSERT(cpu != NULL); 5723 cpi = cpu->cpu_m.mcpu_cpi; 5724 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 && 5725 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0) 5726 return (1); 5727 return (0); 5728 #endif 5729 } 5730 5731 uint32_t 5732 cpuid_get_apicid(cpu_t *cpu) 5733 { 5734 ASSERT(cpuid_checkpass(cpu, 1)); 5735 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) { 5736 return (UINT32_MAX); 5737 } else { 5738 return (cpu->cpu_m.mcpu_cpi->cpi_apicid); 5739 } 5740 } 5741 5742 void 5743 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits) 5744 { 5745 struct cpuid_info *cpi; 5746 5747 if (cpu == NULL) 5748 cpu = CPU; 5749 cpi = cpu->cpu_m.mcpu_cpi; 5750 5751 ASSERT(cpuid_checkpass(cpu, 1)); 5752 5753 if (pabits) 5754 *pabits = cpi->cpi_pabits; 5755 if (vabits) 5756 *vabits = cpi->cpi_vabits; 5757 } 5758 5759 size_t 5760 cpuid_get_xsave_size() 5761 { 5762 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size, 5763 sizeof (struct xsave_state))); 5764 } 5765 5766 /* 5767 * Return true if the CPUs on this system require 'pointer clearing' for the 5768 * floating point error pointer exception handling. In the past, this has been 5769 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to 5770 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO 5771 * feature bit and is reflected in the cpi_fp_amd_save member. 5772 */ 5773 boolean_t 5774 cpuid_need_fp_excp_handling() 5775 { 5776 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD && 5777 cpuid_info0.cpi_fp_amd_save != 0); 5778 } 5779 5780 /* 5781 * Returns the number of data TLB entries for a corresponding 5782 * pagesize. If it can't be computed, or isn't known, the 5783 * routine returns zero. 
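* (Only the 4K and 2M page sizes are looked up below.)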
If you ask about an architecturally 5784 * impossible pagesize, the routine will panic (so that the 5785 * hat implementor knows that things are inconsistent.) 5786 */ 5787 uint_t 5788 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize) 5789 { 5790 struct cpuid_info *cpi; 5791 uint_t dtlb_nent = 0; 5792 5793 if (cpu == NULL) 5794 cpu = CPU; 5795 cpi = cpu->cpu_m.mcpu_cpi; 5796 5797 ASSERT(cpuid_checkpass(cpu, 1)); 5798 5799 /* 5800 * Check the L2 TLB info 5801 */ 5802 if (cpi->cpi_xmaxeax >= 0x80000006) { 5803 struct cpuid_regs *cp = &cpi->cpi_extd[6]; 5804 5805 switch (pagesize) { 5806 5807 case 4 * 1024: 5808 /* 5809 * All zero in the top 16 bits of the register 5810 * indicates a unified TLB. Size is in low 16 bits. 5811 */ 5812 if ((cp->cp_ebx & 0xffff0000) == 0) 5813 dtlb_nent = cp->cp_ebx & 0x0000ffff; 5814 else 5815 dtlb_nent = BITX(cp->cp_ebx, 27, 16); 5816 break; 5817 5818 case 2 * 1024 * 1024: 5819 if ((cp->cp_eax & 0xffff0000) == 0) 5820 dtlb_nent = cp->cp_eax & 0x0000ffff; 5821 else 5822 dtlb_nent = BITX(cp->cp_eax, 27, 16); 5823 break; 5824 5825 default: 5826 panic("unknown L2 pagesize"); 5827 /*NOTREACHED*/ 5828 } 5829 } 5830 5831 if (dtlb_nent != 0) 5832 return (dtlb_nent); 5833 5834 /* 5835 * No L2 TLB support for this size, try L1. 5836 */ 5837 if (cpi->cpi_xmaxeax >= 0x80000005) { 5838 struct cpuid_regs *cp = &cpi->cpi_extd[5]; 5839 5840 switch (pagesize) { 5841 case 4 * 1024: 5842 dtlb_nent = BITX(cp->cp_ebx, 23, 16); 5843 break; 5844 case 2 * 1024 * 1024: 5845 dtlb_nent = BITX(cp->cp_eax, 23, 16); 5846 break; 5847 default: 5848 panic("unknown L1 d-TLB pagesize"); 5849 /*NOTREACHED*/ 5850 } 5851 } 5852 5853 return (dtlb_nent); 5854 } 5855 5856 /* 5857 * Return 0 if the erratum is not present or not applicable, positive 5858 * if it is, and negative if the status of the erratum is unknown. 5859 * 5860 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm) 5861 * Processors" #25759, Rev 3.57, August 2005 5862 */ 5863 int 5864 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum) 5865 { 5866 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5867 uint_t eax; 5868 5869 /* 5870 * Bail out if this CPU isn't an AMD CPU, or if it's 5871 * a legacy (32-bit) AMD CPU. 
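* The checks below key off the raw family/model/stepping signature from cpi_std[1].cp_eax via the SH_*/DH_*/etc. macros.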
5872 */ 5873 if (cpi->cpi_vendor != X86_VENDOR_AMD || 5874 cpi->cpi_family == 4 || cpi->cpi_family == 5 || 5875 cpi->cpi_family == 6) { 5876 return (0); 5877 } 5878 5879 eax = cpi->cpi_std[1].cp_eax; 5880 5881 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50) 5882 #define SH_B3(eax) (eax == 0xf51) 5883 #define B(eax) (SH_B0(eax) || SH_B3(eax)) 5884 5885 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58) 5886 5887 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a) 5888 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0) 5889 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2) 5890 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax)) 5891 5892 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70) 5893 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0) 5894 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0) 5895 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax)) 5896 5897 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70) 5898 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */ 5899 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0) 5900 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71) 5901 #define BH_E4(eax) (eax == 0x20fb1) 5902 #define SH_E5(eax) (eax == 0x20f42) 5903 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2) 5904 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32) 5905 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \ 5906 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \ 5907 DH_E6(eax) || JH_E6(eax)) 5908 5909 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02) 5910 #define DR_B0(eax) (eax == 0x100f20) 5911 #define DR_B1(eax) (eax == 0x100f21) 5912 #define DR_BA(eax) (eax == 0x100f2a) 5913 #define DR_B2(eax) (eax == 0x100f22) 5914 #define DR_B3(eax) (eax == 0x100f23) 5915 #define RB_C0(eax) (eax == 0x100f40) 5916 5917 switch (erratum) { 5918 case 1: 5919 return (cpi->cpi_family < 0x10); 5920 case 51: /* what does the asterisk mean? 
*/ 5921 return (B(eax) || SH_C0(eax) || CG(eax)); 5922 case 52: 5923 return (B(eax)); 5924 case 57: 5925 return (cpi->cpi_family <= 0x11); 5926 case 58: 5927 return (B(eax)); 5928 case 60: 5929 return (cpi->cpi_family <= 0x11); 5930 case 61: 5931 case 62: 5932 case 63: 5933 case 64: 5934 case 65: 5935 case 66: 5936 case 68: 5937 case 69: 5938 case 70: 5939 case 71: 5940 return (B(eax)); 5941 case 72: 5942 return (SH_B0(eax)); 5943 case 74: 5944 return (B(eax)); 5945 case 75: 5946 return (cpi->cpi_family < 0x10); 5947 case 76: 5948 return (B(eax)); 5949 case 77: 5950 return (cpi->cpi_family <= 0x11); 5951 case 78: 5952 return (B(eax) || SH_C0(eax)); 5953 case 79: 5954 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 5955 case 80: 5956 case 81: 5957 case 82: 5958 return (B(eax)); 5959 case 83: 5960 return (B(eax) || SH_C0(eax) || CG(eax)); 5961 case 85: 5962 return (cpi->cpi_family < 0x10); 5963 case 86: 5964 return (SH_C0(eax) || CG(eax)); 5965 case 88: 5966 #if !defined(__amd64) 5967 return (0); 5968 #else 5969 return (B(eax) || SH_C0(eax)); 5970 #endif 5971 case 89: 5972 return (cpi->cpi_family < 0x10); 5973 case 90: 5974 return (B(eax) || SH_C0(eax) || CG(eax)); 5975 case 91: 5976 case 92: 5977 return (B(eax) || SH_C0(eax)); 5978 case 93: 5979 return (SH_C0(eax)); 5980 case 94: 5981 return (B(eax) || SH_C0(eax) || CG(eax)); 5982 case 95: 5983 #if !defined(__amd64) 5984 return (0); 5985 #else 5986 return (B(eax) || SH_C0(eax)); 5987 #endif 5988 case 96: 5989 return (B(eax) || SH_C0(eax) || CG(eax)); 5990 case 97: 5991 case 98: 5992 return (SH_C0(eax) || CG(eax)); 5993 case 99: 5994 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5995 case 100: 5996 return (B(eax) || SH_C0(eax)); 5997 case 101: 5998 case 103: 5999 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6000 case 104: 6001 return (SH_C0(eax) || CG(eax) || D0(eax)); 6002 case 105: 6003 case 106: 6004 case 107: 6005 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6006 case 108: 6007 return (DH_CG(eax)); 6008 case 109: 6009 return (SH_C0(eax) || CG(eax) || D0(eax)); 6010 case 110: 6011 return (D0(eax) || EX(eax)); 6012 case 111: 6013 return (CG(eax)); 6014 case 112: 6015 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 6016 case 113: 6017 return (eax == 0x20fc0); 6018 case 114: 6019 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 6020 case 115: 6021 return (SH_E0(eax) || JH_E1(eax)); 6022 case 116: 6023 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 6024 case 117: 6025 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6026 case 118: 6027 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) || 6028 JH_E6(eax)); 6029 case 121: 6030 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 6031 case 122: 6032 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11); 6033 case 123: 6034 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax)); 6035 case 131: 6036 return (cpi->cpi_family < 0x10); 6037 case 6336786: 6038 6039 /* 6040 * Test for AdvPowerMgmtInfo.TscPStateInvariant 6041 * if this is a K8 family or newer processor. We're testing for 6042 * this 'erratum' to determine whether or not we have a constant 6043 * TSC. 6044 * 6045 * Our current fix for this is to disable the C1-Clock ramping. 6046 * However, this doesn't work on newer processor families nor 6047 * does it work when virtualized as those devices don't exist. 
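* The TscPStateInvariant test below reads leaf 0x80000007 and checks %edx bit 8 (the 0x100 mask).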
6048 */ 6049 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) { 6050 return (0); 6051 } 6052 6053 if (CPI_FAMILY(cpi) == 0xf) { 6054 struct cpuid_regs regs; 6055 regs.cp_eax = 0x80000007; 6056 (void) __cpuid_insn(®s); 6057 return (!(regs.cp_edx & 0x100)); 6058 } 6059 return (0); 6060 case 6323525: 6061 /* 6062 * This erratum (K8 #147) is not present on family 10 and newer. 6063 */ 6064 if (cpi->cpi_family >= 0x10) { 6065 return (0); 6066 } 6067 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) | 6068 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40); 6069 6070 case 6671130: 6071 /* 6072 * check for processors (pre-Shanghai) that do not provide 6073 * optimal management of 1gb ptes in its tlb. 6074 */ 6075 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4); 6076 6077 case 298: 6078 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) || 6079 DR_B2(eax) || RB_C0(eax)); 6080 6081 case 721: 6082 #if defined(__amd64) 6083 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12); 6084 #else 6085 return (0); 6086 #endif 6087 6088 default: 6089 return (-1); 6090 6091 } 6092 } 6093 6094 /* 6095 * Determine if specified erratum is present via OSVW (OS Visible Workaround). 6096 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate. 6097 */ 6098 int 6099 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum) 6100 { 6101 struct cpuid_info *cpi; 6102 uint_t osvwid; 6103 static int osvwfeature = -1; 6104 uint64_t osvwlength; 6105 6106 6107 cpi = cpu->cpu_m.mcpu_cpi; 6108 6109 /* confirm OSVW supported */ 6110 if (osvwfeature == -1) { 6111 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW; 6112 } else { 6113 /* assert that osvw feature setting is consistent on all cpus */ 6114 ASSERT(osvwfeature == 6115 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW)); 6116 } 6117 if (!osvwfeature) 6118 return (-1); 6119 6120 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK; 6121 6122 switch (erratum) { 6123 case 298: /* osvwid is 0 */ 6124 osvwid = 0; 6125 if (osvwlength <= (uint64_t)osvwid) { 6126 /* osvwid 0 is unknown */ 6127 return (-1); 6128 } 6129 6130 /* 6131 * Check the OSVW STATUS MSR to determine the state 6132 * of the erratum where: 6133 * 0 - fixed by HW 6134 * 1 - BIOS has applied the workaround when BIOS 6135 * workaround is available. (Or for other errata, 6136 * OS workaround is required.) 6137 * For a value of 1, caller will confirm that the 6138 * erratum 298 workaround has indeed been applied by BIOS. 6139 * 6140 * A 1 may be set in cpus that have a HW fix 6141 * in a mixed cpu system. Regarding erratum 298: 6142 * In a multiprocessor platform, the workaround above 6143 * should be applied to all processors regardless of 6144 * silicon revision when an affected processor is 6145 * present. 6146 */ 6147 6148 return (rdmsr(MSR_AMD_OSVW_STATUS + 6149 (osvwid / OSVW_ID_CNT_PER_MSR)) & 6150 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR))); 6151 6152 default: 6153 return (-1); 6154 } 6155 } 6156 6157 static const char assoc_str[] = "associativity"; 6158 static const char line_str[] = "line-size"; 6159 static const char size_str[] = "size"; 6160 6161 static void 6162 add_cache_prop(dev_info_t *devi, const char *label, const char *type, 6163 uint32_t val) 6164 { 6165 char buf[128]; 6166 6167 /* 6168 * ndi_prop_update_int() is used because it is desirable for 6169 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set. 
6170 */ 6171 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf)) 6172 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val); 6173 } 6174 6175 /* 6176 * Intel-style cache/tlb description 6177 * 6178 * Standard cpuid level 2 gives a randomly ordered 6179 * selection of tags that index into a table that describes 6180 * cache and tlb properties. 6181 */ 6182 6183 static const char l1_icache_str[] = "l1-icache"; 6184 static const char l1_dcache_str[] = "l1-dcache"; 6185 static const char l2_cache_str[] = "l2-cache"; 6186 static const char l3_cache_str[] = "l3-cache"; 6187 static const char itlb4k_str[] = "itlb-4K"; 6188 static const char dtlb4k_str[] = "dtlb-4K"; 6189 static const char itlb2M_str[] = "itlb-2M"; 6190 static const char itlb4M_str[] = "itlb-4M"; 6191 static const char dtlb4M_str[] = "dtlb-4M"; 6192 static const char dtlb24_str[] = "dtlb0-2M-4M"; 6193 static const char itlb424_str[] = "itlb-4K-2M-4M"; 6194 static const char itlb24_str[] = "itlb-2M-4M"; 6195 static const char dtlb44_str[] = "dtlb-4K-4M"; 6196 static const char sl1_dcache_str[] = "sectored-l1-dcache"; 6197 static const char sl2_cache_str[] = "sectored-l2-cache"; 6198 static const char itrace_str[] = "itrace-cache"; 6199 static const char sl3_cache_str[] = "sectored-l3-cache"; 6200 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k"; 6201 6202 static const struct cachetab { 6203 uint8_t ct_code; 6204 uint8_t ct_assoc; 6205 uint16_t ct_line_size; 6206 size_t ct_size; 6207 const char *ct_label; 6208 } intel_ctab[] = { 6209 /* 6210 * maintain descending order! 6211 * 6212 * Codes ignored - Reason 6213 * ---------------------- 6214 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache 6215 * f0H/f1H - Currently we do not interpret prefetch size by design 6216 */ 6217 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str}, 6218 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str}, 6219 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str}, 6220 { 0xde, 12, 64, 6*1024*1024, l3_cache_str}, 6221 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str}, 6222 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str}, 6223 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str}, 6224 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str}, 6225 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str}, 6226 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str}, 6227 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str}, 6228 { 0xd0, 4, 64, 512*1024, l3_cache_str}, 6229 { 0xca, 4, 0, 512, sh_l2_tlb4k_str}, 6230 { 0xc0, 4, 0, 8, dtlb44_str }, 6231 { 0xba, 4, 0, 64, dtlb4k_str }, 6232 { 0xb4, 4, 0, 256, dtlb4k_str }, 6233 { 0xb3, 4, 0, 128, dtlb4k_str }, 6234 { 0xb2, 4, 0, 64, itlb4k_str }, 6235 { 0xb0, 4, 0, 128, itlb4k_str }, 6236 { 0x87, 8, 64, 1024*1024, l2_cache_str}, 6237 { 0x86, 4, 64, 512*1024, l2_cache_str}, 6238 { 0x85, 8, 32, 2*1024*1024, l2_cache_str}, 6239 { 0x84, 8, 32, 1024*1024, l2_cache_str}, 6240 { 0x83, 8, 32, 512*1024, l2_cache_str}, 6241 { 0x82, 8, 32, 256*1024, l2_cache_str}, 6242 { 0x80, 8, 64, 512*1024, l2_cache_str}, 6243 { 0x7f, 2, 64, 512*1024, l2_cache_str}, 6244 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str}, 6245 { 0x7c, 8, 64, 1024*1024, sl2_cache_str}, 6246 { 0x7b, 8, 64, 512*1024, sl2_cache_str}, 6247 { 0x7a, 8, 64, 256*1024, sl2_cache_str}, 6248 { 0x79, 8, 64, 128*1024, sl2_cache_str}, 6249 { 0x78, 8, 64, 1024*1024, l2_cache_str}, 6250 { 0x73, 8, 0, 64*1024, itrace_str}, 6251 { 0x72, 8, 0, 32*1024, itrace_str}, 6252 { 0x71, 8, 0, 16*1024, itrace_str}, 6253 { 0x70, 8, 0, 12*1024, itrace_str}, 6254 { 0x68, 4, 64, 32*1024, sl1_dcache_str}, 6255 { 0x67, 4, 64, 16*1024, 
sl1_dcache_str}, 6256 { 0x66, 4, 64, 8*1024, sl1_dcache_str}, 6257 { 0x60, 8, 64, 16*1024, sl1_dcache_str}, 6258 { 0x5d, 0, 0, 256, dtlb44_str}, 6259 { 0x5c, 0, 0, 128, dtlb44_str}, 6260 { 0x5b, 0, 0, 64, dtlb44_str}, 6261 { 0x5a, 4, 0, 32, dtlb24_str}, 6262 { 0x59, 0, 0, 16, dtlb4k_str}, 6263 { 0x57, 4, 0, 16, dtlb4k_str}, 6264 { 0x56, 4, 0, 16, dtlb4M_str}, 6265 { 0x55, 0, 0, 7, itlb24_str}, 6266 { 0x52, 0, 0, 256, itlb424_str}, 6267 { 0x51, 0, 0, 128, itlb424_str}, 6268 { 0x50, 0, 0, 64, itlb424_str}, 6269 { 0x4f, 0, 0, 32, itlb4k_str}, 6270 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str}, 6271 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str}, 6272 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str}, 6273 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str}, 6274 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str}, 6275 { 0x49, 16, 64, 4*1024*1024, l3_cache_str}, 6276 { 0x48, 12, 64, 3*1024*1024, l2_cache_str}, 6277 { 0x47, 8, 64, 8*1024*1024, l3_cache_str}, 6278 { 0x46, 4, 64, 4*1024*1024, l3_cache_str}, 6279 { 0x45, 4, 32, 2*1024*1024, l2_cache_str}, 6280 { 0x44, 4, 32, 1024*1024, l2_cache_str}, 6281 { 0x43, 4, 32, 512*1024, l2_cache_str}, 6282 { 0x42, 4, 32, 256*1024, l2_cache_str}, 6283 { 0x41, 4, 32, 128*1024, l2_cache_str}, 6284 { 0x3e, 4, 64, 512*1024, sl2_cache_str}, 6285 { 0x3d, 6, 64, 384*1024, sl2_cache_str}, 6286 { 0x3c, 4, 64, 256*1024, sl2_cache_str}, 6287 { 0x3b, 2, 64, 128*1024, sl2_cache_str}, 6288 { 0x3a, 6, 64, 192*1024, sl2_cache_str}, 6289 { 0x39, 4, 64, 128*1024, sl2_cache_str}, 6290 { 0x30, 8, 64, 32*1024, l1_icache_str}, 6291 { 0x2c, 8, 64, 32*1024, l1_dcache_str}, 6292 { 0x29, 8, 64, 4096*1024, sl3_cache_str}, 6293 { 0x25, 8, 64, 2048*1024, sl3_cache_str}, 6294 { 0x23, 8, 64, 1024*1024, sl3_cache_str}, 6295 { 0x22, 4, 64, 512*1024, sl3_cache_str}, 6296 { 0x0e, 6, 64, 24*1024, l1_dcache_str}, 6297 { 0x0d, 4, 32, 16*1024, l1_dcache_str}, 6298 { 0x0c, 4, 32, 16*1024, l1_dcache_str}, 6299 { 0x0b, 4, 0, 4, itlb4M_str}, 6300 { 0x0a, 2, 32, 8*1024, l1_dcache_str}, 6301 { 0x08, 4, 32, 16*1024, l1_icache_str}, 6302 { 0x06, 4, 32, 8*1024, l1_icache_str}, 6303 { 0x05, 4, 0, 32, dtlb4M_str}, 6304 { 0x04, 4, 0, 8, dtlb4M_str}, 6305 { 0x03, 4, 0, 64, dtlb4k_str}, 6306 { 0x02, 4, 0, 2, itlb4M_str}, 6307 { 0x01, 4, 0, 32, itlb4k_str}, 6308 { 0 } 6309 }; 6310 6311 static const struct cachetab cyrix_ctab[] = { 6312 { 0x70, 4, 0, 32, "tlb-4K" }, 6313 { 0x80, 4, 16, 16*1024, "l1-cache" }, 6314 { 0 } 6315 }; 6316 6317 /* 6318 * Search a cache table for a matching entry 6319 */ 6320 static const struct cachetab * 6321 find_cacheent(const struct cachetab *ct, uint_t code) 6322 { 6323 if (code != 0) { 6324 for (; ct->ct_code != 0; ct++) 6325 if (ct->ct_code <= code) 6326 break; 6327 if (ct->ct_code == code) 6328 return (ct); 6329 } 6330 return (NULL); 6331 } 6332 6333 /* 6334 * Populate cachetab entry with L2 or L3 cache-information using 6335 * cpuid function 4. This function is called from intel_walk_cacheinfo() 6336 * when descriptor 0x49 is encountered. It returns 0 if no such cache 6337 * information is found. 
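* The size below is computed as ways * partitions * line size * sets (the %ecx value plus one).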
6338 */ 6339 static int 6340 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi) 6341 { 6342 uint32_t level, i; 6343 int ret = 0; 6344 6345 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) { 6346 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]); 6347 6348 if (level == 2 || level == 3) { 6349 ct->ct_assoc = 6350 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1; 6351 ct->ct_line_size = 6352 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1; 6353 ct->ct_size = ct->ct_assoc * 6354 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) * 6355 ct->ct_line_size * 6356 (cpi->cpi_cache_leaves[i]->cp_ecx + 1); 6357 6358 if (level == 2) { 6359 ct->ct_label = l2_cache_str; 6360 } else if (level == 3) { 6361 ct->ct_label = l3_cache_str; 6362 } 6363 ret = 1; 6364 } 6365 } 6366 6367 return (ret); 6368 } 6369 6370 /* 6371 * Walk the cacheinfo descriptor, applying 'func' to every valid element 6372 * The walk is terminated if the walker returns non-zero. 6373 */ 6374 static void 6375 intel_walk_cacheinfo(struct cpuid_info *cpi, 6376 void *arg, int (*func)(void *, const struct cachetab *)) 6377 { 6378 const struct cachetab *ct; 6379 struct cachetab des_49_ct, des_b1_ct; 6380 uint8_t *dp; 6381 int i; 6382 6383 if ((dp = cpi->cpi_cacheinfo) == NULL) 6384 return; 6385 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6386 /* 6387 * For overloaded descriptor 0x49 we use cpuid function 4 6388 * if supported by the current processor, to create 6389 * cache information. 6390 * For overloaded descriptor 0xb1 we use X86_PAE flag 6391 * to disambiguate the cache information. 6392 */ 6393 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 && 6394 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) { 6395 ct = &des_49_ct; 6396 } else if (*dp == 0xb1) { 6397 des_b1_ct.ct_code = 0xb1; 6398 des_b1_ct.ct_assoc = 4; 6399 des_b1_ct.ct_line_size = 0; 6400 if (is_x86_feature(x86_featureset, X86FSET_PAE)) { 6401 des_b1_ct.ct_size = 8; 6402 des_b1_ct.ct_label = itlb2M_str; 6403 } else { 6404 des_b1_ct.ct_size = 4; 6405 des_b1_ct.ct_label = itlb4M_str; 6406 } 6407 ct = &des_b1_ct; 6408 } else { 6409 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) { 6410 continue; 6411 } 6412 } 6413 6414 if (func(arg, ct) != 0) { 6415 break; 6416 } 6417 } 6418 } 6419 6420 /* 6421 * (Like the Intel one, except for Cyrix CPUs) 6422 */ 6423 static void 6424 cyrix_walk_cacheinfo(struct cpuid_info *cpi, 6425 void *arg, int (*func)(void *, const struct cachetab *)) 6426 { 6427 const struct cachetab *ct; 6428 uint8_t *dp; 6429 int i; 6430 6431 if ((dp = cpi->cpi_cacheinfo) == NULL) 6432 return; 6433 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6434 /* 6435 * Search Cyrix-specific descriptor table first .. 6436 */ 6437 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) { 6438 if (func(arg, ct) != 0) 6439 break; 6440 continue; 6441 } 6442 /* 6443 * .. else fall back to the Intel one 6444 */ 6445 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) { 6446 if (func(arg, ct) != 0) 6447 break; 6448 continue; 6449 } 6450 } 6451 } 6452 6453 /* 6454 * A cacheinfo walker that adds associativity, line-size, and size properties 6455 * to the devinfo node it is passed as an argument. 
6456 */ 6457 static int 6458 add_cacheent_props(void *arg, const struct cachetab *ct) 6459 { 6460 dev_info_t *devi = arg; 6461 6462 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc); 6463 if (ct->ct_line_size != 0) 6464 add_cache_prop(devi, ct->ct_label, line_str, 6465 ct->ct_line_size); 6466 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size); 6467 return (0); 6468 } 6469 6470 6471 static const char fully_assoc[] = "fully-associative?"; 6472 6473 /* 6474 * AMD style cache/tlb description 6475 * 6476 * Extended functions 5 and 6 directly describe properties of 6477 * tlbs and various cache levels. 6478 */ 6479 static void 6480 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6481 { 6482 switch (assoc) { 6483 case 0: /* reserved; ignore */ 6484 break; 6485 default: 6486 add_cache_prop(devi, label, assoc_str, assoc); 6487 break; 6488 case 0xff: 6489 add_cache_prop(devi, label, fully_assoc, 1); 6490 break; 6491 } 6492 } 6493 6494 static void 6495 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6496 { 6497 if (size == 0) 6498 return; 6499 add_cache_prop(devi, label, size_str, size); 6500 add_amd_assoc(devi, label, assoc); 6501 } 6502 6503 static void 6504 add_amd_cache(dev_info_t *devi, const char *label, 6505 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6506 { 6507 if (size == 0 || line_size == 0) 6508 return; 6509 add_amd_assoc(devi, label, assoc); 6510 /* 6511 * Most AMD parts have a sectored cache. Multiple cache lines are 6512 * associated with each tag. A sector consists of all cache lines 6513 * associated with a tag. For example, the AMD K6-III has a sector 6514 * size of 2 cache lines per tag. 6515 */ 6516 if (lines_per_tag != 0) 6517 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6518 add_cache_prop(devi, label, line_str, line_size); 6519 add_cache_prop(devi, label, size_str, size * 1024); 6520 } 6521 6522 static void 6523 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6524 { 6525 switch (assoc) { 6526 case 0: /* off */ 6527 break; 6528 case 1: 6529 case 2: 6530 case 4: 6531 add_cache_prop(devi, label, assoc_str, assoc); 6532 break; 6533 case 6: 6534 add_cache_prop(devi, label, assoc_str, 8); 6535 break; 6536 case 8: 6537 add_cache_prop(devi, label, assoc_str, 16); 6538 break; 6539 case 0xf: 6540 add_cache_prop(devi, label, fully_assoc, 1); 6541 break; 6542 default: /* reserved; ignore */ 6543 break; 6544 } 6545 } 6546 6547 static void 6548 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6549 { 6550 if (size == 0 || assoc == 0) 6551 return; 6552 add_amd_l2_assoc(devi, label, assoc); 6553 add_cache_prop(devi, label, size_str, size); 6554 } 6555 6556 static void 6557 add_amd_l2_cache(dev_info_t *devi, const char *label, 6558 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6559 { 6560 if (size == 0 || assoc == 0 || line_size == 0) 6561 return; 6562 add_amd_l2_assoc(devi, label, assoc); 6563 if (lines_per_tag != 0) 6564 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6565 add_cache_prop(devi, label, line_str, line_size); 6566 add_cache_prop(devi, label, size_str, size * 1024); 6567 } 6568 6569 static void 6570 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi) 6571 { 6572 struct cpuid_regs *cp; 6573 6574 if (cpi->cpi_xmaxeax < 0x80000005) 6575 return; 6576 cp = &cpi->cpi_extd[5]; 6577 6578 /* 6579 * 4M/2M L1 TLB configuration 6580 * 6581 * We report the size for 2M pages because AMD uses two 6582 * TLB 
entries for one 4M page. 6583 */ 6584 add_amd_tlb(devi, "dtlb-2M", 6585 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16)); 6586 add_amd_tlb(devi, "itlb-2M", 6587 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0)); 6588 6589 /* 6590 * 4K L1 TLB configuration 6591 */ 6592 6593 switch (cpi->cpi_vendor) { 6594 uint_t nentries; 6595 case X86_VENDOR_TM: 6596 if (cpi->cpi_family >= 5) { 6597 /* 6598 * Crusoe processors have 256 TLB entries, but 6599 * cpuid data format constrains them to only 6600 * reporting 255 of them. 6601 */ 6602 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255) 6603 nentries = 256; 6604 /* 6605 * Crusoe processors also have a unified TLB 6606 */ 6607 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24), 6608 nentries); 6609 break; 6610 } 6611 /*FALLTHROUGH*/ 6612 default: 6613 add_amd_tlb(devi, itlb4k_str, 6614 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16)); 6615 add_amd_tlb(devi, dtlb4k_str, 6616 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0)); 6617 break; 6618 } 6619 6620 /* 6621 * data L1 cache configuration 6622 */ 6623 6624 add_amd_cache(devi, l1_dcache_str, 6625 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16), 6626 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0)); 6627 6628 /* 6629 * code L1 cache configuration 6630 */ 6631 6632 add_amd_cache(devi, l1_icache_str, 6633 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16), 6634 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0)); 6635 6636 if (cpi->cpi_xmaxeax < 0x80000006) 6637 return; 6638 cp = &cpi->cpi_extd[6]; 6639 6640 /* Check for a unified L2 TLB for large pages */ 6641 6642 if (BITX(cp->cp_eax, 31, 16) == 0) 6643 add_amd_l2_tlb(devi, "l2-tlb-2M", 6644 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6645 else { 6646 add_amd_l2_tlb(devi, "l2-dtlb-2M", 6647 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6648 add_amd_l2_tlb(devi, "l2-itlb-2M", 6649 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6650 } 6651 6652 /* Check for a unified L2 TLB for 4K pages */ 6653 6654 if (BITX(cp->cp_ebx, 31, 16) == 0) { 6655 add_amd_l2_tlb(devi, "l2-tlb-4K", 6656 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6657 } else { 6658 add_amd_l2_tlb(devi, "l2-dtlb-4K", 6659 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6660 add_amd_l2_tlb(devi, "l2-itlb-4K", 6661 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6662 } 6663 6664 add_amd_l2_cache(devi, l2_cache_str, 6665 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12), 6666 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0)); 6667 } 6668 6669 /* 6670 * There are two basic ways that the x86 world describes its cache 6671 * and tlb architecture - Intel's way and AMD's way. 6672 * 6673 * Return which flavor of cache architecture we should use 6674 */ 6675 static int 6676 x86_which_cacheinfo(struct cpuid_info *cpi) 6677 { 6678 switch (cpi->cpi_vendor) { 6679 case X86_VENDOR_Intel: 6680 if (cpi->cpi_maxeax >= 2) 6681 return (X86_VENDOR_Intel); 6682 break; 6683 case X86_VENDOR_AMD: 6684 /* 6685 * The K5 model 1 was the first part from AMD that reported 6686 * cache sizes via extended cpuid functions. 6687 */ 6688 if (cpi->cpi_family > 5 || 6689 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 6690 return (X86_VENDOR_AMD); 6691 break; 6692 case X86_VENDOR_TM: 6693 if (cpi->cpi_family >= 5) 6694 return (X86_VENDOR_AMD); 6695 /*FALLTHROUGH*/ 6696 default: 6697 /* 6698 * If they have extended CPU data for 0x80000005 6699 * then we assume they have AMD-format cache 6700 * information.
6701 * 6702 * If not, and the vendor happens to be Cyrix, 6703 * then try our Cyrix-specific handler. 6704 * 6705 * If we're not Cyrix, then assume we're using Intel's 6706 * table-driven format instead. 6707 */ 6708 if (cpi->cpi_xmaxeax >= 0x80000005) 6709 return (X86_VENDOR_AMD); 6710 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix) 6711 return (X86_VENDOR_Cyrix); 6712 else if (cpi->cpi_maxeax >= 2) 6713 return (X86_VENDOR_Intel); 6714 break; 6715 } 6716 return (-1); 6717 } 6718 6719 void 6720 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id, 6721 struct cpuid_info *cpi) 6722 { 6723 dev_info_t *cpu_devi; 6724 int create; 6725 6726 cpu_devi = (dev_info_t *)dip; 6727 6728 /* device_type */ 6729 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6730 "device_type", "cpu"); 6731 6732 /* reg */ 6733 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6734 "reg", cpu_id); 6735 6736 /* cpu-mhz, and clock-frequency */ 6737 if (cpu_freq > 0) { 6738 long long mul; 6739 6740 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6741 "cpu-mhz", cpu_freq); 6742 if ((mul = cpu_freq * 1000000LL) <= INT_MAX) 6743 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6744 "clock-frequency", (int)mul); 6745 } 6746 6747 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) { 6748 return; 6749 } 6750 6751 /* vendor-id */ 6752 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6753 "vendor-id", cpi->cpi_vendorstr); 6754 6755 if (cpi->cpi_maxeax == 0) { 6756 return; 6757 } 6758 6759 /* 6760 * family, model, and step 6761 */ 6762 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6763 "family", CPI_FAMILY(cpi)); 6764 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6765 "cpu-model", CPI_MODEL(cpi)); 6766 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6767 "stepping-id", CPI_STEP(cpi)); 6768 6769 /* type */ 6770 switch (cpi->cpi_vendor) { 6771 case X86_VENDOR_Intel: 6772 create = 1; 6773 break; 6774 default: 6775 create = 0; 6776 break; 6777 } 6778 if (create) 6779 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6780 "type", CPI_TYPE(cpi)); 6781 6782 /* ext-family */ 6783 switch (cpi->cpi_vendor) { 6784 case X86_VENDOR_Intel: 6785 case X86_VENDOR_AMD: 6786 create = cpi->cpi_family >= 0xf; 6787 break; 6788 default: 6789 create = 0; 6790 break; 6791 } 6792 if (create) 6793 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6794 "ext-family", CPI_FAMILY_XTD(cpi)); 6795 6796 /* ext-model */ 6797 switch (cpi->cpi_vendor) { 6798 case X86_VENDOR_Intel: 6799 create = IS_EXTENDED_MODEL_INTEL(cpi); 6800 break; 6801 case X86_VENDOR_AMD: 6802 create = CPI_FAMILY(cpi) == 0xf; 6803 break; 6804 default: 6805 create = 0; 6806 break; 6807 } 6808 if (create) 6809 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6810 "ext-model", CPI_MODEL_XTD(cpi)); 6811 6812 /* generation */ 6813 switch (cpi->cpi_vendor) { 6814 case X86_VENDOR_AMD: 6815 /* 6816 * AMD K5 model 1 was the first part to support this 6817 */ 6818 create = cpi->cpi_xmaxeax >= 0x80000001; 6819 break; 6820 default: 6821 create = 0; 6822 break; 6823 } 6824 if (create) 6825 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6826 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8)); 6827 6828 /* brand-id */ 6829 switch (cpi->cpi_vendor) { 6830 case X86_VENDOR_Intel: 6831 /* 6832 * brand id first appeared on Pentium III Xeon model 8, 6833 * and Celeron model 8 processors and Opteron 6834 */ 6835 create = cpi->cpi_family > 6 || 6836 (cpi->cpi_family == 6 && cpi->cpi_model >= 8); 6837 break; 6838 case X86_VENDOR_AMD: 6839 create =
cpi->cpi_family >= 0xf; 6840 break; 6841 default: 6842 create = 0; 6843 break; 6844 } 6845 if (create && cpi->cpi_brandid != 0) { 6846 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6847 "brand-id", cpi->cpi_brandid); 6848 } 6849 6850 /* chunks, and apic-id */ 6851 switch (cpi->cpi_vendor) { 6852 /* 6853 * first available on Pentium IV and Opteron (K8) 6854 */ 6855 case X86_VENDOR_Intel: 6856 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf; 6857 break; 6858 case X86_VENDOR_AMD: 6859 create = cpi->cpi_family >= 0xf; 6860 break; 6861 default: 6862 create = 0; 6863 break; 6864 } 6865 if (create) { 6866 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6867 "chunks", CPI_CHUNKS(cpi)); 6868 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6869 "apic-id", cpi->cpi_apicid); 6870 if (cpi->cpi_chipid >= 0) { 6871 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6872 "chip#", cpi->cpi_chipid); 6873 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6874 "clog#", cpi->cpi_clogid); 6875 } 6876 } 6877 6878 /* cpuid-features */ 6879 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6880 "cpuid-features", CPI_FEATURES_EDX(cpi)); 6881 6882 6883 /* cpuid-features-ecx */ 6884 switch (cpi->cpi_vendor) { 6885 case X86_VENDOR_Intel: 6886 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf; 6887 break; 6888 case X86_VENDOR_AMD: 6889 create = cpi->cpi_family >= 0xf; 6890 break; 6891 default: 6892 create = 0; 6893 break; 6894 } 6895 if (create) 6896 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6897 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi)); 6898 6899 /* ext-cpuid-features */ 6900 switch (cpi->cpi_vendor) { 6901 case X86_VENDOR_Intel: 6902 case X86_VENDOR_AMD: 6903 case X86_VENDOR_Cyrix: 6904 case X86_VENDOR_TM: 6905 case X86_VENDOR_Centaur: 6906 create = cpi->cpi_xmaxeax >= 0x80000001; 6907 break; 6908 default: 6909 create = 0; 6910 break; 6911 } 6912 if (create) { 6913 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6914 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi)); 6915 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6916 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi)); 6917 } 6918 6919 /* 6920 * Brand String first appeared in Intel Pentium IV, AMD K5 6921 * model 1, and Cyrix GXm. On earlier models we try and 6922 * simulate something similar .. so this string should always 6923 * say -something- about the processor, however lame.

	/*
	 * Finally, cache and tlb information
	 */
	switch (x86_which_cacheinfo(cpi)) {
	case X86_VENDOR_Intel:
		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
		break;
	case X86_VENDOR_Cyrix:
		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
		break;
	case X86_VENDOR_AMD:
		amd_cache_info(cpi, cpu_devi);
		break;
	default:
		break;
	}
}

struct l2info {
	int *l2i_csz;
	int *l2i_lsz;
	int *l2i_assoc;
	int l2i_ret;
};

/*
 * A cacheinfo walker that fetches the size, line-size and associativity
 * of the L2 cache
 */
static int
intel_l2cinfo(void *arg, const struct cachetab *ct)
{
	struct l2info *l2i = arg;
	int *ip;

	if (ct->ct_label != l2_cache_str &&
	    ct->ct_label != sl2_cache_str)
		return (0);	/* not an L2 -- keep walking */

	if ((ip = l2i->l2i_csz) != NULL)
		*ip = ct->ct_size;
	if ((ip = l2i->l2i_lsz) != NULL)
		*ip = ct->ct_line_size;
	if ((ip = l2i->l2i_assoc) != NULL)
		*ip = ct->ct_assoc;
	l2i->l2i_ret = ct->ct_size;
	return (1);		/* was an L2 -- terminate walk */
}

/*
 * AMD L2/L3 Cache and TLB Associativity Field Definition:
 *
 * Unlike the associativity for the L1 cache and tlb where the 8 bit
 * value is the associativity, the associativity for the L2 cache and
 * tlb is encoded in the following table. The 4 bit L2 value serves as
 * an index into the amd_afd[] array to determine the associativity.
 * -1 is undefined. 0 is fully associative.
 */

static int amd_afd[] =
	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};

static void
amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
{
	struct cpuid_regs *cp;
	uint_t size, assoc;
	int i;
	int *ip;

	if (cpi->cpi_xmaxeax < 0x80000006)
		return;
	cp = &cpi->cpi_extd[6];

	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
		uint_t cachesz = size * 1024;
		assoc = amd_afd[i];

		ASSERT(assoc != -1);

		if ((ip = l2i->l2i_csz) != NULL)
			*ip = cachesz;
		if ((ip = l2i->l2i_lsz) != NULL)
			*ip = BITX(cp->cp_ecx, 7, 0);
		if ((ip = l2i->l2i_assoc) != NULL)
			*ip = assoc;
		l2i->l2i_ret = cachesz;
	}
}

int
getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	struct l2info __l2info, *l2i = &__l2info;

	l2i->l2i_csz = csz;
	l2i->l2i_lsz = lsz;
	l2i->l2i_assoc = assoc;
	l2i->l2i_ret = -1;

	switch (x86_which_cacheinfo(cpi)) {
	case X86_VENDOR_Intel:
		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
		break;
	case X86_VENDOR_Cyrix:
		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
		break;
	case X86_VENDOR_AMD:
		amd_l2cacheinfo(cpi, l2i);
		break;
	default:
		break;
	}
	return (l2i->l2i_ret);
}
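
/*
 * Illustrative consumer of getl2cacheinfo(), compiled out: the function
 * name and the message format below are hypothetical and are not used
 * anywhere else in this file.  Size units are whatever the underlying
 * vendor-specific walker reported.
 */
#ifdef notdef
static void
example_print_l2(cpu_t *cpu)
{
	int csz, lsz, assoc;

	if (getl2cacheinfo(cpu, &csz, &lsz, &assoc) > 0) {
		cmn_err(CE_CONT, "?cpu%d: L2 size %d, line size %d, "
		    "associativity %d\n", cpu->cpu_id, csz, lsz, assoc);
	}
}
#endif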

#if !defined(__xpv)

uint32_t *
cpuid_mwait_alloc(cpu_t *cpu)
{
	uint32_t *ret;
	size_t mwait_size;

	ASSERT(cpuid_checkpass(CPU, 2));

	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
	if (mwait_size == 0)
		return (NULL);

	/*
	 * kmem_alloc() returns cache line size aligned data for mwait_size
	 * allocations. mwait_size is currently cache line sized. Neither
	 * of these implementation details is guaranteed to be true in the
	 * future.
	 *
	 * First try allocating mwait_size as kmem_alloc() currently returns
	 * correctly aligned memory. If kmem_alloc() does not return
	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
	 *
	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
	 * decide to free this memory.
	 */
	ret = kmem_zalloc(mwait_size, KM_SLEEP);
	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
		*ret = MWAIT_RUNNING;
		return (ret);
	} else {
		kmem_free(ret, mwait_size);
		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
		*ret = MWAIT_RUNNING;
		return (ret);
	}
}
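
/*
 * Worked example of the alignment fixup above, assuming mwait_size is a
 * power of two (it is the monitor line size reported by cpuid, typically
 * 0x40): a buffer whose address is already a multiple of mwait_size is
 * used as-is.  Otherwise the buffer is freed and 2 * mwait_size bytes are
 * allocated; since P2ROUNDUP() advances the start by less than mwait_size,
 * the aligned pointer plus mwait_size still falls entirely within the
 * doubled allocation.
 */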

void
cpuid_mwait_free(cpu_t *cpu)
{
	if (cpu->cpu_m.mcpu_cpi == NULL) {
		return;
	}

	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
	}

	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
}

void
patch_tsc_read(int flag)
{
	size_t cnt;

	switch (flag) {
	case TSC_NONE:
		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
		break;
	case TSC_RDTSC_MFENCE:
		cnt = &_tsc_mfence_end - &_tsc_mfence_start;
		(void) memcpy((void *)tsc_read,
		    (void *)&_tsc_mfence_start, cnt);
		break;
	case TSC_RDTSC_LFENCE:
		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
		(void) memcpy((void *)tsc_read,
		    (void *)&_tsc_lfence_start, cnt);
		break;
	case TSC_TSCP:
		cnt = &_tscp_end - &_tscp_start;
		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
		break;
	default:
		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
		break;
	}
	tsc_type = flag;
}
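
/*
 * Illustrative sketch, compiled out, of the semantics the TSC_RDTSC_LFENCE
 * variant is expected to provide: fence first, then read the TSC, so that
 * rdtsc cannot be executed ahead of earlier instructions.  The real patch
 * targets (_tsc_lfence_start and friends) are assembler routines defined
 * elsewhere; the helper below is hypothetical and exists only to show the
 * intent.
 */
#ifdef notdef
static hrtime_t
example_tsc_read_lfence(void)
{
	uint32_t lo, hi;

	__asm__ __volatile__("lfence; rdtsc" : "=a" (lo), "=d" (hi));
	return (((hrtime_t)hi << 32) | lo);
}
#endif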

int
cpuid_deep_cstates_supported(void)
{
	struct cpuid_info *cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));

	cpi = CPU->cpu_m.mcpu_cpi;

	if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
		return (0);

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		if (cpi->cpi_xmaxeax < 0x80000007)
			return (0);

		/*
		 * Does the TSC run at a constant rate in all ACPI C-states?
		 */
		regs.cp_eax = 0x80000007;
		(void) __cpuid_insn(&regs);
		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);

	default:
		return (0);
	}
}

#endif	/* !__xpv */

void
post_startup_cpu_fixups(void)
{
#ifndef __xpv
	/*
	 * Some AMD processors support C1E state. Entering this state will
	 * cause the local APIC timer to stop, which we can't deal with at
	 * this time.
	 */
	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
		on_trap_data_t otd;
		uint64_t reg;

		if (!on_trap(&otd, OT_DATA_ACCESS)) {
			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
			/* Disable C1E state if it is enabled by BIOS */
			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
			    AMD_ACTONCMPHALT_MASK) {
				reg &= ~(AMD_ACTONCMPHALT_MASK <<
				    AMD_ACTONCMPHALT_SHIFT);
				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
			}
		}
		no_trap();
	}
#endif	/* !__xpv */
}

void
enable_pcid(void)
{
	if (x86_use_pcid == -1)
		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);

	if (x86_use_invpcid == -1) {
		x86_use_invpcid = is_x86_feature(x86_featureset,
		    X86FSET_INVPCID);
	}

	if (!x86_use_pcid)
		return;

	/*
	 * Intel says that on setting PCIDE, the CPU immediately starts
	 * using the PCID bits; better make sure there's nothing there.
	 */
	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);

	setcr4(getcr4() | CR4_PCIDE);
}

/*
 * Set up the necessary registers to enable the XSAVE feature on this
 * processor. This function needs to be called early enough, so that no
 * xsave/xrstor ops will execute on the processor before the MSRs are
 * properly set up.
 *
 * Current implementation has the following assumptions:
 * - cpuid_pass1() is done, so that X86 features are known.
 * - fpu_probe() is done, so that fp_save_mech is chosen.
 */
void
xsave_setup_msr(cpu_t *cpu)
{
	ASSERT(fp_save_mech == FP_XSAVE);
	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));

	/* Enable OSXSAVE in CR4. */
	setcr4(getcr4() | CR4_OSXSAVE);
	/*
	 * Update the SW copy of ECX, so that /dev/cpu/self/cpuid will report
	 * the correct value.
	 */
	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
	setup_xfem();
}

/*
 * Starting with the Westmere processor the local
 * APIC timer will continue running in all C-states,
 * including the deepest C-states.
 */
int
cpuid_arat_supported(void)
{
	struct cpuid_info *cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));
	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));

	cpi = CPU->cpu_m.mcpu_cpi;

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		/*
		 * The always-running local APIC timer is
		 * indicated by CPUID.6.EAX[2].
		 */
		if (cpi->cpi_maxeax >= 6) {
			regs.cp_eax = 6;
			(void) cpuid_insn(NULL, &regs);
			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
		} else {
			return (0);
		}
	default:
		return (0);
	}
}

/*
 * Check support for Intel ENERGY_PERF_BIAS feature
 */
int
cpuid_iepb_supported(struct cpu *cp)
{
	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(cp, 1));

	if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
	    !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
		return (0);
	}

	/*
	 * The Intel ENERGY_PERF_BIAS MSR is indicated by
	 * capability bit CPUID.6.ECX[3].
	 */
	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
		return (0);

	regs.cp_eax = 0x6;
	(void) cpuid_insn(NULL, &regs);
	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
}

/*
 * Check support for the TSC deadline timer.
 *
 * The TSC deadline timer provides a superior software programming model
 * over the local APIC timer in that it eliminates "time drifts".
 * Instead of specifying a relative time, software specifies an
 * absolute time as the target at which the processor should
 * generate a timer event.
 */
int
cpuid_deadline_tsc_supported(void)
{
	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));
	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		if (cpi->cpi_maxeax >= 1) {
			regs.cp_eax = 1;
			(void) cpuid_insn(NULL, &regs);
			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
		} else {
			return (0);
		}
	default:
		return (0);
	}
}

#if defined(__amd64) && !defined(__xpv)
/*
 * Patch in versions of bcopy for high performance Intel Nehalem (Nhm)
 * processors and later...
 */
void
patch_memops(uint_t vendor)
{
	size_t cnt, i;
	caddr_t to, from;

	if ((vendor == X86_VENDOR_Intel) &&
	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
		cnt = &bcopy_patch_end - &bcopy_patch_start;
		to = &bcopy_ck_size;
		from = &bcopy_patch_start;
		for (i = 0; i < cnt; i++) {
			*to++ = *from++;
		}
	}
}
#endif	/* __amd64 && !__xpv */

/*
 * We're being asked to tell the system how many bits are required to represent
 * the various core and strand IDs. While it's tempting to derive this based
 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
 * correct. Instead, this needs to be based on the number of bits that the APIC
 * allows for these different configurations. We only update these to a larger
 * value if we find one.
 */
void
cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
{
	struct cpuid_info *cpi;

	VERIFY(cpuid_checkpass(CPU, 1));
	cpi = cpu->cpu_m.mcpu_cpi;

	if (cpi->cpi_ncore_bits > *core_nbits) {
		*core_nbits = cpi->cpi_ncore_bits;
	}

	if (cpi->cpi_nthread_bits > *strand_nbits) {
		*strand_nbits = cpi->cpi_nthread_bits;
	}
}
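
/*
 * Illustrative sketch, compiled out, of how a consumer might use the bit
 * widths reported above to pick apart an initial APIC ID: the strand
 * occupies the low strand_nbits bits, the core the next core_nbits bits,
 * and the remaining bits identify the chip.  The helper below is
 * hypothetical; the kernel's real decomposition is done during the cpuid
 * passes earlier in this file.
 */
#ifdef notdef
static void
example_split_apicid(uint32_t apicid, uint_t core_nbits, uint_t strand_nbits,
    uint_t *chip, uint_t *core, uint_t *strand)
{
	/* Assumes core_nbits + strand_nbits < 32. */
	*strand = apicid & ((1U << strand_nbits) - 1);
	*core = (apicid >> strand_nbits) & ((1U << core_nbits) - 1);
	*chip = apicid >> (core_nbits + strand_nbits);
}
#endif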

void
cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	struct cpuid_regs cp;

	/*
	 * Reread the CPUID portions that we need for various security
	 * information.
	 */
	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
		/*
		 * Check if we now have leaf 7 available to us.
		 */
		if (cpi->cpi_maxeax < 7) {
			bzero(&cp, sizeof (cp));
			cp.cp_eax = 0;
			cpi->cpi_maxeax = __cpuid_insn(&cp);
			if (cpi->cpi_maxeax < 7)
				return;
		}

		bzero(&cp, sizeof (cp));
		cp.cp_eax = 7;
		cp.cp_ecx = 0;
		(void) __cpuid_insn(&cp);
		cpi->cpi_std[7] = cp;
	} else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
		/* No xcpuid support */
		if (cpi->cpi_family < 5 ||
		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
			return;

		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
			bzero(&cp, sizeof (cp));
			cp.cp_eax = CPUID_LEAF_EXT_0;
			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
				return;
			}
		}

		bzero(&cp, sizeof (cp));
		cp.cp_eax = CPUID_LEAF_EXT_8;
		(void) __cpuid_insn(&cp);
		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
		cpi->cpi_extd[8] = cp;
	} else {
		/*
		 * Nothing to do here. Return an empty set which has already
		 * been zeroed for us.
		 */
		return;
	}
	cpuid_scan_security(cpu, fset);
}

/* ARGSUSED */
static int
cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
{
	uchar_t *fset;
	boolean_t first_pass = (boolean_t)arg1;

	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
	if (first_pass && CPU->cpu_id != 0)
		return (0);
	if (!first_pass && CPU->cpu_id == 0)
		return (0);
	cpuid_pass_ucode(CPU, fset);

	return (0);
}
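
/*
 * Layout note for the cross-call argument used above and below: argdata is
 * an NCPU-sized array of feature sets, and each CPU stores only into its
 * own slot, at byte offset sizeof (x86_featureset) * cpu_id, so the two
 * cross-call passes do not need any additional synchronization around the
 * buffer itself.
 */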

/*
 * After a microcode update where the version has changed, we need to
 * rescan CPUID. To do this we check every CPU to make sure that they have the
 * same microcode. Then we perform a cross call to all such CPUs. It's the
 * caller's job to make sure that no one else can end up doing an update while
 * this is going on.
 *
 * We assume that the system is microcode capable if we're called.
 */
void
cpuid_post_ucodeadm(void)
{
	uint32_t rev;
	int i;
	struct cpu *cpu;
	cpuset_t cpuset;
	void *argdata;
	uchar_t *f0;

	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);

	mutex_enter(&cpu_lock);
	cpu = cpu_get(0);
	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
	CPUSET_ONLY(cpuset, 0);
	for (i = 1; i < max_ncpus; i++) {
		if ((cpu = cpu_get(i)) == NULL)
			continue;

		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
			panic("post microcode update CPU %d has differing "
			    "microcode revision (%u) from CPU 0 (%u)",
			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
		}
		CPUSET_ADD(cpuset, i);
	}

	/*
	 * We do the cross calls in two passes. The first pass is only for the
	 * boot CPU. The second pass is for all of the other CPUs. This allows
	 * the boot CPU to go through and change its behavior related to
	 * patching or to whether Enhanced IBRS needs to be enabled, and then
	 * allows all other CPUs to follow suit.
	 */
	kpreempt_disable();
	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	kpreempt_enable();

	/*
	 * OK, now look at each CPU and see if their feature sets are equal.
	 */
	f0 = argdata;
	for (i = 1; i < max_ncpus; i++) {
		uchar_t *fset;
		if (!CPU_IN_SET(cpuset, i))
			continue;

		fset = (uchar_t *)((uintptr_t)argdata +
		    sizeof (x86_featureset) * i);

		if (!compare_x86_featureset(f0, fset)) {
			panic("Post microcode update CPU %d has "
			    "differing security feature (%p) set from CPU 0 "
			    "(%p), not appending to feature set", i,
			    (void *)fset, (void *)f0);
		}
	}

	mutex_exit(&cpu_lock);

	for (i = 0; i < NUM_X86_FEATURES; i++) {
		if (is_x86_feature(f0, i)) {
			cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
			    x86_feature_names[i]);
			add_x86_feature(x86_featureset, i);
		}
	}
	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
}