1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net> 26 */ 27 /* 28 * Copyright (c) 2010, Intel Corporation. 29 * All rights reserved. 30 */ 31 /* 32 * Portions Copyright 2009 Advanced Micro Devices, Inc. 33 */ 34 /* 35 * Copyright 2019 Joyent, Inc. 36 */ 37 38 /* 39 * CPU Identification logic 40 * 41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal 42 * with the identification of CPUs, their features, and their topologies. More 43 * specifically, this file helps drive the following: 44 * 45 * 1. Enumeration of features of the processor which are used by the kernel to 46 * determine what features to enable or disable. These may be instruction set 47 * enhancements or features that we use. 48 * 49 * 2. Enumeration of instruction set architecture (ISA) additions that userland 50 * will be told about through the auxiliary vector. 51 * 52 * 3. Understanding the physical topology of the CPU such as the number of 53 * caches, how many cores it has, whether or not it supports symmetric 54 * multi-processing (SMT), etc. 55 * 56 * ------------------------ 57 * CPUID History and Basics 58 * ------------------------ 59 * 60 * The cpuid instruction was added by Intel roughly around the time that the 61 * original Pentium was introduced. The purpose of cpuid was to tell in a 62 * programmatic fashion information about the CPU that previously was guessed 63 * at. For example, an important part of cpuid is that we can know what 64 * extensions to the ISA exist. If you use an invalid opcode you would get a 65 * #UD, so this method allows a program (whether a user program or the kernel) 66 * to determine what exists without crashing or getting a SIGILL. Of course, 67 * this was also during the era of the clones and the AMD Am5x86. The vendor 68 * name shows up first in cpuid for a reason. 69 * 70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts 71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has 72 * its own meaning. The different leaves are broken down into different regions: 73 * 74 * [ 0, 7fffffff ] This region is called the 'basic' 75 * region. This region is generally defined 76 * by Intel, though some of the original 77 * portions have different meanings based 78 * on the manufacturer. These days, Intel 79 * adds most new features to this region. 80 * AMD adds non-Intel compatible 81 * information in the third, extended 82 * region. 
Intel uses this for everything 83 * including ISA extensions, CPU 84 * features, cache information, topology, 85 * and more. 86 * 87 * There is a hole carved out of this 88 * region which is reserved for 89 * hypervisors. 90 * 91 * [ 40000000, 4fffffff ] This region, which is found in the 92 * middle of the previous region, is 93 * explicitly promised to never be used by 94 * CPUs. Instead, it is used by hypervisors 95 * to communicate information about 96 * themselves to the operating system. The 97 * values and details are unique for each 98 * hypervisor. 99 * 100 * [ 80000000, ffffffff ] This region is called the 'extended' 101 * region. Some of the low leaves mirror 102 * parts of the basic leaves. This region 103 * has generally been used by AMD for 104 * various extensions. For example, AMD- 105 * specific information about caches, 106 * features, and topology are found in this 107 * region. 108 * 109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx, 110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of 111 * the ranges, one of the primary things returned is the maximum valid leaf in 112 * that range. This allows for discovery of what range of CPUID is valid. 113 * 114 * The CPUs have potentially surprising behavior when using an invalid leaf or 115 * unimplemented leaf. If the requested leaf is within the valid basic or 116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be 117 * set to zero. However, if you specify a leaf that is outside of a valid range, 118 * then instead it will be filled with the last valid _basic_ leaf. For example, 119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or 120 * an invalid extended leaf will return the information for leaf 3. 121 * 122 * Some leaves are broken down into sub-leaves. This means that the value 123 * depends on both the leaf asked for in %eax and a secondary register. For 124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get 125 * additional information. Or when getting topology information in leaf 0xb, the 126 * initial value in %ecx changes which level of the topology that you are 127 * getting information about. 128 * 129 * cpuid values are always kept to 32 bits regardless of whether or not the 130 * program is in 64-bit mode. When executing in 64-bit mode, the upper 131 * 32 bits of the register are always set to zero so that way the values are the 132 * same regardless of execution mode. 133 * 134 * ---------------------- 135 * Identifying Processors 136 * ---------------------- 137 * 138 * We can identify a processor in two steps. The first step looks at cpuid leaf 139 * 0. Leaf 0 contains the processor's vendor information. This is done by 140 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is 141 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'. 142 * 143 * From there, a processor is identified by a combination of three different 144 * values: 145 * 146 * 1. Family 147 * 2. Model 148 * 3. Stepping 149 * 150 * Each vendor uses the family and model to uniquely identify a processor. The 151 * way that family and model are changed depends on the vendor. For example, 152 * Intel has been using family 0x6 for almost all of their processor since the 153 * Pentium Pro/Pentium II era, often called the P6. The model is used to 154 * identify the exact processor. Different models are often used for the client 155 * (consumer) and server parts. 
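 *
 * As a concrete illustration of the mechanics described so far, here is a
 * user-level sketch (illustrative only; the kernel uses its own wrappers and
 * the struct cpuid_info state described later in this comment) that reads
 * leaf 0 for the maximum basic leaf and the vendor string, and leaf 1 for the
 * raw family, model, and stepping fields. Note that the vendor string is
 * pieced together from %ebx, %edx, and %ecx, in that order. The extended
 * family and model adjustment is described below.
 *
 *      #include <stdio.h>
 *      #include <string.h>
 *      #include <stdint.h>
 *
 *      static void
 *      cpuid(uint32_t leaf, uint32_t subleaf, uint32_t *eax, uint32_t *ebx,
 *          uint32_t *ecx, uint32_t *edx)
 *      {
 *              // %eax selects the leaf; %ecx selects the sub-leaf for
 *              // those leaves that have sub-leaves.
 *              __asm__ __volatile__("cpuid"
 *                  : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
 *                  : "a" (leaf), "c" (subleaf));
 *      }
 *
 *      int
 *      main(void)
 *      {
 *              uint32_t eax, ebx, ecx, edx;
 *              char vendor[13];
 *
 *              cpuid(0, 0, &eax, &ebx, &ecx, &edx);
 *              (void) memcpy(&vendor[0], &ebx, 4);     // e.g. "Genu"
 *              (void) memcpy(&vendor[4], &edx, 4);     // e.g. "ineI"
 *              (void) memcpy(&vendor[8], &ecx, 4);     // e.g. "ntel"
 *              vendor[12] = '\0';
 *              (void) printf("vendor %s, max basic leaf 0x%x\n", vendor, eax);
 *
 *              cpuid(1, 0, &eax, &ebx, &ecx, &edx);
 *              (void) printf("family 0x%x model 0x%x stepping 0x%x\n",
 *                  (eax >> 8) & 0xf, (eax >> 4) & 0xf, eax & 0xf);
 *              return (0);
 *      }
 *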
Even though each processor often has major 156 * architectural differences, they still are considered the same family by 157 * Intel. 158 * 159 * On the other hand, each major AMD architecture generally has its own family. 160 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it 161 * the model number is used to help identify specific processors. 162 * 163 * The stepping is used to refer to a revision of a specific microprocessor. The 164 * term comes from equipment used to produce masks that are used to create 165 * integrated circuits. 166 * 167 * The information is present in leaf 1, %eax. In technical documentation you 168 * will see the terms extended model and extended family. The original family, 169 * model, and stepping fields were each 4 bits wide. If the values in either 170 * are 0xf, then one is to consult the extended model and extended family, which 171 * take previously reserved bits and allow for a larger number of models and add 172 * 0xf to them. 173 * 174 * When we process this information, we store the full family, model, and 175 * stepping in the struct cpuid_info members cpi_family, cpi_model, and 176 * cpi_step, respectively. Whenever you are performing comparisons with the 177 * family, model, and stepping, you should use these members and not the raw 178 * values from cpuid. If you must use the raw values from cpuid directly, you 179 * must make sure that you add the extended model and family to the base model 180 * and family. 181 * 182 * In general, we do not use information about the family, model, and stepping 183 * to determine whether or not a feature is present; that is generally driven by 184 * specific leaves. However, when something we care about on the processor is 185 * not considered 'architectural', meaning that it is specific to a set of 186 * processors and not promised in the architecture model to be consistent from 187 * generation to generation, then we will fall back on this information. The 188 * most common cases where this comes up are when we have to work around errata in 189 * the processor, are dealing with processor-specific features such as CPU 190 * performance counters, or we want to provide additional information for things 191 * such as fault management. 192 * 193 * While processors also do have a brand string, which is the name that people 194 * are familiar with when buying the processor, it is not meant for 195 * programmatic consumption. That is what the family, model, and stepping are 196 * for. 197 * 198 * ------------ 199 * CPUID Passes 200 * ------------ 201 * 202 * As part of performing feature detection, we break this into several different 203 * passes. The passes are as follows: 204 * 205 * Pass 0 This is a primordial pass done in locore.s to deal with 206 * Cyrix CPUs that don't support cpuid. The reality is that 207 * we likely don't run on them any more, but there is still 208 * logic for handling them. 209 * 210 * Pass 1 This is the primary pass and is responsible for doing a 211 * large number of different things: 212 * 213 * 1. Determining which vendor manufactured the CPU and 214 * the family, model, and stepping information. 215 * 216 * 2. Gathering a large number of feature flags to 217 * determine which features the CPU supports and which 218 * indicate things that we need to do other work in the OS 219 * to enable. Features detected this way are added to the 220 * x86_featureset which can be queried to 221 * determine what we should do.
This includes processing 222 * all of the basic and extended CPU features that we care 223 * about. 224 * 225 * 3. Determining the CPU's topology. This includes 226 * information about how many cores and threads are present 227 * in the package. It also is responsible for figuring out 228 * which logical CPUs are potentially part of the same core 229 * and what other resources they might share. For more 230 * information see the 'Topology' section. 231 * 232 * 4. Determining the set of CPU security-specific features 233 * that we need to worry about and determine the 234 * appropriate set of workarounds. 235 * 236 * Pass 1 on the boot CPU occurs before KMDB is started. 237 * 238 * Pass 2 The second pass is done after startup(). Here, we check 239 * other miscellaneous features. Most of this is gathering 240 * additional basic and extended features that we'll use in 241 * later passes or for debugging support. 242 * 243 * Pass 3 The third pass occurs after the kernel memory allocator 244 * has been fully initialized. This gathers information 245 * where we might need dynamic memory available for our 246 * uses. This includes several varying width leaves that 247 * have cache information and the processor's brand string. 248 * 249 * Pass 4 The fourth and final normal pass is performed after the 250 * kernel has brought most everything online. This is 251 * invoked from post_startup(). In this pass, we go through 252 * the set of features that we have enabled and turn that 253 * into the hardware auxiliary vector features that 254 * userland receives. This is used by userland, primarily 255 * by the run-time link-editor (RTLD), though userland 256 * software could also refer to it directly. 257 * 258 * Microcode After a microcode update, we do a selective rescan of 259 * the cpuid leaves to determine what features have 260 * changed. Microcode updates can provide more details 261 * about security related features to deal with issues like 262 * Spectre and L1TF. On occasion, vendors have violated 263 * their contract and removed bits. However, we don't try 264 * to detect that because that puts us in a situation that 265 * we really can't deal with. As such, the only thing we 266 * rescan are security related features today. See 267 * cpuid_pass_ucode(). 268 * 269 * All of the passes (except pass 0) are run on all CPUs. However, for the most 270 * part we only care about what the boot CPU says about this information and use 271 * the other CPUs as a rough guide to sanity check that we have the same feature 272 * set. 273 * 274 * We do not support running multiple logical CPUs with disjoint, let alone 275 * different, feature sets. 276 * 277 * ------------------ 278 * Processor Topology 279 * ------------------ 280 * 281 * One of the important things that we need to do is to understand the topology 282 * of the underlying processor. When we say topology in this case, we're trying 283 * to understand the relationship between the logical CPUs that the operating 284 * system sees and the underlying physical layout. Different logical CPUs may 285 * share different resources which can have important consequences for the 286 * performance of the system. For example, they may share caches, execution 287 * units, and more. 288 * 289 * The topology of the processor changes from generation to generation and 290 * vendor to vendor. Along with that, different vendors use different 291 * terminology, and the operating system itself uses occasionally overlapping 292 * terminology. 
It's important to understand what this topology looks like so 293 * one can understand the different things that we try to calculate and 294 * determine. 295 * 296 * To get started, let's talk about a little bit of terminology that we've used 297 * so far, is used throughout this file, and is fairly generic across multiple 298 * vendors: 299 * 300 * CPU 301 * A central processing unit (CPU) refers to a logical and/or virtual 302 * entity that the operating system can execute instructions on. The 303 * underlying resources for this CPU may be shared between multiple 304 * entities; however, to the operating system it is a discrete unit. 305 * 306 * PROCESSOR and PACKAGE 307 * 308 * Generally, when we use the term 'processor' on its own, we are referring 309 * to the physical entity that one buys and plugs into a board. However, 310 * because processor has been overloaded and one might see it used to mean 311 * multiple different levels, we will instead use the term 'package' for 312 * the rest of this file. The term package comes from the electrical 313 * engineering side and refers to the physical entity that encloses the 314 * electronics inside. Strictly speaking the package can contain more than 315 * just the CPU, for example, on many processors it may also have what's 316 * called an 'integrated graphical processing unit (GPU)'. Because the 317 * package can encapsulate multiple units, it is the largest physical unit 318 * that we refer to. 319 * 320 * SOCKET 321 * 322 * A socket refers to a unit on a system board (generally the motherboard) 323 * that can receive a package. A single package, or processor, is plugged 324 * into a single socket. A system may have multiple sockets. Oftentimes, 325 * the term socket is used interchangeably with package and refers to the 326 * electrical component that has been plugged in, and not the receptacle itself. 327 * 328 * CORE 329 * 330 * A core refers to the physical instantiation of a CPU, generally, with a 331 * full set of hardware resources available to it. A package may contain 332 * multiple cores inside of it or it may just have a single one. A 333 * processor with more than one core is often referred to as 'multi-core'. 334 * In illumos, we will use the feature X86FSET_CMP to refer to a system 335 * that has 'multi-core' processors. 336 * 337 * A core may expose a single logical CPU to the operating system, or it 338 * may expose multiple CPUs, which we call threads, defined below. 339 * 340 * Some resources may still be shared by cores in the same package. For 341 * example, many processors will share the level 3 cache between cores. 342 * Some AMD generations share hardware resources between cores. For more 343 * information on that see the section 'AMD Topology'. 344 * 345 * THREAD and STRAND 346 * 347 * In this file, generally a thread refers to a hardware resource and not 348 * the operating system's logical abstraction. A thread is always exposed 349 * as an independent logical CPU to the operating system. A thread belongs 350 * to a specific core. A core may have more than one thread. When that is 351 * the case, the threads that are part of the same core are often referred 352 * to as 'siblings'. 353 * 354 * When multiple threads exist, this is generally referred to as 355 * simultaneous multi-threading (SMT). When Intel introduced this in their 356 * processors they called it hyper-threading (HT). When multiple threads 357 * are active in a core, they split the resources of the core.
For example, 358 * two threads may share the same set of hardware execution units. 359 * 360 * The operating system often uses the term 'strand' to refer to a thread. 361 * This helps disambiguate it from the software concept. 362 * 363 * CHIP 364 * 365 * Unfortunately, the term 'chip' is dramatically overloaded. At its most 366 * base meaning, it is used to refer to a single integrated circuit, which 367 * may or may not be the only thing in the package. In illumos, when you 368 * see the term 'chip' it is almost always referring to the same thing as 369 * the 'package'. However, many vendors may use chip to refer to one of 370 * many integrated circuits that have been placed in the package. As an 371 * example, see the subsequent definition. 372 * 373 * To try and keep things consistent, we will only use chip when referring 374 * to the entire integrated circuit package, with the exception of the 375 * definition of multi-chip module (because it is in the name) and use the 376 * term 'die' when we want the more general, potential sub-component 377 * definition. 378 * 379 * DIE 380 * 381 * A die refers to an integrated circuit. Inside of the package there may 382 * be a single die or multiple dies. This is sometimes called a 'chip' in 383 * vendor's parlance, but in this file, we use the term die to refer to a 384 * subcomponent. 385 * 386 * MULTI-CHIP MODULE 387 * 388 * A multi-chip module (MCM) refers to putting multiple distinct chips that 389 * are connected together in the same package. When a multi-chip design is 390 * used, generally each chip is manufactured independently and then joined 391 * together in the package. For example, on AMD's Zen microarchitecture 392 * (family 0x17), the package contains several dies (the second meaning of 393 * chip from above) that are connected together. 394 * 395 * CACHE 396 * 397 * A cache is a part of the processor that maintains copies of recently 398 * accessed memory. Caches are split into levels and then into types. 399 * Commonly there are one to three levels, called level one, two, and 400 * three. The lower the level, the smaller it is, the closer it is to the 401 * execution units of the CPU, and the faster it is to access. The layout 402 * and design of the cache come in many different flavors, consult other 403 * resources for a discussion of those. 404 * 405 * Caches are generally split into two types, the instruction and data 406 * cache. The caches contain what their names suggest, the instruction 407 * cache has executable program text, while the data cache has all other 408 * memory that the processor accesses. As of this writing, data is kept 409 * coherent between all of the caches on x86, so if one modifies program 410 * text before it is executed, that will be in the data cache, and the 411 * instruction cache will be synchronized with that change when the 412 * processor actually executes those instructions. This coherency also 413 * covers the fact that data could show up in multiple caches. 414 * 415 * Generally, the lowest level caches are specific to a core. However, the 416 * last layer cache is shared between some number of cores. The number of 417 * CPUs sharing this last level cache is important. This has implications 418 * for the choices that the scheduler makes, as accessing memory that might 419 * be in a remote cache after thread migration can be quite expensive. 420 * 421 * Sometimes, the word cache is abbreviated with a '$', because in US 422 * English the word cache is pronounced the same as cash. 
So L1D$ refers to 423 * the L1 data cache, and L2$ would be the L2 cache. This will not be used 424 * in the rest of this theory statement for clarity. 425 * 426 * MEMORY CONTROLLER 427 * 428 * The memory controller is a component that provides access to DRAM. Each 429 * memory controller can access a set number of DRAM channels. Each channel 430 * can have a number of DIMMs (sticks of memory) associated with it. A 431 * given package may have more than one memory controller. The association 432 * of the memory controller to a group of cores is important as it is 433 * cheaper to access memory on the controller that you are associated with. 434 * 435 * NUMA 436 * 437 * NUMA or non-uniform memory access, describes a way that systems are 438 * built. On x86, any processor core can address all of the memory in the 439 * system. However, When using multiple sockets or possibly within a 440 * multi-chip module, some of that memory is physically closer and some of 441 * it is further. Memory that is further away is more expensive to access. 442 * Consider the following image of multiple sockets with memory: 443 * 444 * +--------+ +--------+ 445 * | DIMM A | +----------+ +----------+ | DIMM D | 446 * +--------+-+ | | | | +-+------+-+ 447 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E | 448 * +--------+-+ | | | | +-+------+-+ 449 * | DIMM C | +----------+ +----------+ | DIMM F | 450 * +--------+ +--------+ 451 * 452 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is 453 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to 454 * access DIMMs A-C and more expensive to access D-F as it has to go 455 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs 456 * D-F are cheaper than A-C. While the socket form is the most common, when 457 * using multi-chip modules, this can also sometimes occur. For another 458 * example of this that's more involved, see the AMD topology section. 459 * 460 * 461 * Intel Topology 462 * -------------- 463 * 464 * Most Intel processors since Nehalem, (as of this writing the current gen 465 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of 466 * the package is a single monolithic die. MCMs currently aren't used. Most 467 * parts have three levels of caches, with the L3 cache being shared between 468 * all of the cores on the package. The L1/L2 cache is generally specific to 469 * an individual core. The following image shows at a simplified level what 470 * this looks like. The memory controller is commonly part of something called 471 * the 'Uncore', that used to be separate physical chips that were not a part of 472 * the package, but are now part of the same chip. 
473 * 474 * +-----------------------------------------------------------------------+ 475 * | Package | 476 * | +-------------------+ +-------------------+ +-------------------+ | 477 * | | Core | | Core | | Core | | 478 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | 479 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | | 480 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | | 481 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | | 482 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | | 483 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | 484 * | | +--------------+ | | +--------------+ | | +--------------+ | | 485 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | | 486 * | | +--------------+ | | +--------------+ | | +--------------+ | | 487 * | +-------------------+ +-------------------+ +-------------------+ | 488 * | +-------------------------------------------------------------------+ | 489 * | | Shared L3 Cache | | 490 * | +-------------------------------------------------------------------+ | 491 * | +-------------------------------------------------------------------+ | 492 * | | Memory Controller | | 493 * | +-------------------------------------------------------------------+ | 494 * +-----------------------------------------------------------------------+ 495 * 496 * A side effect of this current architecture is that what we care about from a 497 * scheduling and topology perspective, is simplified. In general we care about 498 * understanding which logical CPUs are part of the same core and socket. 499 * 500 * To determine the relationship between threads and cores, Intel initially used 501 * the identifier in the advanced programmable interrupt controller (APIC). They 502 * also added cpuid leaf 4 to give additional information about the number of 503 * threads and CPUs in the processor. With the addition of x2apic (which 504 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an 505 * additional cpuid topology leaf 0xB was added. 506 * 507 * AMD Topology 508 * ------------ 509 * 510 * When discussing AMD topology, we want to break this into three distinct 511 * generations of topology. There's the basic topology that has been used in 512 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced 513 * with family 0x15 (Bulldozer), and there's the topology that was introduced 514 * with family 0x17 (Zen). AMD also has some additional terminology that's worth 515 * talking about. 516 * 517 * Until the introduction of family 0x17 (Zen), AMD did not implement something 518 * that they considered SMT. Whether or not the AMD processors have SMT 519 * influences many things including scheduling and reliability, availability, 520 * and serviceability (RAS) features. 521 * 522 * NODE 523 * 524 * AMD uses the term node to refer to a die that contains a number of cores 525 * and I/O resources. Depending on the processor family and model, more 526 * than one node can be present in the package. When there is more than one 527 * node this indicates a multi-chip module. Usually each node has its own 528 * access to memory and I/O devices. This is important and generally 529 * different from the corresponding Intel Nehalem-Skylake+ processors. As a 530 * result, we track this relationship in the operating system. 
531 * 532 * In processors with an L3 cache, the L3 cache is generally shared across 533 * the entire node, though the way this is carved up varies from generation 534 * to generation. 535 * 536 * BULLDOZER 537 * 538 * Starting with the Bulldozer family (0x15) and continuing until the 539 * introduction of the Zen microarchitecture, AMD introduced the idea of a 540 * compute unit. In a compute unit, two traditional cores share a number of 541 * hardware resources. Critically, they share the FPU, L1 instruction 542 * cache, and the L2 cache. Several compute units were then combined inside 543 * of a single node. Because the integer execution units, L1 data cache, 544 * and some other resources were not shared between the cores, AMD never 545 * considered this to be SMT. 546 * 547 * ZEN 548 * 549 * The Zen family (0x17) uses a multi-chip module (MCM) design, the module 550 * is called Zeppelin. These modules are similar to the idea of nodes used 551 * previously. Each of these nodes has two DRAM channels which all of the 552 * cores in the node can access uniformly. These nodes are linked together 553 * in the package, creating a NUMA environment. 554 * 555 * The Zeppelin die itself contains two different 'core complexes'. Each 556 * core complex consists of four cores which each have two threads, for a 557 * total of 8 logical CPUs per complex. Unlike other generations, 558 * where all the logical CPUs in a given node share the L3 cache, here each 559 * core complex has its own shared L3 cache. 560 * 561 * A further thing that we need to consider is that in some configurations, 562 * particularly with the Threadripper line of processors, not every die 563 * actually has its memory controllers wired up to actual memory channels. 564 * This means that some cores have memory attached to them and others 565 * don't. 566 * 567 * To put Zen in perspective, consider the following images: 568 * 569 * +--------------------------------------------------------+ 570 * | Core Complex | 571 * | +-------------------+ +-------------------+ +---+ | 572 * | | Core +----+ | | Core +----+ | | | | 573 * | | +--------+ | L2 | | | +--------+ | L2 | | | | | 574 * | | | Thread | +----+ | | | Thread | +----+ | | | | 575 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | | 576 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | | 577 * | | +--------+ +--+ | | +--------+ +--+ | | | | 578 * | +-------------------+ +-------------------+ | C | | 579 * | +-------------------+ +-------------------+ | a | | 580 * | | Core +----+ | | Core +----+ | | c | | 581 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | | 582 * | | | Thread | +----+ | | | Thread | +----+ | | e | | 583 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | | 584 * | | | Thread | |L1| | | | Thread | |L1| | | | | 585 * | | +--------+ +--+ | | +--------+ +--+ | | | | 586 * | +-------------------+ +-------------------+ +---+ | 587 * | | 588 * +--------------------------------------------------------+ 589 * 590 * This first image represents a single Zen core complex that consists of four 591 * cores. 592 * 593 * 594 * +--------------------------------------------------------+ 595 * | Zeppelin Die | 596 * | +--------------------------------------------------+ | 597 * | | I/O Units (PCIe, SATA, USB, etc.) 
| | 598 * | +--------------------------------------------------+ | 599 * | HH | 600 * | +-----------+ HH +-----------+ | 601 * | | | HH | | | 602 * | | Core |==========| Core | | 603 * | | Complex |==========| Complex | | 604 * | | | HH | | | 605 * | +-----------+ HH +-----------+ | 606 * | HH | 607 * | +--------------------------------------------------+ | 608 * | | Memory Controller | | 609 * | +--------------------------------------------------+ | 610 * | | 611 * +--------------------------------------------------------+ 612 * 613 * This image represents a single Zeppelin Die. Note how both cores are 614 * connected to the same memory controller and I/O units. While each core 615 * complex has its own L3 cache as seen in the first image, they both have 616 * uniform access to memory. 617 * 618 * 619 * PP PP 620 * PP PP 621 * +----------PP---------------------PP---------+ 622 * | PP PP | 623 * | +-----------+ +-----------+ | 624 * | | | | | | 625 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM 626 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM 627 * | | | | | | 628 * | +-----------+ooo ...+-----------+ | 629 * | HH ooo ... HH | 630 * | HH oo.. HH | 631 * | HH ..oo HH | 632 * | HH ... ooo HH | 633 * | +-----------+... ooo+-----------+ | 634 * | | | | | | 635 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM 636 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM 637 * | | | | | | 638 * | +-----------+ +-----------+ | 639 * | PP PP | 640 * +----------PP---------------------PP---------+ 641 * PP PP 642 * PP PP 643 * 644 * This image represents a single Zen package. In this example, it has four 645 * Zeppelin dies, though some configurations only have a single one. In this 646 * example, each die is directly connected to the next. Also, each die is 647 * represented as being connected to memory by the 'M' character and connected 648 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin 649 * die is made up of two core complexes, we have multiple different NUMA 650 * domains that we care about for these systems. 651 * 652 * CPUID LEAVES 653 * 654 * There are a few different CPUID leaves that we can use to try and understand 655 * the actual state of the world. As part of the introduction of family 0xf, AMD 656 * added CPUID leaf 0x80000008. This leaf tells us the number of logical 657 * processors that are in the system. Because families before Zen didn't have 658 * SMT, this was always the number of cores that were in the system. However, it 659 * should always be thought of as the number of logical threads to be consistent 660 * between generations. In addition we also get the size of the APIC ID that is 661 * used to represent the number of logical processors. This is important for 662 * deriving topology information. 663 * 664 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a 665 * bit between Bulldozer and later families, but it is quite useful in 666 * determining the topology information. Because this information has changed 667 * across family generations, it's worth calling out what these mean 668 * explicitly. The registers have the following meanings: 669 * 670 * %eax The APIC ID. The entire register is defined to have a 32-bit 671 * APIC ID, even though on systems without x2apic support, it will 672 * be limited to 8 bits. 673 * 674 * %ebx On Bulldozer-era systems this contains information about the 675 * number of cores that are in a compute unit (cores that share 676 * resources). 
It also contains a per-package compute unit ID that 677 * identifies which compute unit the logical CPU is a part of. 678 * 679 * On Zen-era systems this instead contains the number of threads 680 * per core and the ID of the core that the logical CPU is a part 681 * of. Note, this ID is unique only to the package; it is not 682 * globally unique across the entire system. 683 * 684 * %ecx This contains the number of nodes that exist in the package. It 685 * also contains an ID that identifies which node the logical CPU 686 * is a part of. 687 * 688 * Finally, we also use cpuid leaf 0x8000001D to determine information about the 689 * cache layout to determine which logical CPUs are sharing which caches. 690 * 691 * illumos Topology 692 * ---------------- 693 * 694 * Based on the above we synthesize the information into several different 695 * variables that we store in the 'struct cpuid_info'. We'll go into the details 696 * of what each member is supposed to represent and their uniqueness. In 697 * general, there are two levels of uniqueness that we care about. We care about 698 * an ID that is globally unique. That means that it will be unique across all 699 * entities in the system. For example, the default logical CPU ID is globally 700 * unique. On the other hand, there is some information that we only care about 701 * being unique within the context of a single package / socket. Here are the 702 * variables that we keep track of and their meaning. 703 * 704 * Several of the values that are asking for an identifier, with the exception 705 * of cpi_apicid, are allowed to be synthetic. 706 * 707 * 708 * cpi_apicid 709 * 710 * This is the value of the CPU's APIC id. This should be the full 32-bit 711 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit 712 * APIC ID. This value is globally unique between all logical CPUs across 713 * all packages. This is usually required by the APIC. 714 * 715 * cpi_chipid 716 * 717 * This value indicates the ID of the package that the logical CPU is a 718 * part of. This value is allowed to be synthetic. It is usually derived by 719 * taking the CPU's APIC ID and determining how many bits are used to 720 * represent CPU cores in the package. All logical CPUs that are part of 721 * the same package must have the same value. 722 * 723 * cpi_coreid 724 * 725 * This represents the ID of a CPU core. Two logical CPUs should only have 726 * the same cpi_coreid value if they are part of the same core. These 727 * values may be synthetic. On systems that support SMT, this value is 728 * usually derived from the APIC ID, otherwise it is often synthetic and 729 * just set to the value of the cpu_id in the cpu_t. 730 * 731 * cpi_pkgcoreid 732 * 733 * This is similar to the cpi_coreid in that logical CPUs that are part of 734 * the same core should have the same ID. The main difference is that these 735 * values are only required to be unique to a given socket. 736 * 737 * cpi_clogid 738 * 739 * This represents the logical ID of a logical CPU. This value should be 740 * unique within a given socket for each logical CPU. This is allowed to be 741 * synthetic, though it is usually based on the CPU's APIC ID. The 742 * broader system expects that logical CPUs that are part of the same 743 * core have contiguous numbers. For example, if there were two threads per 744 * core, then the IDs of two sibling threads divided by two should be the same, and the first ID 745 * modulo two should be zero while the second should be one, as the short sketch below illustrates.
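 *
 * A minimal sketch of that relationship (purely illustrative; clogid_a and
 * clogid_b stand in for the cpi_clogid values of two logical CPUs, and the
 * real derivation is done by the pass 1 topology code in this file):
 *
 *      // With two threads per core, one low-order bit of the logical ID
 *      // selects the sibling thread and the remaining bits identify the core.
 *      uint_t shift = 1;       // log2(threads per core)
 *      boolean_t siblings = ((clogid_a >> shift) == (clogid_b >> shift));
 *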
Concretely, IDs 4 and 5 746 * indicate two logical CPUs that are part of the same core, while IDs 5 and 6 747 * represent two logical CPUs that are part of different cores. 748 * 749 * While it is common for the cpi_coreid and the cpi_clogid to be derived 750 * from the same source, strictly speaking, they don't have to be and the 751 * two values should be considered logically independent. One should not 752 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine 753 * some kind of relationship. While this is tempting, we've seen cases on 754 * AMD family 0xf where the system's cpu id is not related to its APIC ID. 755 * 756 * cpi_ncpu_per_chip 757 * 758 * This value indicates the total number of logical CPUs that exist in the 759 * physical package. Critically, this is not the number of logical CPUs 760 * that exist for just the single core. 761 * 762 * This value should be the same for all logical CPUs in the same package. 763 * 764 * cpi_ncore_per_chip 765 * 766 * This value indicates the total number of physical CPU cores that exist 767 * in the package. The system compares this value with cpi_ncpu_per_chip to 768 * determine if simultaneous multi-threading (SMT) is enabled. When 769 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and 770 * the X86FSET_HTT feature is not set. If this value is greater than one, 771 * then we consider the processor to have the feature X86FSET_CMP, to 772 * indicate that there is support for more than one core. 773 * 774 * This value should be the same for all logical CPUs in the same package. 775 * 776 * cpi_procnodes_per_pkg 777 * 778 * This value indicates the number of 'nodes' that exist in the package. 779 * When processors are actually a multi-chip module, this represents the 780 * number of such modules that exist in the package. Currently, on Intel 781 * based systems this member is always set to 1. 782 * 783 * This value should be the same for all logical CPUs in the same package. 784 * 785 * cpi_procnodeid 786 * 787 * This value indicates the ID of the node that the logical CPU is a part 788 * of. All logical CPUs that are in the same node must have the same value 789 * here. This value must be unique across all of the packages in the 790 * system. On Intel based systems, this is currently set to the value in 791 * cpi_chipid because there is only one node. 792 * 793 * cpi_cores_per_compunit 794 * 795 * This value indicates the number of cores that are part of a compute 796 * unit. See the AMD topology section for this. This member only has real 797 * meaning currently for AMD Bulldozer family processors. For all other 798 * processors, this should currently be set to 1. 799 * 800 * cpi_compunitid 801 * 802 * This indicates the compute unit that the logical CPU belongs to. For 803 * processors without AMD Bulldozer-style compute units this should be set 804 * to the value of cpi_coreid. 805 * 806 * cpi_ncpu_shr_last_cache 807 * 808 * This indicates the number of logical CPUs that are sharing the same last 809 * level cache. This value should be the same for all CPUs that are sharing 810 * that cache. The last cache refers to the cache that is closest to memory 811 * and furthest away from the CPU. 812 * 813 * cpi_last_lvl_cacheid 814 * 815 * This indicates the ID of the last cache that the logical CPU uses. This 816 * cache is often shared between multiple logical CPUs and is the cache 817 * that is closest to memory and furthest away from the CPU.
This value 818 * should be the same for a group of logical CPUs only if they actually 819 * share the same last level cache. IDs should not overlap between 820 * packages. 821 * 822 * cpi_ncore_bits 823 * 824 * This indicates the number of bits that are required to represent all of 825 * the cores in the system. As cores are derived based on their APIC IDs, 826 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for 827 * this value to be larger than the actual number of IDs that are present 828 * in the system. This is used to size tables by the CMI framework. It is 829 * only filled in for Intel and AMD CPUs. 830 * 831 * cpi_nthread_bits 832 * 833 * This indicates the number of bits required to represent all of the IDs 834 * that cover the logical CPUs that exist on a given core. It's OK for this 835 * value to be larger than the actual number of IDs that are present in the 836 * system. This is used to size tables by the CMI framework. It is 837 * only filled in for Intel and AMD CPUs. 838 * 839 * ----------- 840 * Hypervisors 841 * ----------- 842 * 843 * If trying to manage the differences between vendors wasn't bad enough, it can 844 * get worse thanks to our friend hardware virtualization. Hypervisors are given 845 * the ability to interpose on all cpuid instructions and change them to suit 846 * their purposes. In general, this is necessary as the hypervisor wants to be 847 * able to present a more uniform set of features or not necessarily give the 848 * guest operating system kernel knowledge of all features so it can be 849 * more easily migrated between systems. 850 * 851 * When it comes to trying to determine topology information, this can be a 852 * double edged sword. When a hypervisor doesn't actually implement a cpuid 853 * leaf, it'll often return all zeros. Because of that, you'll often see various 854 * checks scattered about fields being non-zero before we assume we can use 855 * them. 856 * 857 * When it comes to topology information, the hypervisor is often incentivized 858 * to lie to you about topology. This is because it doesn't always actually 859 * guarantee that topology at all. The topology path we take in the system 860 * depends on how the CPU advertises itself. If it advertises itself as an Intel 861 * or AMD CPU, then we basically do our normal path. However, when they don't 862 * use an actual vendor, then that usually turns into multiple one-core CPUs 863 * that we enumerate that are often on different sockets. The actual behavior 864 * depends greatly on what the hypervisor actually exposes to us. 865 * 866 * -------------------- 867 * Exposing Information 868 * -------------------- 869 * 870 * We expose CPUID information in three different forms in the system. 871 * 872 * The first is through the x86_featureset variable. This is used in conjunction 873 * with the is_x86_feature() function. This is queried by x86-specific functions 874 * to determine which features are or aren't present in the system and to make 875 * decisions based upon them. For example, users of this include everything from 876 * parts of the system dedicated to reliability, availability, and 877 * serviceability (RAS), to making decisions about how to handle security 878 * mitigations, to various x86-specific drivers. General purpose or 879 * architecture independent drivers should never be calling this function. 880 * 881 * The second means is through the auxiliary vector. 
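 *
 * From the userland side, the end result of this mechanism is most easily
 * consumed through getisax(3C); a hedged sketch (AV_386_AVX comes from
 * <sys/auxv_386.h>, and which bits a given program cares about is up to it):
 *
 *      #include <sys/types.h>
 *      #include <sys/auxv.h>
 *      #include <sys/auxv_386.h>
 *
 *      static int
 *      have_avx(void)
 *      {
 *              uint32_t hw[2] = { 0, 0 };
 *
 *              // Fetch the AT_SUN_HWCAP word(s) describing this hardware.
 *              (void) getisax(hw, 2);
 *              return ((hw[0] & AV_386_AVX) != 0);
 *      }
 *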
The auxiliary vector is a 882 * series of tagged data that the kernel passes down to a user program when it 883 * begins executing. This information is used to indicate to programs what 884 * instruction set extensions are present. For example, information about the 885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down 886 * since user programs cannot make use of it. However, things like the AVX 887 * instruction sets are. Programs use this information to make run-time 888 * decisions about what features they should use. As an example, the run-time 889 * link-editor (rtld) can relocate different functions depending on the hardware 890 * support available. 891 * 892 * The final form is through a series of accessor functions that all have the 893 * form cpuid_get*. This is used by a number of different subsystems in the 894 * kernel to determine more detailed information about what we're running on, 895 * topology information, etc. Some of these subsystems include processor groups 896 * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI, 897 * microcode, and performance monitoring. These functions all ASSERT that the 898 * CPU they're being called on has reached a certain cpuid pass. If the passes 899 * are rearranged, then this needs to be adjusted. 900 * 901 * ----------------------------------------------- 902 * Speculative Execution CPU Side Channel Security 903 * ----------------------------------------------- 904 * 905 * With the advent of the Spectre and Meltdown attacks which exploit speculative 906 * execution in the CPU to create side channels there have been a number of 907 * different attacks and corresponding issues that the operating system needs to 908 * mitigate against. The following list is some of the common, but not 909 * exhaustive, set of issues that we know about and have done some or need to do 910 * more work in the system to mitigate against: 911 * 912 * - Spectre v1 913 * - swapgs (Spectre v1 variant) 914 * - Spectre v2 915 * - Meltdown (Spectre v3) 916 * - Rogue Register Read (Spectre v3a) 917 * - Speculative Store Bypass (Spectre v4) 918 * - ret2spec, SpectreRSB 919 * - L1 Terminal Fault (L1TF) 920 * - Microarchitectural Data Sampling (MDS) 921 * 922 * Each of these requires different sets of mitigations and has different attack 923 * surfaces. For the most part, this discussion is about protecting the kernel 924 * from non-kernel executing environments such as user processes and hardware 925 * virtual machines. Unfortunately, there are a number of user vs. user 926 * scenarios that exist with these. The rest of this section will describe the 927 * overall approach that the system has taken to address these as well as their 928 * shortcomings. Unfortunately, not all of the above have been handled today. 929 * 930 * SPECTRE v2, ret2spec, SpectreRSB 931 * 932 * The second variant of the spectre attack focuses on performing branch target 933 * injection. This generally impacts indirect call instructions in the system. 934 * There are three different ways to mitigate this issue that are commonly 935 * described today: 936 * 937 * 1. Using Indirect Branch Restricted Speculation (IBRS). 938 * 2. Using Retpolines and RSB Stuffing 939 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS) 940 * 941 * IBRS uses a feature added to microcode to restrict speculation, among other 942 * things. 
This form of mitigation has not been used as it has been generally 943 * seen as too expensive and requires reactivation upon various transitions in 944 * the system. 945 * 946 * As a less impactful alternative to IBRS, retpolines were developed by 947 * Google. These basically require one to replace indirect calls with a specific 948 * trampoline that will cause speculation to fail and break the attack. 949 * Retpolines require compiler support. We always build with retpolines in the 950 * external thunk mode. This means that a traditional indirect call is replaced 951 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect 952 * of this is that all indirect function calls are performed through a register. 953 * 954 * We have to use a common external location of the thunk and not inline it into 955 * the callsite so that way we can have a single place to patch these functions. 956 * As it turns out, we actually have three different forms of retpolines that 957 * exist in the system: 958 * 959 * 1. A full retpoline 960 * 2. An AMD-specific optimized retpoline 961 * 3. A no-op version 962 * 963 * The first one is used in the general case. The second one is used if we can 964 * determine that we're on an AMD system and we can successfully toggle the 965 * lfence serializing MSR that exists on the platform. Basically with this 966 * present, an lfence is sufficient and we don't need to do anywhere near as 967 * complicated a dance to successfully use retpolines. 968 * 969 * The third form described above is the most curious. It turns out that the way 970 * that retpolines are implemented is that they rely on how speculation is 971 * performed on a 'ret' instruction. Intel has continued to optimize this 972 * process (which is partly why we need to have return stack buffer stuffing, 973 * but more on that in a bit) and in processors starting with Cascade Lake 974 * on the server side, it's dangerous to rely on retpolines. Instead, a new 975 * mechanism has been introduced called Enhanced IBRS (EIBRS). 976 * 977 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each 978 * physical core. However, if this is the case, we don't want to use retpolines 979 * any more. Therefore if EIBRS is present, we end up turning each retpoline 980 * function (called a thunk) into a jmp instruction. This means that we're still 981 * paying the cost of an extra jump to the external thunk, but it gives us 982 * flexibility and the ability to have a single kernel image that works across a 983 * wide variety of systems and hardware features. 984 * 985 * Unfortunately, this alone is insufficient. First, Skylake systems have 986 * additional speculation for the Return Stack Buffer (RSB) which is used to 987 * return from call instructions which retpolines take advantage of. However, 988 * this problem is not just limited to Skylake and is actually more pernicious. 989 * The SpectreRSB paper introduces several more problems that can arise with 990 * dealing with this. The RSB can be poisoned just like the indirect branch 991 * predictor. This means that one needs to clear the RSB when transitioning 992 * between two different privilege domains. Some examples include: 993 * 994 * - Switching between two different user processes 995 * - Going between user land and the kernel 996 * - Returning to the kernel from a hardware virtual machine 997 * 998 * Mitigating this involves combining a couple of different things. 
The first is 999 * SMEP (supervisor mode execution protection) which was introduced in Ivy 1000 * Bridge. When an RSB entry refers to a user address and we're executing in the 1001 * kernel, speculation through it will be stopped when SMEP is enabled. This 1002 * protects against a number of the different cases that we would normally be 1003 * worried about such as when we enter the kernel from user land. 1004 * 1005 * To prevent against additional manipulation of the RSB from other contexts 1006 * such as a non-root VMX context attacking the kernel we first look to enhanced 1007 * IBRS. When EIBRS is present and enabled, then there is nothing else that we 1008 * need to do to protect the kernel at this time. 1009 * 1010 * On CPUs without EIBRS we need to manually overwrite the contents of the 1011 * return stack buffer. We do this through the x86_rsb_stuff() function. 1012 * Currently this is employed on context switch. The x86_rsb_stuff() function is 1013 * disabled when enhanced IBRS is present because Intel claims on such systems 1014 * it will be ineffective. Stuffing the RSB in context switch helps prevent user 1015 * to user attacks via the RSB. 1016 * 1017 * If SMEP is not present, then we would have to stuff the RSB every time we 1018 * transitioned from user mode to the kernel, which isn't very practical right 1019 * now. 1020 * 1021 * To fully protect user to user and vmx to vmx attacks from these classes of 1022 * issues, we would also need to allow them to opt into performing an Indirect 1023 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up. 1024 * 1025 * By default, the system will enable RSB stuffing and the required variant of 1026 * retpolines and store that information in the x86_spectrev2_mitigation value. 1027 * This will be evaluated after a microcode update as well, though it is 1028 * expected that microcode updates will not take away features. This may mean 1029 * that a late loaded microcode may not end up in the optimal configuration 1030 * (though this should be rare). 1031 * 1032 * Currently we do not build kmdb with retpolines or perform any additional side 1033 * channel security mitigations for it. One complication with kmdb is that it 1034 * requires its own retpoline thunks and it would need to adjust itself based on 1035 * what the kernel does. The threat model of kmdb is more limited and therefore 1036 * it may make more sense to investigate using prediction barriers as the whole 1037 * system is only executing a single instruction at a time while in kmdb. 1038 * 1039 * SPECTRE v1, v4 1040 * 1041 * The v1 and v4 variants of spectre are not currently mitigated in the 1042 * system and require other classes of changes to occur in the code. 1043 * 1044 * SPECTRE v1 (SWAPGS VARIANT) 1045 * 1046 * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but 1047 * can generally affect any branch-dependent code. The swapgs issue is one 1048 * variant of this. If we are coming in from userspace, we can have code like 1049 * this: 1050 * 1051 * cmpw $KCS_SEL, REGOFF_CS(%rsp) 1052 * je 1f 1053 * movq $0, REGOFF_SAVFP(%rsp) 1054 * swapgs 1055 * 1: 1056 * movq %gs:CPU_THREAD, %rax 1057 * 1058 * If an attacker can cause a mis-speculation of the branch here, we could skip 1059 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based 1060 * load. If subsequent code can act as the usual Spectre cache gadget, this 1061 * would potentially allow KPTI bypass. 
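 *
 * The mitigated form of the sequence above (the reasoning follows in the next
 * paragraphs) simply places an lfence at the point where the two paths join,
 * before the first %gs-based access:
 *
 *      cmpw    $KCS_SEL, REGOFF_CS(%rsp)
 *      je      1f
 *      movq    $0, REGOFF_SAVFP(%rsp)
 *      swapgs
 *      1:
 *      lfence
 *      movq    %gs:CPU_THREAD, %rax
 *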
To fix this, we need an lfence prior to 1062 * any use of the %gs override. 1063 * 1064 * The other case is also an issue: if we're coming into a trap from kernel 1065 * space, we could mis-speculate and swapgs the user %gsbase back in prior to 1066 * using it. AMD systems are not vulnerable to this version, as a swapgs is 1067 * serializing with respect to subsequent uses. But as AMD /does/ need the other 1068 * case, and the fix is the same in both cases (an lfence at the branch target 1069 * 1: in this example), we'll just do it unconditionally. 1070 * 1071 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it 1072 * harder for user-space to actually set a useful %gsbase value: although it's 1073 * not clear, it might still be feasible via lwp_setprivate(), though, so we 1074 * mitigate anyway. 1075 * 1076 * MELTDOWN 1077 * 1078 * Meltdown, or spectre v3, allowed a user process to read any data in their 1079 * address space regardless of whether or not the page tables in question 1080 * allowed the user to have the ability to read them. The solution to meltdown 1081 * is kernel page table isolation. In this world, there are two page tables that 1082 * are used for a process, one in user land and one in the kernel. To implement 1083 * this we use per-CPU page tables and switch between the user and kernel 1084 * variants when entering and exiting the kernel. For more information about 1085 * this process and how the trampolines work, please see the big theory 1086 * statements and additional comments in: 1087 * 1088 * - uts/i86pc/ml/kpti_trampolines.s 1089 * - uts/i86pc/vm/hat_i86.c 1090 * 1091 * While Meltdown only impacted Intel systems and there are also Intel systems 1092 * that have Meltdown fixed (called Rogue Data Cache Load), we always have 1093 * kernel page table isolation enabled. While this may at first seem weird, an 1094 * important thing to remember is that you can't speculatively read an address 1095 * if it's never in your page table at all. Having user processes without kernel 1096 * pages present provides us with an important layer of defense in the kernel 1097 * against any other side channel attacks that exist and have yet to be 1098 * discovered. As such, kernel page table isolation (KPTI) is always enabled by 1099 * default, no matter the x86 system. 1100 * 1101 * L1 TERMINAL FAULT 1102 * 1103 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative 1104 * execution uses page table entries. Effectively, it is two different problems. 1105 * The first is that it ignores the not present bit in the page table entries 1106 * when performing speculative execution. This means that something can 1107 * speculatively read the listed physical address if it's present in the L1 1108 * cache under certain conditions (see Intel's documentation for the full set of 1109 * conditions). Secondly, this can be used to bypass hardware virtualization 1110 * extended page tables (EPT) that are part of Intel's hardware virtual machine 1111 * instructions. 1112 * 1113 * For the non-hardware virtualized case, this is relatively easy to deal with. 1114 * We must make sure that all unmapped pages have an address of zero. This means 1115 * that they could read the first 4k of physical memory; however, we never use 1116 * that first page in the operating system and always skip putting it in our 1117 * memory map, even if firmware tells us we can use it in our memory map. 
While 1118 * other systems try to put extra metadata in the address and reserved bits, 1119 * which led to this being problematic in those cases, we do not. 1120 * 1121 * For hardware virtual machines things are more complicated. Because they can 1122 * construct their own page tables, it isn't hard for them to perform this 1123 * attack against any physical address. The one wrinkle is that this physical 1124 * address must be in the L1 data cache. Thus Intel added an MSR that we can use 1125 * to flush the L1 data cache. We wrap this up in the function 1126 * spec_uarch_flush(). This function is also used in the mitigation of 1127 * microarchitectural data sampling (MDS) discussed later on. Kernel based 1128 * hypervisors such as KVM or bhyve are responsible for performing this before 1129 * entering the guest. 1130 * 1131 * Because this attack takes place in the L1 cache, there's another wrinkle 1132 * here. The L1 cache is shared between all logical CPUs in a core in most Intel 1133 * designs. This means that when a thread enters a hardware virtualized context 1134 * and flushes the L1 data cache, the other thread on the processor may then go 1135 * ahead and put new data in it that can be potentially attacked. While one 1136 * solution is to disable SMT on the system, another option that is available is 1137 * to use a feature for hardware virtualization called 'SMT exclusion'. This 1138 * goes through and makes sure that if a HVM is being scheduled on one thread, 1139 * then the thing on the other thread is from the same hardware virtual machine. 1140 * If an interrupt comes in or the guest exits to the broader system, then the 1141 * other SMT thread will be kicked out. 1142 * 1143 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the 1144 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not 1145 * perform L1TF related mitigations. 1146 * 1147 * MICROARCHITECTURAL DATA SAMPLING 1148 * 1149 * Microarchitectural data sampling (MDS) is a combination of four discrete 1150 * vulnerabilities that are similar issues affecting various parts of the CPU's 1151 * microarchitectural implementation around load, store, and fill buffers. 1152 * Specifically it is made up of the following subcomponents: 1153 * 1154 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS) 1155 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS) 1156 * 3. Microarchitectural Load Port Data Sampling (MLPDS) 1157 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM) 1158 * 1159 * To begin addressing these, Intel has introduced another feature in microcode 1160 * called MD_CLEAR. This changes the verw instruction to operate in a different 1161 * way. This allows us to execute the verw instruction in a particular way to 1162 * flush the state of the affected parts. The L1TF L1D flush mechanism is also 1163 * updated when this microcode is present to flush this state. 1164 * 1165 * Primarily we need to flush this state whenever we transition from the kernel 1166 * to a less privileged context such as user mode or an HVM guest. MSBDS is a 1167 * little bit different. Here the structures are statically sized when a logical 1168 * CPU is in use and resized when it goes to sleep. Therefore, we also need to 1169 * flush the microarchitectural state before the CPU goes idles by calling hlt, 1170 * mwait, or another ACPI method. To perform these flushes, we call 1171 * x86_md_clear() at all of these transition points. 
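 *
 * As a purely illustrative sketch (this is not the actual x86_md_clear()
 * implementation; md_clear_sketch() and the use of KDS_SEL here are
 * hypothetical), once the MD_CLEAR microcode is loaded the flush itself
 * reduces to executing verw against any valid selector, e.g.:
 *
 *	static inline void
 *	md_clear_sketch(void)
 *	{
 *		uint16_t sel = KDS_SEL;
 *
 *		__asm__ __volatile__("verw %0" : : "m" (sel) : "cc");
 *	}
 *
 * The interesting part is not the instruction itself, but making sure that it
 * is issued at every transition to a less privileged context and before the
 * CPU idles, as described above.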
1172 * 1173 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF, 1174 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If 1175 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes 1176 * a no-op. 1177 * 1178 * Unfortunately, with this issue hyperthreading rears its ugly head. In 1179 * particular, everything we've discussed above is only valid for a single 1180 * thread executing on a core. In the case where you have hyper-threading 1181 * present, this attack can be performed between threads. The theoretical fix 1182 * for this is to ensure that both threads are always in the same security 1183 * domain. This means that they are executing in the same ring and mutually 1184 * trust each other. Practically speaking, this would mean that a system call 1185 * would have to issue an inter-processor interrupt (IPI) to the other thread. 1186 * Rather than implement this, we recommend that one disables hyper-threading 1187 * through the use of psradm -aS. 1188 * 1189 * SUMMARY 1190 * 1191 * The following table attempts to summarize the mitigations for various issues 1192 * and what's done in various places: 1193 * 1194 * - Spectre v1: Not currently mitigated 1195 * - swapgs: lfences after swapgs paths 1196 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support 1197 * - Meltdown: Kernel Page Table Isolation 1198 * - Spectre v3a: Updated CPU microcode 1199 * - Spectre v4: Not currently mitigated 1200 * - SpectreRSB: SMEP and RSB Stuffing 1201 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode 1202 * - MDS: x86_md_clear, requires microcode, disabling hyper threading 1203 * 1204 * The following table indicates the x86 feature set bits that indicate that a 1205 * given problem has been solved or a notable feature is present: 1206 * 1207 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS 1208 * - MDS_NO: All forms of MDS 1209 */ 1210 1211 #include <sys/types.h> 1212 #include <sys/archsystm.h> 1213 #include <sys/x86_archext.h> 1214 #include <sys/kmem.h> 1215 #include <sys/systm.h> 1216 #include <sys/cmn_err.h> 1217 #include <sys/sunddi.h> 1218 #include <sys/sunndi.h> 1219 #include <sys/cpuvar.h> 1220 #include <sys/processor.h> 1221 #include <sys/sysmacros.h> 1222 #include <sys/pg.h> 1223 #include <sys/fp.h> 1224 #include <sys/controlregs.h> 1225 #include <sys/bitmap.h> 1226 #include <sys/auxv_386.h> 1227 #include <sys/memnode.h> 1228 #include <sys/pci_cfgspace.h> 1229 #include <sys/comm_page.h> 1230 #include <sys/mach_mmu.h> 1231 #include <sys/ucode.h> 1232 #include <sys/tsc.h> 1233 #include <sys/kobj.h> 1234 #include <sys/asm_misc.h> 1235 1236 #ifdef __xpv 1237 #include <sys/hypervisor.h> 1238 #else 1239 #include <sys/ontrap.h> 1240 #endif 1241 1242 uint_t x86_vendor = X86_VENDOR_IntelClone; 1243 uint_t x86_type = X86_TYPE_OTHER; 1244 uint_t x86_clflush_size = 0; 1245 1246 #if defined(__xpv) 1247 int x86_use_pcid = 0; 1248 int x86_use_invpcid = 0; 1249 #else 1250 int x86_use_pcid = -1; 1251 int x86_use_invpcid = -1; 1252 #endif 1253 1254 typedef enum { 1255 X86_SPECTREV2_RETPOLINE, 1256 X86_SPECTREV2_RETPOLINE_AMD, 1257 X86_SPECTREV2_ENHANCED_IBRS, 1258 X86_SPECTREV2_DISABLED 1259 } x86_spectrev2_mitigation_t; 1260 1261 uint_t x86_disable_spectrev2 = 0; 1262 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation = 1263 X86_SPECTREV2_RETPOLINE; 1264 1265 uint_t pentiumpro_bug4046376; 1266 1267 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)]; 1268 1269 static char 
*x86_feature_names[NUM_X86_FEATURES] = { 1270 "lgpg", 1271 "tsc", 1272 "msr", 1273 "mtrr", 1274 "pge", 1275 "de", 1276 "cmov", 1277 "mmx", 1278 "mca", 1279 "pae", 1280 "cv8", 1281 "pat", 1282 "sep", 1283 "sse", 1284 "sse2", 1285 "htt", 1286 "asysc", 1287 "nx", 1288 "sse3", 1289 "cx16", 1290 "cmp", 1291 "tscp", 1292 "mwait", 1293 "sse4a", 1294 "cpuid", 1295 "ssse3", 1296 "sse4_1", 1297 "sse4_2", 1298 "1gpg", 1299 "clfsh", 1300 "64", 1301 "aes", 1302 "pclmulqdq", 1303 "xsave", 1304 "avx", 1305 "vmx", 1306 "svm", 1307 "topoext", 1308 "f16c", 1309 "rdrand", 1310 "x2apic", 1311 "avx2", 1312 "bmi1", 1313 "bmi2", 1314 "fma", 1315 "smep", 1316 "smap", 1317 "adx", 1318 "rdseed", 1319 "mpx", 1320 "avx512f", 1321 "avx512dq", 1322 "avx512pf", 1323 "avx512er", 1324 "avx512cd", 1325 "avx512bw", 1326 "avx512vl", 1327 "avx512fma", 1328 "avx512vbmi", 1329 "avx512_vpopcntdq", 1330 "avx512_4vnniw", 1331 "avx512_4fmaps", 1332 "xsaveopt", 1333 "xsavec", 1334 "xsaves", 1335 "sha", 1336 "umip", 1337 "pku", 1338 "ospke", 1339 "pcid", 1340 "invpcid", 1341 "ibrs", 1342 "ibpb", 1343 "stibp", 1344 "ssbd", 1345 "ssbd_virt", 1346 "rdcl_no", 1347 "ibrs_all", 1348 "rsba", 1349 "ssb_no", 1350 "stibp_all", 1351 "flush_cmd", 1352 "l1d_vmentry_no", 1353 "fsgsbase", 1354 "clflushopt", 1355 "clwb", 1356 "monitorx", 1357 "clzero", 1358 "xop", 1359 "fma4", 1360 "tbm", 1361 "avx512_vnni", 1362 "amd_pcec", 1363 "mb_clear", 1364 "mds_no", 1365 "core_thermal", 1366 "pkg_thermal" 1367 }; 1368 1369 boolean_t 1370 is_x86_feature(void *featureset, uint_t feature) 1371 { 1372 ASSERT(feature < NUM_X86_FEATURES); 1373 return (BT_TEST((ulong_t *)featureset, feature)); 1374 } 1375 1376 void 1377 add_x86_feature(void *featureset, uint_t feature) 1378 { 1379 ASSERT(feature < NUM_X86_FEATURES); 1380 BT_SET((ulong_t *)featureset, feature); 1381 } 1382 1383 void 1384 remove_x86_feature(void *featureset, uint_t feature) 1385 { 1386 ASSERT(feature < NUM_X86_FEATURES); 1387 BT_CLEAR((ulong_t *)featureset, feature); 1388 } 1389 1390 boolean_t 1391 compare_x86_featureset(void *setA, void *setB) 1392 { 1393 /* 1394 * We assume that the unused bits of the bitmap are always zero. 1395 */ 1396 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) { 1397 return (B_TRUE); 1398 } else { 1399 return (B_FALSE); 1400 } 1401 } 1402 1403 void 1404 print_x86_featureset(void *featureset) 1405 { 1406 uint_t i; 1407 1408 for (i = 0; i < NUM_X86_FEATURES; i++) { 1409 if (is_x86_feature(featureset, i)) { 1410 cmn_err(CE_CONT, "?x86_feature: %s\n", 1411 x86_feature_names[i]); 1412 } 1413 } 1414 } 1415 1416 /* Note: This is the maximum size for the CPU, not the size of the structure. */ 1417 static size_t xsave_state_size = 0; 1418 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE); 1419 boolean_t xsave_force_disable = B_FALSE; 1420 extern int disable_smap; 1421 1422 /* 1423 * This is set to platform type we are running on. 1424 */ 1425 static int platform_type = -1; 1426 1427 #if !defined(__xpv) 1428 /* 1429 * Variable to patch if hypervisor platform detection needs to be 1430 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0). 1431 */ 1432 int enable_platform_detection = 1; 1433 #endif 1434 1435 /* 1436 * monitor/mwait info. 1437 * 1438 * size_actual and buf_actual are the real address and size allocated to get 1439 * proper mwait_buf alignement. buf_actual and size_actual should be passed 1440 * to kmem_free(). 
Currently kmem_alloc() and mwait happen to both use 1441 * processor cache-line alignment, but this is not guaranteed in the future. 1442 */ 1443 struct mwait_info { 1444 size_t mon_min; /* min size to avoid missed wakeups */ 1445 size_t mon_max; /* size to avoid false wakeups */ 1446 size_t size_actual; /* size actually allocated */ 1447 void *buf_actual; /* memory actually allocated */ 1448 uint32_t support; /* processor support of monitor/mwait */ 1449 }; 1450 1451 /* 1452 * xsave/xrestor info. 1453 * 1454 * This structure contains HW feature bits and the size of the xsave save area. 1455 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure 1456 * (xsave_state) to describe the xsave layout. However, at runtime the 1457 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The 1458 * xsave_state structure simply represents the legacy layout of the beginning 1459 * of the xsave area. 1460 */ 1461 struct xsave_info { 1462 uint32_t xsav_hw_features_low; /* Supported HW features */ 1463 uint32_t xsav_hw_features_high; /* Supported HW features */ 1464 size_t xsav_max_size; /* max size save area for HW features */ 1465 size_t ymm_size; /* AVX: size of ymm save area */ 1466 size_t ymm_offset; /* AVX: offset for ymm save area */ 1467 size_t bndregs_size; /* MPX: size of bndregs save area */ 1468 size_t bndregs_offset; /* MPX: offset for bndregs save area */ 1469 size_t bndcsr_size; /* MPX: size of bndcsr save area */ 1470 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */ 1471 size_t opmask_size; /* AVX512: size of opmask save */ 1472 size_t opmask_offset; /* AVX512: offset for opmask save */ 1473 size_t zmmlo_size; /* AVX512: size of zmm 256 save */ 1474 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */ 1475 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */ 1476 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */ 1477 }; 1478 1479 1480 /* 1481 * These constants determine how many of the elements of the 1482 * cpuid we cache in the cpuid_info data structure; the 1483 * remaining elements are accessible via the cpuid instruction (see the sketch below). 1484 */ 1485 1486 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */ 1487 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */ 1488 1489 /* 1490 * See the big theory statement for a more detailed explanation of what some of 1491 * these members mean.
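 *
 * Leaves outside of the ranges cached above (NMAX_CPI_STD and NMAX_CPI_EXTD)
 * are not stored in this structure. As a purely illustrative sketch (leaf
 * 0x16 is just an example of a basic leaf that is not cached), such a leaf
 * is simply re-queried on demand with __cpuid_insn():
 *
 *	struct cpuid_regs cp = { 0 };
 *
 *	cp.cp_eax = 0x16;
 *	(void) __cpuid_insn(&cp);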
1492 */ 1493 struct cpuid_info { 1494 uint_t cpi_pass; /* last pass completed */ 1495 /* 1496 * standard function information 1497 */ 1498 uint_t cpi_maxeax; /* fn 0: %eax */ 1499 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */ 1500 uint_t cpi_vendor; /* enum of cpi_vendorstr */ 1501 1502 uint_t cpi_family; /* fn 1: extended family */ 1503 uint_t cpi_model; /* fn 1: extended model */ 1504 uint_t cpi_step; /* fn 1: stepping */ 1505 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */ 1506 /* AMD: package/socket # */ 1507 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */ 1508 int cpi_clogid; /* fn 1: %ebx: thread # */ 1509 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */ 1510 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */ 1511 uint_t cpi_ncache; /* fn 2: number of elements */ 1512 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */ 1513 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */ 1514 uint_t cpi_cache_leaf_size; /* Number of cache elements */ 1515 /* Intel fn: 4, AMD fn: 8000001d */ 1516 struct cpuid_regs **cpi_cache_leaves; /* Acual leaves from above */ 1517 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */ 1518 /* 1519 * extended function information 1520 */ 1521 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */ 1522 char cpi_brandstr[49]; /* fn 0x8000000[234] */ 1523 uint8_t cpi_pabits; /* fn 0x80000006: %eax */ 1524 uint8_t cpi_vabits; /* fn 0x80000006: %eax */ 1525 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */ 1526 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */ 1527 1528 id_t cpi_coreid; /* same coreid => strands share core */ 1529 int cpi_pkgcoreid; /* core number within single package */ 1530 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */ 1531 /* Intel: fn 4: %eax[31-26] */ 1532 1533 /* 1534 * These values represent the number of bits that are required to store 1535 * information about the number of cores and threads. 1536 */ 1537 uint_t cpi_ncore_bits; 1538 uint_t cpi_nthread_bits; 1539 /* 1540 * supported feature information 1541 */ 1542 uint32_t cpi_support[6]; 1543 #define STD_EDX_FEATURES 0 1544 #define AMD_EDX_FEATURES 1 1545 #define TM_EDX_FEATURES 2 1546 #define STD_ECX_FEATURES 3 1547 #define AMD_ECX_FEATURES 4 1548 #define STD_EBX_FEATURES 5 1549 /* 1550 * Synthesized information, where known. 
1551 */ 1552 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */ 1553 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */ 1554 uint32_t cpi_socket; /* Chip package/socket type */ 1555 1556 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */ 1557 uint32_t cpi_apicid; 1558 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */ 1559 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */ 1560 /* Intel: 1 */ 1561 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */ 1562 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */ 1563 1564 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */ 1565 }; 1566 1567 1568 static struct cpuid_info cpuid_info0; 1569 1570 /* 1571 * These bit fields are defined by the Intel Application Note AP-485 1572 * "Intel Processor Identification and the CPUID Instruction" 1573 */ 1574 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20) 1575 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16) 1576 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12) 1577 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8) 1578 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0) 1579 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4) 1580 1581 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx) 1582 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx) 1583 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx) 1584 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx) 1585 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx) 1586 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx) 1587 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx) 1588 1589 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0) 1590 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7) 1591 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16) 1592 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24) 1593 1594 #define CPI_MAXEAX_MAX 0x100 /* sanity control */ 1595 #define CPI_XMAXEAX_MAX 0x80000100 1596 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */ 1597 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */ 1598 1599 /* 1600 * Function 4 (Deterministic Cache Parameters) macros 1601 * Defined by Intel Application Note AP-485 1602 */ 1603 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26) 1604 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14) 1605 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9) 1606 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8) 1607 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5) 1608 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0) 1609 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8) 1610 1611 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22) 1612 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12) 1613 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0) 1614 1615 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0) 1616 1617 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0) 1618 1619 1620 /* 1621 * A couple of shorthand macros to identify "later" P6-family chips 1622 * like the Pentium M and Core. 
First, the "older" P6-based stuff 1623 * (loosely defined as "pre-Pentium-4"): 1624 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon 1625 */ 1626 #define IS_LEGACY_P6(cpi) ( \ 1627 cpi->cpi_family == 6 && \ 1628 (cpi->cpi_model == 1 || \ 1629 cpi->cpi_model == 3 || \ 1630 cpi->cpi_model == 5 || \ 1631 cpi->cpi_model == 6 || \ 1632 cpi->cpi_model == 7 || \ 1633 cpi->cpi_model == 8 || \ 1634 cpi->cpi_model == 0xA || \ 1635 cpi->cpi_model == 0xB) \ 1636 ) 1637 1638 /* A "new F6" is everything with family 6 that's not the above */ 1639 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi)) 1640 1641 /* Extended family/model support */ 1642 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \ 1643 cpi->cpi_family >= 0xf) 1644 1645 /* 1646 * Info for monitor/mwait idle loop. 1647 * 1648 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's 1649 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November 1650 * 2006. 1651 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual 1652 * Documentation Updates" #33633, Rev 2.05, December 2006. 1653 */ 1654 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */ 1655 #define MWAIT_EXTENSIONS (0x00000002) /* extenstion supported */ 1656 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */ 1657 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON) 1658 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2) 1659 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1) 1660 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0) 1661 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0) 1662 /* 1663 * Number of sub-cstates for a given c-state. 1664 */ 1665 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \ 1666 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state) 1667 1668 /* 1669 * XSAVE leaf 0xD enumeration 1670 */ 1671 #define CPUID_LEAFD_2_YMM_OFFSET 576 1672 #define CPUID_LEAFD_2_YMM_SIZE 256 1673 1674 /* 1675 * Common extended leaf names to cut down on typos. 1676 */ 1677 #define CPUID_LEAF_EXT_0 0x80000000 1678 #define CPUID_LEAF_EXT_8 0x80000008 1679 #define CPUID_LEAF_EXT_1d 0x8000001d 1680 #define CPUID_LEAF_EXT_1e 0x8000001e 1681 1682 /* 1683 * Functions we consune from cpuid_subr.c; don't publish these in a header 1684 * file to try and keep people using the expected cpuid_* interfaces. 1685 */ 1686 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t); 1687 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t); 1688 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t); 1689 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t); 1690 extern uint_t _cpuid_vendorstr_to_vendorcode(char *); 1691 1692 /* 1693 * Apply up various platform-dependent restrictions where the 1694 * underlying platform restrictions mean the CPU can be marked 1695 * as less capable than its cpuid instruction would imply. 1696 */ 1697 #if defined(__xpv) 1698 static void 1699 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp) 1700 { 1701 switch (eax) { 1702 case 1: { 1703 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ? 
1704 0 : CPUID_INTC_EDX_MCA; 1705 cp->cp_edx &= 1706 ~(mcamask | 1707 CPUID_INTC_EDX_PSE | 1708 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1709 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR | 1710 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT | 1711 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1712 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT); 1713 break; 1714 } 1715 1716 case 0x80000001: 1717 cp->cp_edx &= 1718 ~(CPUID_AMD_EDX_PSE | 1719 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1720 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE | 1721 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 | 1722 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1723 CPUID_AMD_EDX_TSCP); 1724 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY; 1725 break; 1726 default: 1727 break; 1728 } 1729 1730 switch (vendor) { 1731 case X86_VENDOR_Intel: 1732 switch (eax) { 1733 case 4: 1734 /* 1735 * Zero out the (ncores-per-chip - 1) field 1736 */ 1737 cp->cp_eax &= 0x03fffffff; 1738 break; 1739 default: 1740 break; 1741 } 1742 break; 1743 case X86_VENDOR_AMD: 1744 switch (eax) { 1745 1746 case 0x80000001: 1747 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D; 1748 break; 1749 1750 case CPUID_LEAF_EXT_8: 1751 /* 1752 * Zero out the (ncores-per-chip - 1) field 1753 */ 1754 cp->cp_ecx &= 0xffffff00; 1755 break; 1756 default: 1757 break; 1758 } 1759 break; 1760 default: 1761 break; 1762 } 1763 } 1764 #else 1765 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */ 1766 #endif 1767 1768 /* 1769 * Some undocumented ways of patching the results of the cpuid 1770 * instruction to permit running Solaris 10 on future cpus that 1771 * we don't currently support. Could be set to non-zero values 1772 * via settings in eeprom. 1773 */ 1774 1775 uint32_t cpuid_feature_ecx_include; 1776 uint32_t cpuid_feature_ecx_exclude; 1777 uint32_t cpuid_feature_edx_include; 1778 uint32_t cpuid_feature_edx_exclude; 1779 1780 /* 1781 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs. 1782 */ 1783 void 1784 cpuid_alloc_space(cpu_t *cpu) 1785 { 1786 /* 1787 * By convention, cpu0 is the boot cpu, which is set up 1788 * before memory allocation is available. All other cpus get 1789 * their cpuid_info struct allocated here. 1790 */ 1791 ASSERT(cpu->cpu_id != 0); 1792 ASSERT(cpu->cpu_m.mcpu_cpi == NULL); 1793 cpu->cpu_m.mcpu_cpi = 1794 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP); 1795 } 1796 1797 void 1798 cpuid_free_space(cpu_t *cpu) 1799 { 1800 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 1801 int i; 1802 1803 ASSERT(cpi != NULL); 1804 ASSERT(cpi != &cpuid_info0); 1805 1806 /* 1807 * Free up any cache leaf related dynamic storage. The first entry was 1808 * cached from the standard cpuid storage, so we should not free it. 1809 */ 1810 for (i = 1; i < cpi->cpi_cache_leaf_size; i++) 1811 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs)); 1812 if (cpi->cpi_cache_leaf_size > 0) 1813 kmem_free(cpi->cpi_cache_leaves, 1814 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *)); 1815 1816 kmem_free(cpi, sizeof (*cpi)); 1817 cpu->cpu_m.mcpu_cpi = NULL; 1818 } 1819 1820 #if !defined(__xpv) 1821 /* 1822 * Determine the type of the underlying platform. This is used to customize 1823 * initialization of various subsystems (e.g. TSC). determine_platform() must 1824 * only ever be called once to prevent two processors from seeing different 1825 * values of platform_type. Must be called before cpuid_pass1(), the earliest 1826 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv). 
1827 */ 1828 void 1829 determine_platform(void) 1830 { 1831 struct cpuid_regs cp; 1832 uint32_t base; 1833 uint32_t regs[4]; 1834 char *hvstr = (char *)regs; 1835 1836 ASSERT(platform_type == -1); 1837 1838 platform_type = HW_NATIVE; 1839 1840 if (!enable_platform_detection) 1841 return; 1842 1843 /* 1844 * If Hypervisor CPUID bit is set, try to determine hypervisor 1845 * vendor signature, and set platform type accordingly. 1846 * 1847 * References: 1848 * http://lkml.org/lkml/2008/10/1/246 1849 * http://kb.vmware.com/kb/1009458 1850 */ 1851 cp.cp_eax = 0x1; 1852 (void) __cpuid_insn(&cp); 1853 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) { 1854 cp.cp_eax = 0x40000000; 1855 (void) __cpuid_insn(&cp); 1856 regs[0] = cp.cp_ebx; 1857 regs[1] = cp.cp_ecx; 1858 regs[2] = cp.cp_edx; 1859 regs[3] = 0; 1860 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) { 1861 platform_type = HW_XEN_HVM; 1862 return; 1863 } 1864 if (strcmp(hvstr, HVSIG_VMWARE) == 0) { 1865 platform_type = HW_VMWARE; 1866 return; 1867 } 1868 if (strcmp(hvstr, HVSIG_KVM) == 0) { 1869 platform_type = HW_KVM; 1870 return; 1871 } 1872 if (strcmp(hvstr, HVSIG_BHYVE) == 0) { 1873 platform_type = HW_BHYVE; 1874 return; 1875 } 1876 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) 1877 platform_type = HW_MICROSOFT; 1878 } else { 1879 /* 1880 * Check older VMware hardware versions. VMware hypervisor is 1881 * detected by performing an IN operation to VMware hypervisor 1882 * port and checking that value returned in %ebx is VMware 1883 * hypervisor magic value. 1884 * 1885 * References: http://kb.vmware.com/kb/1009458 1886 */ 1887 vmware_port(VMWARE_HVCMD_GETVERSION, regs); 1888 if (regs[1] == VMWARE_HVMAGIC) { 1889 platform_type = HW_VMWARE; 1890 return; 1891 } 1892 } 1893 1894 /* 1895 * Check Xen hypervisor. In a fully virtualized domain, 1896 * Xen's pseudo-cpuid function returns a string representing the 1897 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum 1898 * supported cpuid function. We need at least a (base + 2) leaf value 1899 * to do what we want to do. Try different base values, since the 1900 * hypervisor might use a different one depending on whether Hyper-V 1901 * emulation is switched on by default or not. 1902 */ 1903 for (base = 0x40000000; base < 0x40010000; base += 0x100) { 1904 cp.cp_eax = base; 1905 (void) __cpuid_insn(&cp); 1906 regs[0] = cp.cp_ebx; 1907 regs[1] = cp.cp_ecx; 1908 regs[2] = cp.cp_edx; 1909 regs[3] = 0; 1910 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 && 1911 cp.cp_eax >= (base + 2)) { 1912 platform_type &= ~HW_NATIVE; 1913 platform_type |= HW_XEN_HVM; 1914 return; 1915 } 1916 } 1917 } 1918 1919 int 1920 get_hwenv(void) 1921 { 1922 ASSERT(platform_type != -1); 1923 return (platform_type); 1924 } 1925 1926 int 1927 is_controldom(void) 1928 { 1929 return (0); 1930 } 1931 1932 #else 1933 1934 int 1935 get_hwenv(void) 1936 { 1937 return (HW_XEN_PV); 1938 } 1939 1940 int 1941 is_controldom(void) 1942 { 1943 return (DOMAIN_IS_INITDOMAIN(xen_info)); 1944 } 1945 1946 #endif /* __xpv */ 1947 1948 /* 1949 * Make sure that we have gathered all of the CPUID leaves that we might need to 1950 * determine topology. We assume that the standard leaf 1 has already been done 1951 * and that xmaxeax has already been calculated. 
1952 */ 1953 static void 1954 cpuid_gather_amd_topology_leaves(cpu_t *cpu) 1955 { 1956 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 1957 1958 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 1959 struct cpuid_regs *cp; 1960 1961 cp = &cpi->cpi_extd[8]; 1962 cp->cp_eax = CPUID_LEAF_EXT_8; 1963 (void) __cpuid_insn(cp); 1964 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp); 1965 } 1966 1967 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 1968 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 1969 struct cpuid_regs *cp; 1970 1971 cp = &cpi->cpi_extd[0x1e]; 1972 cp->cp_eax = CPUID_LEAF_EXT_1e; 1973 (void) __cpuid_insn(cp); 1974 } 1975 } 1976 1977 /* 1978 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer 1979 * it to everything else. If not, and we're on an AMD system where 8000001e is 1980 * valid, then we use that. Otherwise, we fall back to the default value for the 1981 * APIC ID in leaf 1. 1982 */ 1983 static uint32_t 1984 cpuid_gather_apicid(struct cpuid_info *cpi) 1985 { 1986 /* 1987 * Leaf B changes based on the arguments to it. Because we don't cache 1988 * it, we need to gather it again. 1989 */ 1990 if (cpi->cpi_maxeax >= 0xB) { 1991 struct cpuid_regs regs; 1992 struct cpuid_regs *cp; 1993 1994 cp = &regs; 1995 cp->cp_eax = 0xB; 1996 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 1997 (void) __cpuid_insn(cp); 1998 1999 if (cp->cp_ebx != 0) { 2000 return (cp->cp_edx); 2001 } 2002 } 2003 2004 if (cpi->cpi_vendor == X86_VENDOR_AMD && 2005 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2006 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2007 return (cpi->cpi_extd[0x1e].cp_eax); 2008 } 2009 2010 return (CPI_APIC_ID(cpi)); 2011 } 2012 2013 /* 2014 * For AMD processors, attempt to calculate the number of chips and cores that 2015 * exist. The way that we do this varies based on the generation, because the 2016 * generations themselves have changed dramatically. 2017 * 2018 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores. 2019 * However, with the advent of family 17h (Zen) it actually tells us the number 2020 * of threads, so we need to look at leaf 0x8000001e if available to determine 2021 * its value. Otherwise, for all prior families, the number of enabled cores is 2022 * the same as threads. 2023 * 2024 * If we do not have leaf 0x80000008, then we assume that this processor does 2025 * not have anything. AMD's older CPUID specification says there's no reason to 2026 * fall back to leaf 1. 2027 * 2028 * In some virtualization cases we will not have leaf 8000001e or it will be 2029 * zero. When that happens we assume the number of threads is one. 2030 */ 2031 static void 2032 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2033 { 2034 uint_t nthreads, nthread_per_core; 2035 2036 nthreads = nthread_per_core = 1; 2037 2038 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2039 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1; 2040 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2041 nthreads = CPI_CPU_COUNT(cpi); 2042 } 2043 2044 /* 2045 * For us to have threads, and know about it, we have to be at least at 2046 * family 17h and have the cpuid bit that says we have extended 2047 * topology.
2048 */ 2049 if (cpi->cpi_family >= 0x17 && 2050 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2051 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2052 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2053 } 2054 2055 *ncpus = nthreads; 2056 *ncores = nthreads / nthread_per_core; 2057 } 2058 2059 /* 2060 * Seed the initial values for the cores and threads for an Intel based 2061 * processor. These values will be overwritten if we detect that the processor 2062 * supports CPUID leaf 0xb. 2063 */ 2064 static void 2065 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2066 { 2067 /* 2068 * Only seed the number of physical cores from the first level leaf 4 2069 * information. The number of threads there indicates how many share the 2070 * L1 cache, which may or may not have anything to do with the number of 2071 * logical CPUs per core. 2072 */ 2073 if (cpi->cpi_maxeax >= 4) { 2074 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1; 2075 } else { 2076 *ncores = 1; 2077 } 2078 2079 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2080 *ncpus = CPI_CPU_COUNT(cpi); 2081 } else { 2082 *ncpus = *ncores; 2083 } 2084 } 2085 2086 static boolean_t 2087 cpuid_leafB_getids(cpu_t *cpu) 2088 { 2089 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2090 struct cpuid_regs regs; 2091 struct cpuid_regs *cp; 2092 2093 if (cpi->cpi_maxeax < 0xB) 2094 return (B_FALSE); 2095 2096 cp = &regs; 2097 cp->cp_eax = 0xB; 2098 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 2099 2100 (void) __cpuid_insn(cp); 2101 2102 /* 2103 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which 2104 * indicates that the extended topology enumeration leaf is 2105 * available. 2106 */ 2107 if (cp->cp_ebx != 0) { 2108 uint32_t x2apic_id = 0; 2109 uint_t coreid_shift = 0; 2110 uint_t ncpu_per_core = 1; 2111 uint_t chipid_shift = 0; 2112 uint_t ncpu_per_chip = 1; 2113 uint_t i; 2114 uint_t level; 2115 2116 for (i = 0; i < CPI_FNB_ECX_MAX; i++) { 2117 cp->cp_eax = 0xB; 2118 cp->cp_ecx = i; 2119 2120 (void) __cpuid_insn(cp); 2121 level = CPI_CPU_LEVEL_TYPE(cp); 2122 2123 if (level == 1) { 2124 x2apic_id = cp->cp_edx; 2125 coreid_shift = BITX(cp->cp_eax, 4, 0); 2126 ncpu_per_core = BITX(cp->cp_ebx, 15, 0); 2127 } else if (level == 2) { 2128 x2apic_id = cp->cp_edx; 2129 chipid_shift = BITX(cp->cp_eax, 4, 0); 2130 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0); 2131 } 2132 } 2133 2134 /* 2135 * cpi_apicid is taken care of in cpuid_gather_apicid. 2136 */ 2137 cpi->cpi_ncpu_per_chip = ncpu_per_chip; 2138 cpi->cpi_ncore_per_chip = ncpu_per_chip / 2139 ncpu_per_core; 2140 cpi->cpi_chipid = x2apic_id >> chipid_shift; 2141 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1); 2142 cpi->cpi_coreid = x2apic_id >> coreid_shift; 2143 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2144 cpi->cpi_procnodeid = cpi->cpi_chipid; 2145 cpi->cpi_compunitid = cpi->cpi_coreid; 2146 2147 if (coreid_shift > 0 && chipid_shift > coreid_shift) { 2148 cpi->cpi_nthread_bits = coreid_shift; 2149 cpi->cpi_ncore_bits = chipid_shift - coreid_shift; 2150 } 2151 2152 return (B_TRUE); 2153 } else { 2154 return (B_FALSE); 2155 } 2156 } 2157 2158 static void 2159 cpuid_intel_getids(cpu_t *cpu, void *feature) 2160 { 2161 uint_t i; 2162 uint_t chipid_shift = 0; 2163 uint_t coreid_shift = 0; 2164 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2165 2166 /* 2167 * There are no compute units or processor nodes currently on Intel. 2168 * Always set these to one.
2169 */ 2170 cpi->cpi_procnodes_per_pkg = 1; 2171 cpi->cpi_cores_per_compunit = 1; 2172 2173 /* 2174 * If cpuid Leaf B is present, use that to try and get this information. 2175 * It will be the most accurate for Intel CPUs. 2176 */ 2177 if (cpuid_leafB_getids(cpu)) 2178 return; 2179 2180 /* 2181 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip 2182 * and ncore_per_chip. These represent the largest power of two values 2183 * that we need to cover all of the IDs in the system. Therefore, we use 2184 * those values to seed the number of bits needed to cover information 2185 * in the case when leaf B is not available. These values will probably 2186 * be larger than required, but that's OK. 2187 */ 2188 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip); 2189 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip); 2190 2191 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1) 2192 chipid_shift++; 2193 2194 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift; 2195 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1); 2196 2197 if (is_x86_feature(feature, X86FSET_CMP)) { 2198 /* 2199 * Multi-core (and possibly multi-threaded) 2200 * processors. 2201 */ 2202 uint_t ncpu_per_core; 2203 if (cpi->cpi_ncore_per_chip == 1) 2204 ncpu_per_core = cpi->cpi_ncpu_per_chip; 2205 else if (cpi->cpi_ncore_per_chip > 1) 2206 ncpu_per_core = cpi->cpi_ncpu_per_chip / 2207 cpi->cpi_ncore_per_chip; 2208 /* 2209 * 8bit APIC IDs on dual core Pentiums 2210 * look like this: 2211 * 2212 * +-----------------------+------+------+ 2213 * | Physical Package ID | MC | HT | 2214 * +-----------------------+------+------+ 2215 * <------- chipid --------> 2216 * <------- coreid ---------------> 2217 * <--- clogid --> 2218 * <------> 2219 * pkgcoreid 2220 * 2221 * Where the number of bits necessary to 2222 * represent MC and HT fields together equals 2223 * to the minimum number of bits necessary to 2224 * store the value of cpi->cpi_ncpu_per_chip. 2225 * Of those bits, the MC part uses the number 2226 * of bits necessary to store the value of 2227 * cpi->cpi_ncore_per_chip. 2228 */ 2229 for (i = 1; i < ncpu_per_core; i <<= 1) 2230 coreid_shift++; 2231 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift; 2232 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2233 } else if (is_x86_feature(feature, X86FSET_HTT)) { 2234 /* 2235 * Single-core multi-threaded processors. 2236 */ 2237 cpi->cpi_coreid = cpi->cpi_chipid; 2238 cpi->cpi_pkgcoreid = 0; 2239 } else { 2240 /* 2241 * Single-core single-thread processors. 2242 */ 2243 cpi->cpi_coreid = cpu->cpu_id; 2244 cpi->cpi_pkgcoreid = 0; 2245 } 2246 cpi->cpi_procnodeid = cpi->cpi_chipid; 2247 cpi->cpi_compunitid = cpi->cpi_coreid; 2248 } 2249 2250 /* 2251 * Historically, AMD has had CMP chips with only a single thread per core. 2252 * However, starting in family 17h (Zen), this has changed and they now have 2253 * multiple threads. Our internal core id needs to be a unique value. 2254 * 2255 * To determine the core id of an AMD system, if we're from a family before 17h, 2256 * then we just use the cpu id, as that gives us a good value that will be 2257 * unique for each core. If instead, we're on family 17h or later, then we need 2258 * to do something more complicated. CPUID leaf 0x8000001e can tell us 2259 * how many threads are in the system. Based on that, we'll shift the APIC ID. 2260 * We can't use the normal core id in that leaf as it's only unique within the 2261 * socket, which is perfect for cpi_pkgcoreid, but not us. 
2262 */ 2263 static id_t 2264 cpuid_amd_get_coreid(cpu_t *cpu) 2265 { 2266 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2267 2268 if (cpi->cpi_family >= 0x17 && 2269 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2270 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2271 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2272 if (nthreads > 1) { 2273 VERIFY3U(nthreads, ==, 2); 2274 return (cpi->cpi_apicid >> 1); 2275 } 2276 } 2277 2278 return (cpu->cpu_id); 2279 } 2280 2281 /* 2282 * IDs on AMD is a more challenging task. This is notable because of the 2283 * following two facts: 2284 * 2285 * 1. Before family 0x17 (Zen), there was no support for SMT and there was 2286 * also no way to get an actual unique core id from the system. As such, we 2287 * synthesize this case by using cpu->cpu_id. This scheme does not, 2288 * however, guarantee that sibling cores of a chip will have sequential 2289 * coreids starting at a multiple of the number of cores per chip - that is 2290 * usually the case, but if the ACPI MADT table is presented in a different 2291 * order then we need to perform a few more gymnastics for the pkgcoreid. 2292 * 2293 * 2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups 2294 * called compute units. These compute units share the L1I cache, L2 cache, 2295 * and the FPU. To deal with this, a new topology leaf was added in 2296 * 0x8000001e. However, parts of this leaf have different meanings 2297 * once we get to family 0x17. 2298 */ 2299 2300 static void 2301 cpuid_amd_getids(cpu_t *cpu, uchar_t *features) 2302 { 2303 int i, first_half, coreidsz; 2304 uint32_t nb_caps_reg; 2305 uint_t node2_1; 2306 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2307 struct cpuid_regs *cp; 2308 2309 /* 2310 * Calculate the core id (this comes from hardware in family 0x17 if it 2311 * hasn't been stripped by virtualization). We always set the compute 2312 * unit id to the same value. Also, initialize the default number of 2313 * cores per compute unit and nodes per package. This will be 2314 * overwritten when we know information about a particular family. 2315 */ 2316 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu); 2317 cpi->cpi_compunitid = cpi->cpi_coreid; 2318 cpi->cpi_cores_per_compunit = 1; 2319 cpi->cpi_procnodes_per_pkg = 1; 2320 2321 /* 2322 * To construct the logical ID, we need to determine how many APIC IDs 2323 * are dedicated to the cores and threads. This is provided for us in 2324 * 0x80000008. However, if it's not present (say due to virtualization), 2325 * then we assume it's one. This should be present on all 64-bit AMD 2326 * processors. It was added in family 0xf (Hammer). 2327 */ 2328 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2329 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12); 2330 2331 /* 2332 * In AMD parlance chip is really a node while illumos 2333 * uses chip as equivalent to socket/package. 2334 */ 2335 if (coreidsz == 0) { 2336 /* Use legacy method */ 2337 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1) 2338 coreidsz++; 2339 if (coreidsz == 0) 2340 coreidsz = 1; 2341 } 2342 } else { 2343 /* Assume single-core part */ 2344 coreidsz = 1; 2345 } 2346 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1); 2347 2348 /* 2349 * The package core ID varies depending on the family. While it may be 2350 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately, 2351 * this value is the core id in the given node. 
For non-virtualized 2352 * family 17h, we need to take the logical core id and shift off the 2353 * threads like we do when getting the core id. Otherwise, we can use 2354 * the clogid as is. When family 17h is virtualized, the clogid should 2355 * be sufficient as if we don't have valid data in the leaf, then we 2356 * won't think we have SMT, in which case the cpi_clogid should be 2357 * sufficient. 2358 */ 2359 if (cpi->cpi_family >= 0x17 && 2360 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2361 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e && 2362 cpi->cpi_extd[0x1e].cp_ebx != 0) { 2363 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2364 if (nthreads > 1) { 2365 VERIFY3U(nthreads, ==, 2); 2366 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1; 2367 } else { 2368 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2369 } 2370 } else { 2371 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2372 } 2373 2374 /* 2375 * Obtain the node ID and compute unit IDs. If we're on family 0x15 2376 * (bulldozer) or newer, then we can derive all of this from leaf 2377 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family. 2378 */ 2379 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2380 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2381 cp = &cpi->cpi_extd[0x1e]; 2382 2383 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1; 2384 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0); 2385 2386 /* 2387 * For Bulldozer-era CPUs, recalculate the compute unit 2388 * information. 2389 */ 2390 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) { 2391 cpi->cpi_cores_per_compunit = 2392 BITX(cp->cp_ebx, 15, 8) + 1; 2393 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) + 2394 (cpi->cpi_ncore_per_chip / 2395 cpi->cpi_cores_per_compunit) * 2396 (cpi->cpi_procnodeid / 2397 cpi->cpi_procnodes_per_pkg); 2398 } 2399 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) { 2400 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7; 2401 } else if (cpi->cpi_family == 0x10) { 2402 /* 2403 * See if we are a multi-node processor. 2404 * All processors in the system have the same number of nodes 2405 */ 2406 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8); 2407 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) { 2408 /* Single-node */ 2409 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5, 2410 coreidsz); 2411 } else { 2412 2413 /* 2414 * Multi-node revision D (2 nodes per package 2415 * are supported) 2416 */ 2417 cpi->cpi_procnodes_per_pkg = 2; 2418 2419 first_half = (cpi->cpi_pkgcoreid <= 2420 (cpi->cpi_ncore_per_chip/2 - 1)); 2421 2422 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) { 2423 /* We are BSP */ 2424 cpi->cpi_procnodeid = (first_half ? 
0 : 1); 2425 } else { 2426 2427 /* We are AP */ 2428 /* NodeId[2:1] bits to use for reading F3xe8 */ 2429 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1; 2430 2431 nb_caps_reg = 2432 pci_getl_func(0, 24 + node2_1, 3, 0xe8); 2433 2434 /* 2435 * Check IntNodeNum bit (31:30, but bit 31 is 2436 * always 0 on dual-node processors) 2437 */ 2438 if (BITX(nb_caps_reg, 30, 30) == 0) 2439 cpi->cpi_procnodeid = node2_1 + 2440 !first_half; 2441 else 2442 cpi->cpi_procnodeid = node2_1 + 2443 first_half; 2444 } 2445 } 2446 } else { 2447 cpi->cpi_procnodeid = 0; 2448 } 2449 2450 cpi->cpi_chipid = 2451 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg; 2452 2453 cpi->cpi_ncore_bits = coreidsz; 2454 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip / 2455 cpi->cpi_ncore_per_chip); 2456 } 2457 2458 static void 2459 spec_uarch_flush_noop(void) 2460 { 2461 } 2462 2463 /* 2464 * When microcode is present that mitigates MDS, this wrmsr will also flush the 2465 * MDS-related micro-architectural state that would normally happen by calling 2466 * x86_md_clear(). 2467 */ 2468 static void 2469 spec_uarch_flush_msr(void) 2470 { 2471 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); 2472 } 2473 2474 /* 2475 * This function points to a function that will flush certain 2476 * micro-architectural state on the processor. This flush is used to mitigate 2477 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This 2478 * function can point to one of three functions: 2479 * 2480 * - A noop which is done because we either are vulnerable, but do not have 2481 * microcode available to help deal with a fix, or because we aren't 2482 * vulnerable. 2483 * 2484 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to 2485 * mitigate MDS is present, also perform the equivalent of the MDS flush; 2486 * however, it only flushes the MDS related micro-architectural state on the 2487 * current hyperthread, it does not do anything for the twin. 2488 * 2489 * - x86_md_clear which will flush the MDS related state. This is done when we 2490 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF 2491 * (RDCL_NO is set). 2492 */ 2493 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop; 2494 2495 static void 2496 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset) 2497 { 2498 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2499 2500 /* 2501 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS) 2502 * has been fixed in hardware, it doesn't cover everything related to 2503 * MDS. Therefore we can only rely on MDS_NO to determine that we don't 2504 * need to mitigate this. 2505 */ 2506 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2507 is_x86_feature(featureset, X86FSET_MDS_NO)) { 2508 return; 2509 } 2510 2511 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2512 const uint8_t nop = NOP_INSTR; 2513 uint8_t *md = (uint8_t *)x86_md_clear; 2514 2515 *md = nop; 2516 } 2517 2518 membar_producer(); 2519 } 2520 2521 static void 2522 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset) 2523 { 2524 boolean_t need_l1d, need_mds; 2525 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2526 2527 /* 2528 * If we're not on Intel or we've mitigated both RDCL and MDS in 2529 * hardware, then there's nothing left for us to do for enabling the 2530 * flush. We can also go ahead and say that SMT exclusion is 2531 * unnecessary. 
2532 */ 2533 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2534 (is_x86_feature(featureset, X86FSET_RDCL_NO) && 2535 is_x86_feature(featureset, X86FSET_MDS_NO))) { 2536 extern int smt_exclusion; 2537 smt_exclusion = 0; 2538 spec_uarch_flush = spec_uarch_flush_noop; 2539 membar_producer(); 2540 return; 2541 } 2542 2543 /* 2544 * The locations where we need to perform an L1D flush are required both 2545 * for mitigating L1TF and MDS. When verw support is present in 2546 * microcode, then the L1D flush will take care of doing that as well. 2547 * However, if we have a system where RDCL_NO is present, but we don't 2548 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full 2549 * L1D flush. 2550 */ 2551 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) && 2552 is_x86_feature(featureset, X86FSET_FLUSH_CMD) && 2553 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) { 2554 need_l1d = B_TRUE; 2555 } else { 2556 need_l1d = B_FALSE; 2557 } 2558 2559 if (!is_x86_feature(featureset, X86FSET_MDS_NO) && 2560 is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2561 need_mds = B_TRUE; 2562 } else { 2563 need_mds = B_FALSE; 2564 } 2565 2566 if (need_l1d) { 2567 spec_uarch_flush = spec_uarch_flush_msr; 2568 } else if (need_mds) { 2569 spec_uarch_flush = x86_md_clear; 2570 } else { 2571 /* 2572 * We have no hardware mitigations available to us. 2573 */ 2574 spec_uarch_flush = spec_uarch_flush_noop; 2575 } 2576 membar_producer(); 2577 } 2578 2579 /* 2580 * We default to enabling RSB mitigations. 2581 */ 2582 static void 2583 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit) 2584 { 2585 const uint8_t ret = RET_INSTR; 2586 uint8_t *stuff = (uint8_t *)x86_rsb_stuff; 2587 2588 switch (mit) { 2589 case X86_SPECTREV2_ENHANCED_IBRS: 2590 case X86_SPECTREV2_DISABLED: 2591 *stuff = ret; 2592 break; 2593 default: 2594 break; 2595 } 2596 } 2597 2598 static void 2599 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit) 2600 { 2601 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi", 2602 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13", 2603 "_r14", "_r15" }; 2604 const uint_t nthunks = ARRAY_SIZE(thunks); 2605 const char *type; 2606 uint_t i; 2607 2608 if (mit == x86_spectrev2_mitigation) 2609 return; 2610 2611 switch (mit) { 2612 case X86_SPECTREV2_RETPOLINE: 2613 type = "gen"; 2614 break; 2615 case X86_SPECTREV2_RETPOLINE_AMD: 2616 type = "amd"; 2617 break; 2618 case X86_SPECTREV2_ENHANCED_IBRS: 2619 case X86_SPECTREV2_DISABLED: 2620 type = "jmp"; 2621 break; 2622 default: 2623 panic("asked to updated retpoline state with unknown state!"); 2624 } 2625 2626 for (i = 0; i < nthunks; i++) { 2627 uintptr_t source, dest; 2628 int ssize, dsize; 2629 char sourcebuf[64], destbuf[64]; 2630 size_t len; 2631 2632 (void) snprintf(destbuf, sizeof (destbuf), 2633 "__x86_indirect_thunk%s", thunks[i]); 2634 (void) snprintf(sourcebuf, sizeof (sourcebuf), 2635 "__x86_indirect_thunk_%s%s", type, thunks[i]); 2636 2637 source = kobj_getelfsym(sourcebuf, NULL, &ssize); 2638 dest = kobj_getelfsym(destbuf, NULL, &dsize); 2639 VERIFY3U(source, !=, 0); 2640 VERIFY3U(dest, !=, 0); 2641 VERIFY3S(dsize, >=, ssize); 2642 bcopy((void *)source, (void *)dest, ssize); 2643 } 2644 } 2645 2646 static void 2647 cpuid_enable_enhanced_ibrs(void) 2648 { 2649 uint64_t val; 2650 2651 val = rdmsr(MSR_IA32_SPEC_CTRL); 2652 val |= IA32_SPEC_CTRL_IBRS; 2653 wrmsr(MSR_IA32_SPEC_CTRL, val); 2654 } 2655 2656 #ifndef __xpv 2657 /* 2658 * Determine whether or not we can use the AMD optimized retpoline 2659 * functionality. 
We use this when we know we're on an AMD system and we can 2660 * successfully verify that lfence is dispatch serializing. 2661 */ 2662 static boolean_t 2663 cpuid_use_amd_retpoline(struct cpuid_info *cpi) 2664 { 2665 uint64_t val; 2666 on_trap_data_t otd; 2667 2668 if (cpi->cpi_vendor != X86_VENDOR_AMD) 2669 return (B_FALSE); 2670 2671 /* 2672 * We need to determine whether or not lfence is serializing. It always 2673 * is on families 0xf and 0x11. On others, it's controlled by 2674 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a 2675 * crazy old family, don't try and do anything. 2676 */ 2677 if (cpi->cpi_family < 0xf) 2678 return (B_FALSE); 2679 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) 2680 return (B_TRUE); 2681 2682 /* 2683 * While it may be tempting to use get_hwenv(), there are no promises 2684 * that a hypervisor will actually declare themselves to be so in a 2685 * friendly way. As such, try to read and set the MSR. If we can then 2686 * read back the value we set (it wasn't just set to zero), then we go 2687 * for it. 2688 */ 2689 if (!on_trap(&otd, OT_DATA_ACCESS)) { 2690 val = rdmsr(MSR_AMD_DECODE_CONFIG); 2691 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH; 2692 wrmsr(MSR_AMD_DECODE_CONFIG, val); 2693 val = rdmsr(MSR_AMD_DECODE_CONFIG); 2694 } else { 2695 val = 0; 2696 } 2697 no_trap(); 2698 2699 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0) 2700 return (B_TRUE); 2701 return (B_FALSE); 2702 } 2703 #endif /* !__xpv */ 2704 2705 static void 2706 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) 2707 { 2708 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2709 x86_spectrev2_mitigation_t v2mit; 2710 2711 if (cpi->cpi_vendor == X86_VENDOR_AMD && 2712 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2713 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB) 2714 add_x86_feature(featureset, X86FSET_IBPB); 2715 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS) 2716 add_x86_feature(featureset, X86FSET_IBRS); 2717 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP) 2718 add_x86_feature(featureset, X86FSET_STIBP); 2719 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL) 2720 add_x86_feature(featureset, X86FSET_STIBP_ALL); 2721 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD) 2722 add_x86_feature(featureset, X86FSET_SSBD); 2723 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD) 2724 add_x86_feature(featureset, X86FSET_SSBD_VIRT); 2725 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO) 2726 add_x86_feature(featureset, X86FSET_SSB_NO); 2727 /* 2728 * Don't enable enhanced IBRS unless we're told that we should 2729 * prefer it and it has the same semantics as Intel. This is 2730 * split into two bits rather than a single one. 2731 */ 2732 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) && 2733 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) { 2734 add_x86_feature(featureset, X86FSET_IBRS_ALL); 2735 } 2736 2737 } else if (cpi->cpi_vendor == X86_VENDOR_Intel && 2738 cpi->cpi_maxeax >= 7) { 2739 struct cpuid_regs *ecp; 2740 ecp = &cpi->cpi_std[7]; 2741 2742 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) { 2743 add_x86_feature(featureset, X86FSET_MD_CLEAR); 2744 } 2745 2746 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) { 2747 add_x86_feature(featureset, X86FSET_IBRS); 2748 add_x86_feature(featureset, X86FSET_IBPB); 2749 } 2750 2751 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) { 2752 add_x86_feature(featureset, X86FSET_STIBP); 2753 } 2754 2755 /* 2756 * Don't read the arch caps MSR on xpv where we lack the 2757 * on_trap(). 
2758 */ 2759 #ifndef __xpv 2760 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) { 2761 on_trap_data_t otd; 2762 2763 /* 2764 * Be paranoid and assume we'll get a #GP. 2765 */ 2766 if (!on_trap(&otd, OT_DATA_ACCESS)) { 2767 uint64_t reg; 2768 2769 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES); 2770 if (reg & IA32_ARCH_CAP_RDCL_NO) { 2771 add_x86_feature(featureset, 2772 X86FSET_RDCL_NO); 2773 } 2774 if (reg & IA32_ARCH_CAP_IBRS_ALL) { 2775 add_x86_feature(featureset, 2776 X86FSET_IBRS_ALL); 2777 } 2778 if (reg & IA32_ARCH_CAP_RSBA) { 2779 add_x86_feature(featureset, 2780 X86FSET_RSBA); 2781 } 2782 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) { 2783 add_x86_feature(featureset, 2784 X86FSET_L1D_VM_NO); 2785 } 2786 if (reg & IA32_ARCH_CAP_SSB_NO) { 2787 add_x86_feature(featureset, 2788 X86FSET_SSB_NO); 2789 } 2790 if (reg & IA32_ARCH_CAP_MDS_NO) { 2791 add_x86_feature(featureset, 2792 X86FSET_MDS_NO); 2793 } 2794 } 2795 no_trap(); 2796 } 2797 #endif /* !__xpv */ 2798 2799 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD) 2800 add_x86_feature(featureset, X86FSET_SSBD); 2801 2802 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD) 2803 add_x86_feature(featureset, X86FSET_FLUSH_CMD); 2804 } 2805 2806 if (cpu->cpu_id != 0) { 2807 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) { 2808 cpuid_enable_enhanced_ibrs(); 2809 } 2810 return; 2811 } 2812 2813 /* 2814 * Go through and initialize various security mechanisms that we should 2815 * only do on a single CPU. This includes Spectre V2, L1TF, and MDS. 2816 */ 2817 2818 /* 2819 * By default we've come in with retpolines enabled. Check whether we 2820 * should disable them or enable enhanced IBRS. RSB stuffing is enabled 2821 * by default, but disabled if we are using enhanced IBRS. 2822 */ 2823 if (x86_disable_spectrev2 != 0) { 2824 v2mit = X86_SPECTREV2_DISABLED; 2825 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) { 2826 cpuid_enable_enhanced_ibrs(); 2827 v2mit = X86_SPECTREV2_ENHANCED_IBRS; 2828 #ifndef __xpv 2829 } else if (cpuid_use_amd_retpoline(cpi)) { 2830 v2mit = X86_SPECTREV2_RETPOLINE_AMD; 2831 #endif /* !__xpv */ 2832 } else { 2833 v2mit = X86_SPECTREV2_RETPOLINE; 2834 } 2835 2836 cpuid_patch_retpolines(v2mit); 2837 cpuid_patch_rsb(v2mit); 2838 x86_spectrev2_mitigation = v2mit; 2839 membar_producer(); 2840 2841 /* 2842 * We need to determine what changes are required for mitigating L1TF 2843 * and MDS. If the CPU suffers from either of them, then SMT exclusion 2844 * is required. 2845 * 2846 * If any of these are present, then we need to flush u-arch state at 2847 * various points. For MDS, we need to do so whenever we change to a 2848 * lesser privilege level or we are halting the CPU. For L1TF we need to 2849 * flush the L1D cache at VM entry. When we have microcode that handles 2850 * MDS, the L1D flush also clears the other u-arch state that the 2851 * md_clear does. 2852 */ 2853 2854 /* 2855 * Update whether or not we need to be taking explicit action against 2856 * MDS. 2857 */ 2858 cpuid_update_md_clear(cpu, featureset); 2859 2860 /* 2861 * Determine whether SMT exclusion is required and whether or not we 2862 * need to perform an l1d flush. 2863 */ 2864 cpuid_update_l1d_flush(cpu, featureset); 2865 } 2866 2867 /* 2868 * Setup XFeature_Enabled_Mask register. Required by xsave feature. 
2869 */ 2870 void 2871 setup_xfem(void) 2872 { 2873 uint64_t flags = XFEATURE_LEGACY_FP; 2874 2875 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE)); 2876 2877 if (is_x86_feature(x86_featureset, X86FSET_SSE)) 2878 flags |= XFEATURE_SSE; 2879 2880 if (is_x86_feature(x86_featureset, X86FSET_AVX)) 2881 flags |= XFEATURE_AVX; 2882 2883 if (is_x86_feature(x86_featureset, X86FSET_AVX512F)) 2884 flags |= XFEATURE_AVX512; 2885 2886 set_xcr(XFEATURE_ENABLED_MASK, flags); 2887 2888 xsave_bv_all = flags; 2889 } 2890 2891 static void 2892 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset) 2893 { 2894 struct cpuid_info *cpi; 2895 2896 cpi = cpu->cpu_m.mcpu_cpi; 2897 2898 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 2899 cpuid_gather_amd_topology_leaves(cpu); 2900 } 2901 2902 cpi->cpi_apicid = cpuid_gather_apicid(cpi); 2903 2904 /* 2905 * Before we can calculate the IDs that we should assign to this 2906 * processor, we need to understand how many cores and threads it has. 2907 */ 2908 switch (cpi->cpi_vendor) { 2909 case X86_VENDOR_Intel: 2910 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip, 2911 &cpi->cpi_ncore_per_chip); 2912 break; 2913 case X86_VENDOR_AMD: 2914 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip, 2915 &cpi->cpi_ncore_per_chip); 2916 break; 2917 default: 2918 /* 2919 * If we have some other x86 compatible chip, it's not clear how 2920 * they would behave. The most common case is virtualization 2921 * today, though there are also 64-bit VIA chips. Assume that 2922 * all we can get is the basic Leaf 1 HTT information. 2923 */ 2924 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2925 cpi->cpi_ncore_per_chip = 1; 2926 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi); 2927 } 2928 break; 2929 } 2930 2931 /* 2932 * Based on the calculated number of threads and cores, potentially 2933 * assign the HTT and CMT features. 2934 */ 2935 if (cpi->cpi_ncore_per_chip > 1) { 2936 add_x86_feature(featureset, X86FSET_CMP); 2937 } 2938 2939 if (cpi->cpi_ncpu_per_chip > 1 && 2940 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) { 2941 add_x86_feature(featureset, X86FSET_HTT); 2942 } 2943 2944 /* 2945 * Now that has been set up, we need to go through and calculate all of 2946 * the rest of the parameters that exist. If we think the CPU doesn't 2947 * have either SMT (HTT) or CMP, then we basically go through and fake 2948 * up information in some way. The most likely case for this is 2949 * virtualization where we have a lot of partial topology information. 2950 */ 2951 if (!is_x86_feature(featureset, X86FSET_HTT) && 2952 !is_x86_feature(featureset, X86FSET_CMP)) { 2953 /* 2954 * This is a single core, single-threaded processor. 2955 */ 2956 cpi->cpi_procnodes_per_pkg = 1; 2957 cpi->cpi_cores_per_compunit = 1; 2958 cpi->cpi_compunitid = 0; 2959 cpi->cpi_chipid = -1; 2960 cpi->cpi_clogid = 0; 2961 cpi->cpi_coreid = cpu->cpu_id; 2962 cpi->cpi_pkgcoreid = 0; 2963 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 2964 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0); 2965 } else { 2966 cpi->cpi_procnodeid = cpi->cpi_chipid; 2967 } 2968 } else { 2969 switch (cpi->cpi_vendor) { 2970 case X86_VENDOR_Intel: 2971 cpuid_intel_getids(cpu, featureset); 2972 break; 2973 case X86_VENDOR_AMD: 2974 cpuid_amd_getids(cpu, featureset); 2975 break; 2976 default: 2977 /* 2978 * In this case, it's hard to say what we should do. 2979 * We're going to model them to the OS as single core 2980 * threads. We don't have a good identifier for them, so 2981 * we're just going to use the cpu id all on a single 2982 * chip. 
			 *
			 * This case has historically been different from the
			 * case above where we don't have HTT or CMP. While they
			 * could be combined, we've opted to keep it separate to
			 * minimize the risk of topology changes in weird cases.
			 */
			cpi->cpi_procnodes_per_pkg = 1;
			cpi->cpi_cores_per_compunit = 1;
			cpi->cpi_chipid = 0;
			cpi->cpi_coreid = cpu->cpu_id;
			cpi->cpi_clogid = cpu->cpu_id;
			cpi->cpi_pkgcoreid = cpu->cpu_id;
			cpi->cpi_procnodeid = cpi->cpi_chipid;
			cpi->cpi_compunitid = cpi->cpi_coreid;
			break;
		}
	}
}

/*
 * Gather relevant CPU features from leaf 6 which covers thermal information. We
 * always gather leaf 6 if it's supported; however, we only look for features on
 * Intel systems as AMD does not currently define any of the features we look
 * for below.
 */
static void
cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
{
	struct cpuid_regs *cp;
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;

	if (cpi->cpi_maxeax < 6) {
		return;
	}

	cp = &cpi->cpi_std[6];
	cp->cp_eax = 6;
	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
	(void) __cpuid_insn(cp);
	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);

	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
		return;
	}

	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
	}

	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
	}
}

void
cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
{
	uint32_t mask_ecx, mask_edx;
	struct cpuid_info *cpi;
	struct cpuid_regs *cp;
	int xcpuid;
#if !defined(__xpv)
	extern int idle_cpu_prefer_mwait;
#endif

	/*
	 * Space statically allocated for BSP, ensure pointer is set
	 */
	if (cpu->cpu_id == 0) {
		if (cpu->cpu_m.mcpu_cpi == NULL)
			cpu->cpu_m.mcpu_cpi = &cpuid_info0;
	}

	add_x86_feature(featureset, X86FSET_CPUID);

	cpi = cpu->cpu_m.mcpu_cpi;
	ASSERT(cpi != NULL);
	cp = &cpi->cpi_std[0];
	cp->cp_eax = 0;
	cpi->cpi_maxeax = __cpuid_insn(cp);
	{
		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
		*iptr++ = cp->cp_ebx;
		*iptr++ = cp->cp_edx;
		*iptr++ = cp->cp_ecx;
		*(char *)&cpi->cpi_vendorstr[12] = '\0';
	}

	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
	x86_vendor = cpi->cpi_vendor;	/* for compatibility */

	/*
	 * Limit the range in case of weird hardware
	 */
	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
	if (cpi->cpi_maxeax < 1)
		goto pass1_done;

	cp = &cpi->cpi_std[1];
	cp->cp_eax = 1;
	(void) __cpuid_insn(cp);

	/*
	 * Extract identifying constants for easy access.
	 */
	cpi->cpi_model = CPI_MODEL(cpi);
	cpi->cpi_family = CPI_FAMILY(cpi);

	if (cpi->cpi_family == 0xf)
		cpi->cpi_family += CPI_FAMILY_XTD(cpi);

	/*
	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
	 * Intel, and presumably everyone else, uses model == 0xf, as
	 * one would expect (max value means possible overflow). Sigh.
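	 *
	 * As a worked example of the extended family/model math done below
	 * (illustrative register values, not a statement about any specific
	 * part): an Intel CPU reporting base family 0x6, base model 0xe and
	 * extended model 0x5 ends up with cpi_family = 0x6 and cpi_model =
	 * (0x5 << 4) | 0xe = 0x5e, while an AMD CPU reporting base family
	 * 0xf and extended family 0x8 ends up with cpi_family = 0xf + 0x8 =
	 * 0x17.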
3099 */ 3100 3101 switch (cpi->cpi_vendor) { 3102 case X86_VENDOR_Intel: 3103 if (IS_EXTENDED_MODEL_INTEL(cpi)) 3104 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3105 break; 3106 case X86_VENDOR_AMD: 3107 if (CPI_FAMILY(cpi) == 0xf) 3108 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3109 break; 3110 default: 3111 if (cpi->cpi_model == 0xf) 3112 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3113 break; 3114 } 3115 3116 cpi->cpi_step = CPI_STEP(cpi); 3117 cpi->cpi_brandid = CPI_BRANDID(cpi); 3118 3119 /* 3120 * *default* assumptions: 3121 * - believe %edx feature word 3122 * - ignore %ecx feature word 3123 * - 32-bit virtual and physical addressing 3124 */ 3125 mask_edx = 0xffffffff; 3126 mask_ecx = 0; 3127 3128 cpi->cpi_pabits = cpi->cpi_vabits = 32; 3129 3130 switch (cpi->cpi_vendor) { 3131 case X86_VENDOR_Intel: 3132 if (cpi->cpi_family == 5) 3133 x86_type = X86_TYPE_P5; 3134 else if (IS_LEGACY_P6(cpi)) { 3135 x86_type = X86_TYPE_P6; 3136 pentiumpro_bug4046376 = 1; 3137 /* 3138 * Clear the SEP bit when it was set erroneously 3139 */ 3140 if (cpi->cpi_model < 3 && cpi->cpi_step < 3) 3141 cp->cp_edx &= ~CPUID_INTC_EDX_SEP; 3142 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) { 3143 x86_type = X86_TYPE_P4; 3144 /* 3145 * We don't currently depend on any of the %ecx 3146 * features until Prescott, so we'll only check 3147 * this from P4 onwards. We might want to revisit 3148 * that idea later. 3149 */ 3150 mask_ecx = 0xffffffff; 3151 } else if (cpi->cpi_family > 0xf) 3152 mask_ecx = 0xffffffff; 3153 /* 3154 * We don't support MONITOR/MWAIT if leaf 5 is not available 3155 * to obtain the monitor linesize. 3156 */ 3157 if (cpi->cpi_maxeax < 5) 3158 mask_ecx &= ~CPUID_INTC_ECX_MON; 3159 break; 3160 case X86_VENDOR_IntelClone: 3161 default: 3162 break; 3163 case X86_VENDOR_AMD: 3164 #if defined(OPTERON_ERRATUM_108) 3165 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) { 3166 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0; 3167 cpi->cpi_model = 0xc; 3168 } else 3169 #endif 3170 if (cpi->cpi_family == 5) { 3171 /* 3172 * AMD K5 and K6 3173 * 3174 * These CPUs have an incomplete implementation 3175 * of MCA/MCE which we mask away. 3176 */ 3177 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA); 3178 3179 /* 3180 * Model 0 uses the wrong (APIC) bit 3181 * to indicate PGE. Fix it here. 3182 */ 3183 if (cpi->cpi_model == 0) { 3184 if (cp->cp_edx & 0x200) { 3185 cp->cp_edx &= ~0x200; 3186 cp->cp_edx |= CPUID_INTC_EDX_PGE; 3187 } 3188 } 3189 3190 /* 3191 * Early models had problems w/ MMX; disable. 3192 */ 3193 if (cpi->cpi_model < 6) 3194 mask_edx &= ~CPUID_INTC_EDX_MMX; 3195 } 3196 3197 /* 3198 * For newer families, SSE3 and CX16, at least, are valid; 3199 * enable all 3200 */ 3201 if (cpi->cpi_family >= 0xf) 3202 mask_ecx = 0xffffffff; 3203 /* 3204 * We don't support MONITOR/MWAIT if leaf 5 is not available 3205 * to obtain the monitor linesize. 3206 */ 3207 if (cpi->cpi_maxeax < 5) 3208 mask_ecx &= ~CPUID_INTC_ECX_MON; 3209 3210 #if !defined(__xpv) 3211 /* 3212 * AMD has not historically used MWAIT in the CPU's idle loop. 3213 * Pre-family-10h Opterons do not have the MWAIT instruction. We 3214 * know for certain that in at least family 17h, per AMD, mwait 3215 * is preferred. Families in-between are less certain. 
3216 */ 3217 if (cpi->cpi_family < 0x17) { 3218 idle_cpu_prefer_mwait = 0; 3219 } 3220 #endif 3221 3222 break; 3223 case X86_VENDOR_TM: 3224 /* 3225 * workaround the NT workaround in CMS 4.1 3226 */ 3227 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 && 3228 (cpi->cpi_step == 2 || cpi->cpi_step == 3)) 3229 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3230 break; 3231 case X86_VENDOR_Centaur: 3232 /* 3233 * workaround the NT workarounds again 3234 */ 3235 if (cpi->cpi_family == 6) 3236 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3237 break; 3238 case X86_VENDOR_Cyrix: 3239 /* 3240 * We rely heavily on the probing in locore 3241 * to actually figure out what parts, if any, 3242 * of the Cyrix cpuid instruction to believe. 3243 */ 3244 switch (x86_type) { 3245 case X86_TYPE_CYRIX_486: 3246 mask_edx = 0; 3247 break; 3248 case X86_TYPE_CYRIX_6x86: 3249 mask_edx = 0; 3250 break; 3251 case X86_TYPE_CYRIX_6x86L: 3252 mask_edx = 3253 CPUID_INTC_EDX_DE | 3254 CPUID_INTC_EDX_CX8; 3255 break; 3256 case X86_TYPE_CYRIX_6x86MX: 3257 mask_edx = 3258 CPUID_INTC_EDX_DE | 3259 CPUID_INTC_EDX_MSR | 3260 CPUID_INTC_EDX_CX8 | 3261 CPUID_INTC_EDX_PGE | 3262 CPUID_INTC_EDX_CMOV | 3263 CPUID_INTC_EDX_MMX; 3264 break; 3265 case X86_TYPE_CYRIX_GXm: 3266 mask_edx = 3267 CPUID_INTC_EDX_MSR | 3268 CPUID_INTC_EDX_CX8 | 3269 CPUID_INTC_EDX_CMOV | 3270 CPUID_INTC_EDX_MMX; 3271 break; 3272 case X86_TYPE_CYRIX_MediaGX: 3273 break; 3274 case X86_TYPE_CYRIX_MII: 3275 case X86_TYPE_VIA_CYRIX_III: 3276 mask_edx = 3277 CPUID_INTC_EDX_DE | 3278 CPUID_INTC_EDX_TSC | 3279 CPUID_INTC_EDX_MSR | 3280 CPUID_INTC_EDX_CX8 | 3281 CPUID_INTC_EDX_PGE | 3282 CPUID_INTC_EDX_CMOV | 3283 CPUID_INTC_EDX_MMX; 3284 break; 3285 default: 3286 break; 3287 } 3288 break; 3289 } 3290 3291 #if defined(__xpv) 3292 /* 3293 * Do not support MONITOR/MWAIT under a hypervisor 3294 */ 3295 mask_ecx &= ~CPUID_INTC_ECX_MON; 3296 /* 3297 * Do not support XSAVE under a hypervisor for now 3298 */ 3299 xsave_force_disable = B_TRUE; 3300 3301 #endif /* __xpv */ 3302 3303 if (xsave_force_disable) { 3304 mask_ecx &= ~CPUID_INTC_ECX_XSAVE; 3305 mask_ecx &= ~CPUID_INTC_ECX_AVX; 3306 mask_ecx &= ~CPUID_INTC_ECX_F16C; 3307 mask_ecx &= ~CPUID_INTC_ECX_FMA; 3308 } 3309 3310 /* 3311 * Now we've figured out the masks that determine 3312 * which bits we choose to believe, apply the masks 3313 * to the feature words, then map the kernel's view 3314 * of these feature words into its feature word. 3315 */ 3316 cp->cp_edx &= mask_edx; 3317 cp->cp_ecx &= mask_ecx; 3318 3319 /* 3320 * apply any platform restrictions (we don't call this 3321 * immediately after __cpuid_insn here, because we need the 3322 * workarounds applied above first) 3323 */ 3324 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp); 3325 3326 /* 3327 * In addition to ecx and edx, Intel and AMD are storing a bunch of 3328 * instruction set extensions in leaf 7's ebx, ecx, and edx. 3329 */ 3330 if (cpi->cpi_maxeax >= 7) { 3331 struct cpuid_regs *ecp; 3332 ecp = &cpi->cpi_std[7]; 3333 ecp->cp_eax = 7; 3334 ecp->cp_ecx = 0; 3335 (void) __cpuid_insn(ecp); 3336 3337 /* 3338 * If XSAVE has been disabled, just ignore all of the 3339 * extended-save-area dependent flags here. 
3340 */ 3341 if (xsave_force_disable) { 3342 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 3343 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 3344 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 3345 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX; 3346 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512; 3347 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512; 3348 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512; 3349 } 3350 3351 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP) 3352 add_x86_feature(featureset, X86FSET_SMEP); 3353 3354 /* 3355 * We check disable_smap here in addition to in startup_smap() 3356 * to ensure CPUs that aren't the boot CPU don't accidentally 3357 * include it in the feature set and thus generate a mismatched 3358 * x86 feature set across CPUs. 3359 */ 3360 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP && 3361 disable_smap == 0) 3362 add_x86_feature(featureset, X86FSET_SMAP); 3363 3364 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED) 3365 add_x86_feature(featureset, X86FSET_RDSEED); 3366 3367 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX) 3368 add_x86_feature(featureset, X86FSET_ADX); 3369 3370 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 3371 add_x86_feature(featureset, X86FSET_FSGSBASE); 3372 3373 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 3374 add_x86_feature(featureset, X86FSET_CLFLUSHOPT); 3375 3376 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 3377 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID) 3378 add_x86_feature(featureset, X86FSET_INVPCID); 3379 3380 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX) 3381 add_x86_feature(featureset, X86FSET_MPX); 3382 3383 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB) 3384 add_x86_feature(featureset, X86FSET_CLWB); 3385 } 3386 } 3387 3388 /* 3389 * fold in overrides from the "eeprom" mechanism 3390 */ 3391 cp->cp_edx |= cpuid_feature_edx_include; 3392 cp->cp_edx &= ~cpuid_feature_edx_exclude; 3393 3394 cp->cp_ecx |= cpuid_feature_ecx_include; 3395 cp->cp_ecx &= ~cpuid_feature_ecx_exclude; 3396 3397 if (cp->cp_edx & CPUID_INTC_EDX_PSE) { 3398 add_x86_feature(featureset, X86FSET_LARGEPAGE); 3399 } 3400 if (cp->cp_edx & CPUID_INTC_EDX_TSC) { 3401 add_x86_feature(featureset, X86FSET_TSC); 3402 } 3403 if (cp->cp_edx & CPUID_INTC_EDX_MSR) { 3404 add_x86_feature(featureset, X86FSET_MSR); 3405 } 3406 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) { 3407 add_x86_feature(featureset, X86FSET_MTRR); 3408 } 3409 if (cp->cp_edx & CPUID_INTC_EDX_PGE) { 3410 add_x86_feature(featureset, X86FSET_PGE); 3411 } 3412 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) { 3413 add_x86_feature(featureset, X86FSET_CMOV); 3414 } 3415 if (cp->cp_edx & CPUID_INTC_EDX_MMX) { 3416 add_x86_feature(featureset, X86FSET_MMX); 3417 } 3418 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 && 3419 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) { 3420 add_x86_feature(featureset, X86FSET_MCA); 3421 } 3422 if (cp->cp_edx & CPUID_INTC_EDX_PAE) { 3423 add_x86_feature(featureset, X86FSET_PAE); 3424 } 3425 if (cp->cp_edx & CPUID_INTC_EDX_CX8) { 3426 add_x86_feature(featureset, X86FSET_CX8); 3427 } 3428 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) { 3429 add_x86_feature(featureset, X86FSET_CX16); 3430 } 3431 if (cp->cp_edx & CPUID_INTC_EDX_PAT) { 3432 add_x86_feature(featureset, X86FSET_PAT); 3433 } 3434 if (cp->cp_edx & CPUID_INTC_EDX_SEP) { 3435 add_x86_feature(featureset, X86FSET_SEP); 3436 } 3437 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) { 3438 /* 3439 * In our implementation, fxsave/fxrstor 3440 * are prerequisites before we'll even 3441 * try and do SSE things. 
3442 */ 3443 if (cp->cp_edx & CPUID_INTC_EDX_SSE) { 3444 add_x86_feature(featureset, X86FSET_SSE); 3445 } 3446 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) { 3447 add_x86_feature(featureset, X86FSET_SSE2); 3448 } 3449 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) { 3450 add_x86_feature(featureset, X86FSET_SSE3); 3451 } 3452 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) { 3453 add_x86_feature(featureset, X86FSET_SSSE3); 3454 } 3455 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) { 3456 add_x86_feature(featureset, X86FSET_SSE4_1); 3457 } 3458 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) { 3459 add_x86_feature(featureset, X86FSET_SSE4_2); 3460 } 3461 if (cp->cp_ecx & CPUID_INTC_ECX_AES) { 3462 add_x86_feature(featureset, X86FSET_AES); 3463 } 3464 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) { 3465 add_x86_feature(featureset, X86FSET_PCLMULQDQ); 3466 } 3467 3468 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA) 3469 add_x86_feature(featureset, X86FSET_SHA); 3470 3471 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP) 3472 add_x86_feature(featureset, X86FSET_UMIP); 3473 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU) 3474 add_x86_feature(featureset, X86FSET_PKU); 3475 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE) 3476 add_x86_feature(featureset, X86FSET_OSPKE); 3477 3478 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) { 3479 add_x86_feature(featureset, X86FSET_XSAVE); 3480 3481 /* We only test AVX & AVX512 when there is XSAVE */ 3482 3483 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) { 3484 add_x86_feature(featureset, 3485 X86FSET_AVX); 3486 3487 /* 3488 * Intel says we can't check these without also 3489 * checking AVX. 3490 */ 3491 if (cp->cp_ecx & CPUID_INTC_ECX_F16C) 3492 add_x86_feature(featureset, 3493 X86FSET_F16C); 3494 3495 if (cp->cp_ecx & CPUID_INTC_ECX_FMA) 3496 add_x86_feature(featureset, 3497 X86FSET_FMA); 3498 3499 if (cpi->cpi_std[7].cp_ebx & 3500 CPUID_INTC_EBX_7_0_BMI1) 3501 add_x86_feature(featureset, 3502 X86FSET_BMI1); 3503 3504 if (cpi->cpi_std[7].cp_ebx & 3505 CPUID_INTC_EBX_7_0_BMI2) 3506 add_x86_feature(featureset, 3507 X86FSET_BMI2); 3508 3509 if (cpi->cpi_std[7].cp_ebx & 3510 CPUID_INTC_EBX_7_0_AVX2) 3511 add_x86_feature(featureset, 3512 X86FSET_AVX2); 3513 } 3514 3515 if (cpi->cpi_vendor == X86_VENDOR_Intel && 3516 (cpi->cpi_std[7].cp_ebx & 3517 CPUID_INTC_EBX_7_0_AVX512F) != 0) { 3518 add_x86_feature(featureset, X86FSET_AVX512F); 3519 3520 if (cpi->cpi_std[7].cp_ebx & 3521 CPUID_INTC_EBX_7_0_AVX512DQ) 3522 add_x86_feature(featureset, 3523 X86FSET_AVX512DQ); 3524 if (cpi->cpi_std[7].cp_ebx & 3525 CPUID_INTC_EBX_7_0_AVX512IFMA) 3526 add_x86_feature(featureset, 3527 X86FSET_AVX512FMA); 3528 if (cpi->cpi_std[7].cp_ebx & 3529 CPUID_INTC_EBX_7_0_AVX512PF) 3530 add_x86_feature(featureset, 3531 X86FSET_AVX512PF); 3532 if (cpi->cpi_std[7].cp_ebx & 3533 CPUID_INTC_EBX_7_0_AVX512ER) 3534 add_x86_feature(featureset, 3535 X86FSET_AVX512ER); 3536 if (cpi->cpi_std[7].cp_ebx & 3537 CPUID_INTC_EBX_7_0_AVX512CD) 3538 add_x86_feature(featureset, 3539 X86FSET_AVX512CD); 3540 if (cpi->cpi_std[7].cp_ebx & 3541 CPUID_INTC_EBX_7_0_AVX512BW) 3542 add_x86_feature(featureset, 3543 X86FSET_AVX512BW); 3544 if (cpi->cpi_std[7].cp_ebx & 3545 CPUID_INTC_EBX_7_0_AVX512VL) 3546 add_x86_feature(featureset, 3547 X86FSET_AVX512VL); 3548 3549 if (cpi->cpi_std[7].cp_ecx & 3550 CPUID_INTC_ECX_7_0_AVX512VBMI) 3551 add_x86_feature(featureset, 3552 X86FSET_AVX512VBMI); 3553 if (cpi->cpi_std[7].cp_ecx & 3554 CPUID_INTC_ECX_7_0_AVX512VNNI) 3555 add_x86_feature(featureset, 3556 X86FSET_AVX512VNNI); 3557 if (cpi->cpi_std[7].cp_ecx & 3558 
				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
					add_x86_feature(featureset,
					    X86FSET_AVX512VPOPCDQ);

				if (cpi->cpi_std[7].cp_edx &
				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
					add_x86_feature(featureset,
					    X86FSET_AVX512NNIW);
				if (cpi->cpi_std[7].cp_edx &
				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
					add_x86_feature(featureset,
					    X86FSET_AVX512FMAPS);
			}
		}
	}

	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
		if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
			add_x86_feature(featureset, X86FSET_PCID);
		}
	}

	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
		add_x86_feature(featureset, X86FSET_X2APIC);
	}
	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
		add_x86_feature(featureset, X86FSET_DE);
	}
#if !defined(__xpv)
	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {

		/*
		 * We require the CLFLUSH instruction for erratum workaround
		 * to use MONITOR/MWAIT.
		 */
		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
			add_x86_feature(featureset, X86FSET_MWAIT);
		} else {
			extern int idle_cpu_assert_cflush_monitor;

			/*
			 * All processors we are aware of which have
			 * MONITOR/MWAIT also have CLFLUSH.
			 */
			if (idle_cpu_assert_cflush_monitor) {
				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
			}
		}
	}
#endif	/* __xpv */

	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
		add_x86_feature(featureset, X86FSET_VMX);
	}

	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
		add_x86_feature(featureset, X86FSET_RDRAND);

	/*
	 * Only need it the first time; the rest of the CPUs would follow
	 * suit. We only capture this for the boot CPU.
	 */
	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
		add_x86_feature(featureset, X86FSET_CLFSH);
		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
	}
	if (is_x86_feature(featureset, X86FSET_PAE))
		cpi->cpi_pabits = 36;

	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
		struct cpuid_regs r, *ecp;

		ecp = &r;
		ecp->cp_eax = 0xD;
		ecp->cp_ecx = 1;
		ecp->cp_edx = ecp->cp_ebx = 0;
		(void) __cpuid_insn(ecp);

		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
			add_x86_feature(featureset, X86FSET_XSAVEOPT);
		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
			add_x86_feature(featureset, X86FSET_XSAVEC);
		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
			add_x86_feature(featureset, X86FSET_XSAVES);
	}

	/*
	 * Work on the "extended" feature information, doing
	 * some basic initialization for cpuid_pass2()
	 */
	xcpuid = 0;
	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		/*
		 * On KVM we know we will have proper support for extended
		 * cpuid.
		 */
		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
			xcpuid++;
		break;
	case X86_VENDOR_AMD:
		if (cpi->cpi_family > 5 ||
		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
			xcpuid++;
		break;
	case X86_VENDOR_Cyrix:
		/*
		 * Only these Cyrix CPUs are -known- to support
		 * extended cpuid operations.
3671 */ 3672 if (x86_type == X86_TYPE_VIA_CYRIX_III || 3673 x86_type == X86_TYPE_CYRIX_GXm) 3674 xcpuid++; 3675 break; 3676 case X86_VENDOR_Centaur: 3677 case X86_VENDOR_TM: 3678 default: 3679 xcpuid++; 3680 break; 3681 } 3682 3683 if (xcpuid) { 3684 cp = &cpi->cpi_extd[0]; 3685 cp->cp_eax = CPUID_LEAF_EXT_0; 3686 cpi->cpi_xmaxeax = __cpuid_insn(cp); 3687 } 3688 3689 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) { 3690 3691 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX) 3692 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX; 3693 3694 switch (cpi->cpi_vendor) { 3695 case X86_VENDOR_Intel: 3696 case X86_VENDOR_AMD: 3697 if (cpi->cpi_xmaxeax < 0x80000001) 3698 break; 3699 cp = &cpi->cpi_extd[1]; 3700 cp->cp_eax = 0x80000001; 3701 (void) __cpuid_insn(cp); 3702 3703 if (cpi->cpi_vendor == X86_VENDOR_AMD && 3704 cpi->cpi_family == 5 && 3705 cpi->cpi_model == 6 && 3706 cpi->cpi_step == 6) { 3707 /* 3708 * K6 model 6 uses bit 10 to indicate SYSC 3709 * Later models use bit 11. Fix it here. 3710 */ 3711 if (cp->cp_edx & 0x400) { 3712 cp->cp_edx &= ~0x400; 3713 cp->cp_edx |= CPUID_AMD_EDX_SYSC; 3714 } 3715 } 3716 3717 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp); 3718 3719 /* 3720 * Compute the additions to the kernel's feature word. 3721 */ 3722 if (cp->cp_edx & CPUID_AMD_EDX_NX) { 3723 add_x86_feature(featureset, X86FSET_NX); 3724 } 3725 3726 /* 3727 * Regardless whether or not we boot 64-bit, 3728 * we should have a way to identify whether 3729 * the CPU is capable of running 64-bit. 3730 */ 3731 if (cp->cp_edx & CPUID_AMD_EDX_LM) { 3732 add_x86_feature(featureset, X86FSET_64); 3733 } 3734 3735 /* 1 GB large page - enable only for 64 bit kernel */ 3736 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) { 3737 add_x86_feature(featureset, X86FSET_1GPG); 3738 } 3739 3740 if ((cpi->cpi_vendor == X86_VENDOR_AMD) && 3741 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) && 3742 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) { 3743 add_x86_feature(featureset, X86FSET_SSE4A); 3744 } 3745 3746 /* 3747 * It's really tricky to support syscall/sysret in 3748 * the i386 kernel; we rely on sysenter/sysexit 3749 * instead. In the amd64 kernel, things are -way- 3750 * better. 3751 */ 3752 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) { 3753 add_x86_feature(featureset, X86FSET_ASYSC); 3754 } 3755 3756 /* 3757 * While we're thinking about system calls, note 3758 * that AMD processors don't support sysenter 3759 * in long mode at all, so don't try to program them. 3760 */ 3761 if (x86_vendor == X86_VENDOR_AMD) { 3762 remove_x86_feature(featureset, X86FSET_SEP); 3763 } 3764 3765 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) { 3766 add_x86_feature(featureset, X86FSET_TSCP); 3767 } 3768 3769 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) { 3770 add_x86_feature(featureset, X86FSET_SVM); 3771 } 3772 3773 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) { 3774 add_x86_feature(featureset, X86FSET_TOPOEXT); 3775 } 3776 3777 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) { 3778 add_x86_feature(featureset, X86FSET_AMD_PCEC); 3779 } 3780 3781 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) { 3782 add_x86_feature(featureset, X86FSET_XOP); 3783 } 3784 3785 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) { 3786 add_x86_feature(featureset, X86FSET_FMA4); 3787 } 3788 3789 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) { 3790 add_x86_feature(featureset, X86FSET_TBM); 3791 } 3792 3793 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) { 3794 add_x86_feature(featureset, X86FSET_MONITORX); 3795 } 3796 break; 3797 default: 3798 break; 3799 } 3800 3801 /* 3802 * Get CPUID data about processor cores and hyperthreads. 
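		 *
		 * (A note on the address-size capture further below, which
		 * comes from the same 0x80000008 leaf: %eax packs the
		 * physical address width into bits 7:0 and the virtual width
		 * into bits 15:8, so an illustrative raw value of 0x00003030
		 * decodes to 48 physical and 48 virtual address bits via the
		 * BITX() extractions.)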
3803 */ 3804 switch (cpi->cpi_vendor) { 3805 case X86_VENDOR_Intel: 3806 if (cpi->cpi_maxeax >= 4) { 3807 cp = &cpi->cpi_std[4]; 3808 cp->cp_eax = 4; 3809 cp->cp_ecx = 0; 3810 (void) __cpuid_insn(cp); 3811 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp); 3812 } 3813 /*FALLTHROUGH*/ 3814 case X86_VENDOR_AMD: 3815 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) 3816 break; 3817 cp = &cpi->cpi_extd[8]; 3818 cp->cp_eax = CPUID_LEAF_EXT_8; 3819 (void) __cpuid_insn(cp); 3820 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, 3821 cp); 3822 3823 /* 3824 * AMD uses ebx for some extended functions. 3825 */ 3826 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 3827 /* 3828 * While we're here, check for the AMD "Error 3829 * Pointer Zero/Restore" feature. This can be 3830 * used to setup the FP save handlers 3831 * appropriately. 3832 */ 3833 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 3834 cpi->cpi_fp_amd_save = 0; 3835 } else { 3836 cpi->cpi_fp_amd_save = 1; 3837 } 3838 3839 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) { 3840 add_x86_feature(featureset, 3841 X86FSET_CLZERO); 3842 } 3843 } 3844 3845 /* 3846 * Virtual and physical address limits from 3847 * cpuid override previously guessed values. 3848 */ 3849 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0); 3850 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8); 3851 break; 3852 default: 3853 break; 3854 } 3855 3856 /* 3857 * Get CPUID data about TSC Invariance in Deep C-State. 3858 */ 3859 switch (cpi->cpi_vendor) { 3860 case X86_VENDOR_Intel: 3861 case X86_VENDOR_AMD: 3862 if (cpi->cpi_maxeax >= 7) { 3863 cp = &cpi->cpi_extd[7]; 3864 cp->cp_eax = 0x80000007; 3865 cp->cp_ecx = 0; 3866 (void) __cpuid_insn(cp); 3867 } 3868 break; 3869 default: 3870 break; 3871 } 3872 } 3873 3874 cpuid_pass1_topology(cpu, featureset); 3875 cpuid_pass1_thermal(cpu, featureset); 3876 3877 /* 3878 * Synthesize chip "revision" and socket type 3879 */ 3880 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family, 3881 cpi->cpi_model, cpi->cpi_step); 3882 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor, 3883 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step); 3884 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family, 3885 cpi->cpi_model, cpi->cpi_step); 3886 3887 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 3888 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 && 3889 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 3890 /* Special handling for AMD FP not necessary. */ 3891 cpi->cpi_fp_amd_save = 0; 3892 } else { 3893 cpi->cpi_fp_amd_save = 1; 3894 } 3895 } 3896 3897 /* 3898 * Check the processor leaves that are used for security features. 3899 */ 3900 cpuid_scan_security(cpu, featureset); 3901 3902 pass1_done: 3903 cpi->cpi_pass = 1; 3904 } 3905 3906 /* 3907 * Make copies of the cpuid table entries we depend on, in 3908 * part for ease of parsing now, in part so that we have only 3909 * one place to correct any of it, in part for ease of 3910 * later export to userland, and in part so we can look at 3911 * this stuff in a crash dump. 
 */

/*ARGSUSED*/
void
cpuid_pass2(cpu_t *cpu)
{
	uint_t n, nmax;
	int i;
	struct cpuid_regs *cp;
	uint8_t *dp;
	uint32_t *iptr;
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;

	ASSERT(cpi->cpi_pass == 1);

	if (cpi->cpi_maxeax < 1)
		goto pass2_done;

	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
		nmax = NMAX_CPI_STD;
	/*
	 * (We already handled n == 0 and n == 1 in pass 1)
	 */
	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
		/*
		 * Leaves 6 and 7 were handled in pass 1.
		 */
		if (n == 6 || n == 7)
			continue;

		cp->cp_eax = n;

		/*
		 * CPUID function 4 expects %ecx to be initialized
		 * with an index which indicates which cache to return
		 * information about. The OS is expected to call function 4
		 * with %ecx set to 0, 1, 2, ... until it returns with
		 * EAX[4:0] set to 0, which indicates there are no more
		 * caches.
		 *
		 * Here, populate cpi_std[4] with the information returned by
		 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
		 * when dynamic memory allocation becomes available.
		 *
		 * Note: we need to explicitly initialize %ecx here, since
		 * function 4 may have been previously invoked.
		 */
		if (n == 4)
			cp->cp_ecx = 0;

		(void) __cpuid_insn(cp);
		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
		switch (n) {
		case 2:
			/*
			 * "the lower 8 bits of the %eax register
			 * contain a value that identifies the number
			 * of times the cpuid [instruction] has to be
			 * executed to obtain a complete image of the
			 * processor's caching systems."
			 *
			 * How *do* they make this stuff up?
			 */
			cpi->cpi_ncache = sizeof (*cp) *
			    BITX(cp->cp_eax, 7, 0);
			if (cpi->cpi_ncache == 0)
				break;
			cpi->cpi_ncache--;	/* skip count byte */

			/*
			 * Well, for now, rather than attempt to implement
			 * this slightly dubious algorithm, we just look
			 * at the first 15 descriptor bytes.
			 */
			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
				cpi->cpi_ncache = sizeof (*cp) - 1;

			dp = cpi->cpi_cacheinfo;
			if (BITX(cp->cp_eax, 31, 31) == 0) {
				uint8_t *p = (void *)&cp->cp_eax;
				for (i = 1; i < 4; i++)
					if (p[i] != 0)
						*dp++ = p[i];
			}
			if (BITX(cp->cp_ebx, 31, 31) == 0) {
				uint8_t *p = (void *)&cp->cp_ebx;
				for (i = 0; i < 4; i++)
					if (p[i] != 0)
						*dp++ = p[i];
			}
			if (BITX(cp->cp_ecx, 31, 31) == 0) {
				uint8_t *p = (void *)&cp->cp_ecx;
				for (i = 0; i < 4; i++)
					if (p[i] != 0)
						*dp++ = p[i];
			}
			if (BITX(cp->cp_edx, 31, 31) == 0) {
				uint8_t *p = (void *)&cp->cp_edx;
				for (i = 0; i < 4; i++)
					if (p[i] != 0)
						*dp++ = p[i];
			}
			break;

		case 3:	/* Processor serial number, if PSN supported */
			break;

		case 4:	/* Deterministic cache parameters */
			break;

		case 5:	/* Monitor/Mwait parameters */
		{
			size_t mwait_size;

			/*
			 * Check cpi_mwait.support, which was set in
			 * cpuid_pass1().
			 */
			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
				break;

			/*
			 * Protect ourselves from an insane mwait line size.
			 * Workaround for incomplete hardware emulator(s).
4035 */ 4036 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi); 4037 if (mwait_size < sizeof (uint32_t) || 4038 !ISP2(mwait_size)) { 4039 #if DEBUG 4040 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait " 4041 "size %ld", cpu->cpu_id, (long)mwait_size); 4042 #endif 4043 break; 4044 } 4045 4046 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi); 4047 cpi->cpi_mwait.mon_max = mwait_size; 4048 if (MWAIT_EXTENSION(cpi)) { 4049 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS; 4050 if (MWAIT_INT_ENABLE(cpi)) 4051 cpi->cpi_mwait.support |= 4052 MWAIT_ECX_INT_ENABLE; 4053 } 4054 break; 4055 } 4056 default: 4057 break; 4058 } 4059 } 4060 4061 /* 4062 * XSAVE enumeration 4063 */ 4064 if (cpi->cpi_maxeax >= 0xD) { 4065 struct cpuid_regs regs; 4066 boolean_t cpuid_d_valid = B_TRUE; 4067 4068 cp = ®s; 4069 cp->cp_eax = 0xD; 4070 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 4071 4072 (void) __cpuid_insn(cp); 4073 4074 /* 4075 * Sanity checks for debug 4076 */ 4077 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 || 4078 (cp->cp_eax & XFEATURE_SSE) == 0) { 4079 cpuid_d_valid = B_FALSE; 4080 } 4081 4082 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax; 4083 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx; 4084 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx; 4085 4086 /* 4087 * If the hw supports AVX, get the size and offset in the save 4088 * area for the ymm state. 4089 */ 4090 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) { 4091 cp->cp_eax = 0xD; 4092 cp->cp_ecx = 2; 4093 cp->cp_edx = cp->cp_ebx = 0; 4094 4095 (void) __cpuid_insn(cp); 4096 4097 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET || 4098 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) { 4099 cpuid_d_valid = B_FALSE; 4100 } 4101 4102 cpi->cpi_xsave.ymm_size = cp->cp_eax; 4103 cpi->cpi_xsave.ymm_offset = cp->cp_ebx; 4104 } 4105 4106 /* 4107 * If the hw supports MPX, get the size and offset in the 4108 * save area for BNDREGS and BNDCSR. 4109 */ 4110 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) { 4111 cp->cp_eax = 0xD; 4112 cp->cp_ecx = 3; 4113 cp->cp_edx = cp->cp_ebx = 0; 4114 4115 (void) __cpuid_insn(cp); 4116 4117 cpi->cpi_xsave.bndregs_size = cp->cp_eax; 4118 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx; 4119 4120 cp->cp_eax = 0xD; 4121 cp->cp_ecx = 4; 4122 cp->cp_edx = cp->cp_ebx = 0; 4123 4124 (void) __cpuid_insn(cp); 4125 4126 cpi->cpi_xsave.bndcsr_size = cp->cp_eax; 4127 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx; 4128 } 4129 4130 /* 4131 * If the hw supports AVX512, get the size and offset in the 4132 * save area for the opmask registers and zmm state. 
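		 *
		 * (For reference: each leaf 0xD sub-leaf beyond 1 describes a
		 * single XSAVE state component, with %eax giving the size of
		 * its save area in bytes and %ebx its offset in the standard,
		 * non-compacted layout. The AVX sub-leaf handled above, for
		 * example, is expected to report a 256-byte area at offset
		 * 576, which is exactly what the CPUID_LEAFD_2_YMM_SIZE and
		 * CPUID_LEAFD_2_YMM_OFFSET sanity check relies on.)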
4133 */ 4134 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) { 4135 cp->cp_eax = 0xD; 4136 cp->cp_ecx = 5; 4137 cp->cp_edx = cp->cp_ebx = 0; 4138 4139 (void) __cpuid_insn(cp); 4140 4141 cpi->cpi_xsave.opmask_size = cp->cp_eax; 4142 cpi->cpi_xsave.opmask_offset = cp->cp_ebx; 4143 4144 cp->cp_eax = 0xD; 4145 cp->cp_ecx = 6; 4146 cp->cp_edx = cp->cp_ebx = 0; 4147 4148 (void) __cpuid_insn(cp); 4149 4150 cpi->cpi_xsave.zmmlo_size = cp->cp_eax; 4151 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx; 4152 4153 cp->cp_eax = 0xD; 4154 cp->cp_ecx = 7; 4155 cp->cp_edx = cp->cp_ebx = 0; 4156 4157 (void) __cpuid_insn(cp); 4158 4159 cpi->cpi_xsave.zmmhi_size = cp->cp_eax; 4160 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx; 4161 } 4162 4163 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) { 4164 xsave_state_size = 0; 4165 } else if (cpuid_d_valid) { 4166 xsave_state_size = cpi->cpi_xsave.xsav_max_size; 4167 } else { 4168 /* Broken CPUID 0xD, probably in HVM */ 4169 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid " 4170 "value: hw_low = %d, hw_high = %d, xsave_size = %d" 4171 ", ymm_size = %d, ymm_offset = %d\n", 4172 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low, 4173 cpi->cpi_xsave.xsav_hw_features_high, 4174 (int)cpi->cpi_xsave.xsav_max_size, 4175 (int)cpi->cpi_xsave.ymm_size, 4176 (int)cpi->cpi_xsave.ymm_offset); 4177 4178 if (xsave_state_size != 0) { 4179 /* 4180 * This must be a non-boot CPU. We cannot 4181 * continue, because boot cpu has already 4182 * enabled XSAVE. 4183 */ 4184 ASSERT(cpu->cpu_id != 0); 4185 cmn_err(CE_PANIC, "cpu%d: we have already " 4186 "enabled XSAVE on boot cpu, cannot " 4187 "continue.", cpu->cpu_id); 4188 } else { 4189 /* 4190 * If we reached here on the boot CPU, it's also 4191 * almost certain that we'll reach here on the 4192 * non-boot CPUs. When we're here on a boot CPU 4193 * we should disable the feature, on a non-boot 4194 * CPU we need to confirm that we have. 
4195 */ 4196 if (cpu->cpu_id == 0) { 4197 remove_x86_feature(x86_featureset, 4198 X86FSET_XSAVE); 4199 remove_x86_feature(x86_featureset, 4200 X86FSET_AVX); 4201 remove_x86_feature(x86_featureset, 4202 X86FSET_F16C); 4203 remove_x86_feature(x86_featureset, 4204 X86FSET_BMI1); 4205 remove_x86_feature(x86_featureset, 4206 X86FSET_BMI2); 4207 remove_x86_feature(x86_featureset, 4208 X86FSET_FMA); 4209 remove_x86_feature(x86_featureset, 4210 X86FSET_AVX2); 4211 remove_x86_feature(x86_featureset, 4212 X86FSET_MPX); 4213 remove_x86_feature(x86_featureset, 4214 X86FSET_AVX512F); 4215 remove_x86_feature(x86_featureset, 4216 X86FSET_AVX512DQ); 4217 remove_x86_feature(x86_featureset, 4218 X86FSET_AVX512PF); 4219 remove_x86_feature(x86_featureset, 4220 X86FSET_AVX512ER); 4221 remove_x86_feature(x86_featureset, 4222 X86FSET_AVX512CD); 4223 remove_x86_feature(x86_featureset, 4224 X86FSET_AVX512BW); 4225 remove_x86_feature(x86_featureset, 4226 X86FSET_AVX512VL); 4227 remove_x86_feature(x86_featureset, 4228 X86FSET_AVX512FMA); 4229 remove_x86_feature(x86_featureset, 4230 X86FSET_AVX512VBMI); 4231 remove_x86_feature(x86_featureset, 4232 X86FSET_AVX512VNNI); 4233 remove_x86_feature(x86_featureset, 4234 X86FSET_AVX512VPOPCDQ); 4235 remove_x86_feature(x86_featureset, 4236 X86FSET_AVX512NNIW); 4237 remove_x86_feature(x86_featureset, 4238 X86FSET_AVX512FMAPS); 4239 4240 CPI_FEATURES_ECX(cpi) &= 4241 ~CPUID_INTC_ECX_XSAVE; 4242 CPI_FEATURES_ECX(cpi) &= 4243 ~CPUID_INTC_ECX_AVX; 4244 CPI_FEATURES_ECX(cpi) &= 4245 ~CPUID_INTC_ECX_F16C; 4246 CPI_FEATURES_ECX(cpi) &= 4247 ~CPUID_INTC_ECX_FMA; 4248 CPI_FEATURES_7_0_EBX(cpi) &= 4249 ~CPUID_INTC_EBX_7_0_BMI1; 4250 CPI_FEATURES_7_0_EBX(cpi) &= 4251 ~CPUID_INTC_EBX_7_0_BMI2; 4252 CPI_FEATURES_7_0_EBX(cpi) &= 4253 ~CPUID_INTC_EBX_7_0_AVX2; 4254 CPI_FEATURES_7_0_EBX(cpi) &= 4255 ~CPUID_INTC_EBX_7_0_MPX; 4256 CPI_FEATURES_7_0_EBX(cpi) &= 4257 ~CPUID_INTC_EBX_7_0_ALL_AVX512; 4258 4259 CPI_FEATURES_7_0_ECX(cpi) &= 4260 ~CPUID_INTC_ECX_7_0_ALL_AVX512; 4261 4262 CPI_FEATURES_7_0_EDX(cpi) &= 4263 ~CPUID_INTC_EDX_7_0_ALL_AVX512; 4264 4265 xsave_force_disable = B_TRUE; 4266 } else { 4267 VERIFY(is_x86_feature(x86_featureset, 4268 X86FSET_XSAVE) == B_FALSE); 4269 } 4270 } 4271 } 4272 } 4273 4274 4275 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) 4276 goto pass2_done; 4277 4278 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD) 4279 nmax = NMAX_CPI_EXTD; 4280 /* 4281 * Copy the extended properties, fixing them as we go. 4282 * (We already handled n == 0 and n == 1 in pass 1) 4283 */ 4284 iptr = (void *)cpi->cpi_brandstr; 4285 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) { 4286 cp->cp_eax = CPUID_LEAF_EXT_0 + n; 4287 (void) __cpuid_insn(cp); 4288 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n, 4289 cp); 4290 switch (n) { 4291 case 2: 4292 case 3: 4293 case 4: 4294 /* 4295 * Extract the brand string 4296 */ 4297 *iptr++ = cp->cp_eax; 4298 *iptr++ = cp->cp_ebx; 4299 *iptr++ = cp->cp_ecx; 4300 *iptr++ = cp->cp_edx; 4301 break; 4302 case 5: 4303 switch (cpi->cpi_vendor) { 4304 case X86_VENDOR_AMD: 4305 /* 4306 * The Athlon and Duron were the first 4307 * parts to report the sizes of the 4308 * TLB for large pages. Before then, 4309 * we don't trust the data. 
4310 */ 4311 if (cpi->cpi_family < 6 || 4312 (cpi->cpi_family == 6 && 4313 cpi->cpi_model < 1)) 4314 cp->cp_eax = 0; 4315 break; 4316 default: 4317 break; 4318 } 4319 break; 4320 case 6: 4321 switch (cpi->cpi_vendor) { 4322 case X86_VENDOR_AMD: 4323 /* 4324 * The Athlon and Duron were the first 4325 * AMD parts with L2 TLB's. 4326 * Before then, don't trust the data. 4327 */ 4328 if (cpi->cpi_family < 6 || 4329 cpi->cpi_family == 6 && 4330 cpi->cpi_model < 1) 4331 cp->cp_eax = cp->cp_ebx = 0; 4332 /* 4333 * AMD Duron rev A0 reports L2 4334 * cache size incorrectly as 1K 4335 * when it is really 64K 4336 */ 4337 if (cpi->cpi_family == 6 && 4338 cpi->cpi_model == 3 && 4339 cpi->cpi_step == 0) { 4340 cp->cp_ecx &= 0xffff; 4341 cp->cp_ecx |= 0x400000; 4342 } 4343 break; 4344 case X86_VENDOR_Cyrix: /* VIA C3 */ 4345 /* 4346 * VIA C3 processors are a bit messed 4347 * up w.r.t. encoding cache sizes in %ecx 4348 */ 4349 if (cpi->cpi_family != 6) 4350 break; 4351 /* 4352 * model 7 and 8 were incorrectly encoded 4353 * 4354 * xxx is model 8 really broken? 4355 */ 4356 if (cpi->cpi_model == 7 || 4357 cpi->cpi_model == 8) 4358 cp->cp_ecx = 4359 BITX(cp->cp_ecx, 31, 24) << 16 | 4360 BITX(cp->cp_ecx, 23, 16) << 12 | 4361 BITX(cp->cp_ecx, 15, 8) << 8 | 4362 BITX(cp->cp_ecx, 7, 0); 4363 /* 4364 * model 9 stepping 1 has wrong associativity 4365 */ 4366 if (cpi->cpi_model == 9 && cpi->cpi_step == 1) 4367 cp->cp_ecx |= 8 << 12; 4368 break; 4369 case X86_VENDOR_Intel: 4370 /* 4371 * Extended L2 Cache features function. 4372 * First appeared on Prescott. 4373 */ 4374 default: 4375 break; 4376 } 4377 break; 4378 default: 4379 break; 4380 } 4381 } 4382 4383 pass2_done: 4384 cpi->cpi_pass = 2; 4385 } 4386 4387 static const char * 4388 intel_cpubrand(const struct cpuid_info *cpi) 4389 { 4390 int i; 4391 4392 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4393 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) 4394 return ("i486"); 4395 4396 switch (cpi->cpi_family) { 4397 case 5: 4398 return ("Intel Pentium(r)"); 4399 case 6: 4400 switch (cpi->cpi_model) { 4401 uint_t celeron, xeon; 4402 const struct cpuid_regs *cp; 4403 case 0: 4404 case 1: 4405 case 2: 4406 return ("Intel Pentium(r) Pro"); 4407 case 3: 4408 case 4: 4409 return ("Intel Pentium(r) II"); 4410 case 6: 4411 return ("Intel Celeron(r)"); 4412 case 5: 4413 case 7: 4414 celeron = xeon = 0; 4415 cp = &cpi->cpi_std[2]; /* cache info */ 4416 4417 for (i = 1; i < 4; i++) { 4418 uint_t tmp; 4419 4420 tmp = (cp->cp_eax >> (8 * i)) & 0xff; 4421 if (tmp == 0x40) 4422 celeron++; 4423 if (tmp >= 0x44 && tmp <= 0x45) 4424 xeon++; 4425 } 4426 4427 for (i = 0; i < 2; i++) { 4428 uint_t tmp; 4429 4430 tmp = (cp->cp_ebx >> (8 * i)) & 0xff; 4431 if (tmp == 0x40) 4432 celeron++; 4433 else if (tmp >= 0x44 && tmp <= 0x45) 4434 xeon++; 4435 } 4436 4437 for (i = 0; i < 4; i++) { 4438 uint_t tmp; 4439 4440 tmp = (cp->cp_ecx >> (8 * i)) & 0xff; 4441 if (tmp == 0x40) 4442 celeron++; 4443 else if (tmp >= 0x44 && tmp <= 0x45) 4444 xeon++; 4445 } 4446 4447 for (i = 0; i < 4; i++) { 4448 uint_t tmp; 4449 4450 tmp = (cp->cp_edx >> (8 * i)) & 0xff; 4451 if (tmp == 0x40) 4452 celeron++; 4453 else if (tmp >= 0x44 && tmp <= 0x45) 4454 xeon++; 4455 } 4456 4457 if (celeron) 4458 return ("Intel Celeron(r)"); 4459 if (xeon) 4460 return (cpi->cpi_model == 5 ? 4461 "Intel Pentium(r) II Xeon(tm)" : 4462 "Intel Pentium(r) III Xeon(tm)"); 4463 return (cpi->cpi_model == 5 ? 
4464 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" : 4465 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)"); 4466 default: 4467 break; 4468 } 4469 default: 4470 break; 4471 } 4472 4473 /* BrandID is present if the field is nonzero */ 4474 if (cpi->cpi_brandid != 0) { 4475 static const struct { 4476 uint_t bt_bid; 4477 const char *bt_str; 4478 } brand_tbl[] = { 4479 { 0x1, "Intel(r) Celeron(r)" }, 4480 { 0x2, "Intel(r) Pentium(r) III" }, 4481 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" }, 4482 { 0x4, "Intel(r) Pentium(r) III" }, 4483 { 0x6, "Mobile Intel(r) Pentium(r) III" }, 4484 { 0x7, "Mobile Intel(r) Celeron(r)" }, 4485 { 0x8, "Intel(r) Pentium(r) 4" }, 4486 { 0x9, "Intel(r) Pentium(r) 4" }, 4487 { 0xa, "Intel(r) Celeron(r)" }, 4488 { 0xb, "Intel(r) Xeon(tm)" }, 4489 { 0xc, "Intel(r) Xeon(tm) MP" }, 4490 { 0xe, "Mobile Intel(r) Pentium(r) 4" }, 4491 { 0xf, "Mobile Intel(r) Celeron(r)" }, 4492 { 0x11, "Mobile Genuine Intel(r)" }, 4493 { 0x12, "Intel(r) Celeron(r) M" }, 4494 { 0x13, "Mobile Intel(r) Celeron(r)" }, 4495 { 0x14, "Intel(r) Celeron(r)" }, 4496 { 0x15, "Mobile Genuine Intel(r)" }, 4497 { 0x16, "Intel(r) Pentium(r) M" }, 4498 { 0x17, "Mobile Intel(r) Celeron(r)" } 4499 }; 4500 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]); 4501 uint_t sgn; 4502 4503 sgn = (cpi->cpi_family << 8) | 4504 (cpi->cpi_model << 4) | cpi->cpi_step; 4505 4506 for (i = 0; i < btblmax; i++) 4507 if (brand_tbl[i].bt_bid == cpi->cpi_brandid) 4508 break; 4509 if (i < btblmax) { 4510 if (sgn == 0x6b1 && cpi->cpi_brandid == 3) 4511 return ("Intel(r) Celeron(r)"); 4512 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb) 4513 return ("Intel(r) Xeon(tm) MP"); 4514 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe) 4515 return ("Intel(r) Xeon(tm)"); 4516 return (brand_tbl[i].bt_str); 4517 } 4518 } 4519 4520 return (NULL); 4521 } 4522 4523 static const char * 4524 amd_cpubrand(const struct cpuid_info *cpi) 4525 { 4526 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4527 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) 4528 return ("i486 compatible"); 4529 4530 switch (cpi->cpi_family) { 4531 case 5: 4532 switch (cpi->cpi_model) { 4533 case 0: 4534 case 1: 4535 case 2: 4536 case 3: 4537 case 4: 4538 case 5: 4539 return ("AMD-K5(r)"); 4540 case 6: 4541 case 7: 4542 return ("AMD-K6(r)"); 4543 case 8: 4544 return ("AMD-K6(r)-2"); 4545 case 9: 4546 return ("AMD-K6(r)-III"); 4547 default: 4548 return ("AMD (family 5)"); 4549 } 4550 case 6: 4551 switch (cpi->cpi_model) { 4552 case 1: 4553 return ("AMD-K7(tm)"); 4554 case 0: 4555 case 2: 4556 case 4: 4557 return ("AMD Athlon(tm)"); 4558 case 3: 4559 case 7: 4560 return ("AMD Duron(tm)"); 4561 case 6: 4562 case 8: 4563 case 10: 4564 /* 4565 * Use the L2 cache size to distinguish 4566 */ 4567 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ? 
4568 "AMD Athlon(tm)" : "AMD Duron(tm)"); 4569 default: 4570 return ("AMD (family 6)"); 4571 } 4572 default: 4573 break; 4574 } 4575 4576 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 && 4577 cpi->cpi_brandid != 0) { 4578 switch (BITX(cpi->cpi_brandid, 7, 5)) { 4579 case 3: 4580 return ("AMD Opteron(tm) UP 1xx"); 4581 case 4: 4582 return ("AMD Opteron(tm) DP 2xx"); 4583 case 5: 4584 return ("AMD Opteron(tm) MP 8xx"); 4585 default: 4586 return ("AMD Opteron(tm)"); 4587 } 4588 } 4589 4590 return (NULL); 4591 } 4592 4593 static const char * 4594 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type) 4595 { 4596 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4597 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 || 4598 type == X86_TYPE_CYRIX_486) 4599 return ("i486 compatible"); 4600 4601 switch (type) { 4602 case X86_TYPE_CYRIX_6x86: 4603 return ("Cyrix 6x86"); 4604 case X86_TYPE_CYRIX_6x86L: 4605 return ("Cyrix 6x86L"); 4606 case X86_TYPE_CYRIX_6x86MX: 4607 return ("Cyrix 6x86MX"); 4608 case X86_TYPE_CYRIX_GXm: 4609 return ("Cyrix GXm"); 4610 case X86_TYPE_CYRIX_MediaGX: 4611 return ("Cyrix MediaGX"); 4612 case X86_TYPE_CYRIX_MII: 4613 return ("Cyrix M2"); 4614 case X86_TYPE_VIA_CYRIX_III: 4615 return ("VIA Cyrix M3"); 4616 default: 4617 /* 4618 * Have another wild guess .. 4619 */ 4620 if (cpi->cpi_family == 4 && cpi->cpi_model == 9) 4621 return ("Cyrix 5x86"); 4622 else if (cpi->cpi_family == 5) { 4623 switch (cpi->cpi_model) { 4624 case 2: 4625 return ("Cyrix 6x86"); /* Cyrix M1 */ 4626 case 4: 4627 return ("Cyrix MediaGX"); 4628 default: 4629 break; 4630 } 4631 } else if (cpi->cpi_family == 6) { 4632 switch (cpi->cpi_model) { 4633 case 0: 4634 return ("Cyrix 6x86MX"); /* Cyrix M2? */ 4635 case 5: 4636 case 6: 4637 case 7: 4638 case 8: 4639 case 9: 4640 return ("VIA C3"); 4641 default: 4642 break; 4643 } 4644 } 4645 break; 4646 } 4647 return (NULL); 4648 } 4649 4650 /* 4651 * This only gets called in the case that the CPU extended 4652 * feature brand string (0x80000002, 0x80000003, 0x80000004) 4653 * aren't available, or contain null bytes for some reason. 4654 */ 4655 static void 4656 fabricate_brandstr(struct cpuid_info *cpi) 4657 { 4658 const char *brand = NULL; 4659 4660 switch (cpi->cpi_vendor) { 4661 case X86_VENDOR_Intel: 4662 brand = intel_cpubrand(cpi); 4663 break; 4664 case X86_VENDOR_AMD: 4665 brand = amd_cpubrand(cpi); 4666 break; 4667 case X86_VENDOR_Cyrix: 4668 brand = cyrix_cpubrand(cpi, x86_type); 4669 break; 4670 case X86_VENDOR_NexGen: 4671 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 4672 brand = "NexGen Nx586"; 4673 break; 4674 case X86_VENDOR_Centaur: 4675 if (cpi->cpi_family == 5) 4676 switch (cpi->cpi_model) { 4677 case 4: 4678 brand = "Centaur C6"; 4679 break; 4680 case 8: 4681 brand = "Centaur C2"; 4682 break; 4683 case 9: 4684 brand = "Centaur C3"; 4685 break; 4686 default: 4687 break; 4688 } 4689 break; 4690 case X86_VENDOR_Rise: 4691 if (cpi->cpi_family == 5 && 4692 (cpi->cpi_model == 0 || cpi->cpi_model == 2)) 4693 brand = "Rise mP6"; 4694 break; 4695 case X86_VENDOR_SiS: 4696 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 4697 brand = "SiS 55x"; 4698 break; 4699 case X86_VENDOR_TM: 4700 if (cpi->cpi_family == 5 && cpi->cpi_model == 4) 4701 brand = "Transmeta Crusoe TM3x00 or TM5x00"; 4702 break; 4703 case X86_VENDOR_NSC: 4704 case X86_VENDOR_UMC: 4705 default: 4706 break; 4707 } 4708 if (brand) { 4709 (void) strcpy((char *)cpi->cpi_brandstr, brand); 4710 return; 4711 } 4712 4713 /* 4714 * If all else fails ... 
4715 */ 4716 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr), 4717 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family, 4718 cpi->cpi_model, cpi->cpi_step); 4719 } 4720 4721 /* 4722 * This routine is called just after kernel memory allocation 4723 * becomes available on cpu0, and as part of mp_startup() on 4724 * the other cpus. 4725 * 4726 * Fixup the brand string, and collect any information from cpuid 4727 * that requires dynamically allocated storage to represent. 4728 */ 4729 /*ARGSUSED*/ 4730 void 4731 cpuid_pass3(cpu_t *cpu) 4732 { 4733 int i, max, shft, level, size; 4734 struct cpuid_regs regs; 4735 struct cpuid_regs *cp; 4736 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 4737 4738 ASSERT(cpi->cpi_pass == 2); 4739 4740 /* 4741 * Deterministic cache parameters 4742 * 4743 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The 4744 * values that are present are currently defined to be the same. This 4745 * means we can use the same logic to parse it as long as we use the 4746 * appropriate leaf to get the data. If you're updating this, make sure 4747 * you're careful about which vendor supports which aspect. 4748 * 4749 * Take this opportunity to detect the number of threads sharing the 4750 * last level cache, and construct a corresponding cache id. The 4751 * respective cpuid_info members are initialized to the default case of 4752 * "no last level cache sharing". 4753 */ 4754 cpi->cpi_ncpu_shr_last_cache = 1; 4755 cpi->cpi_last_lvl_cacheid = cpu->cpu_id; 4756 4757 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) || 4758 (cpi->cpi_vendor == X86_VENDOR_AMD && 4759 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d && 4760 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) { 4761 uint32_t leaf; 4762 4763 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 4764 leaf = 4; 4765 } else { 4766 leaf = CPUID_LEAF_EXT_1d; 4767 } 4768 4769 /* 4770 * Find the # of elements (size) returned by the leaf and along 4771 * the way detect last level cache sharing details. 4772 */ 4773 bzero(®s, sizeof (regs)); 4774 cp = ®s; 4775 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) { 4776 cp->cp_eax = leaf; 4777 cp->cp_ecx = i; 4778 4779 (void) __cpuid_insn(cp); 4780 4781 if (CPI_CACHE_TYPE(cp) == 0) 4782 break; 4783 level = CPI_CACHE_LVL(cp); 4784 if (level > max) { 4785 max = level; 4786 cpi->cpi_ncpu_shr_last_cache = 4787 CPI_NTHR_SHR_CACHE(cp) + 1; 4788 } 4789 } 4790 cpi->cpi_cache_leaf_size = size = i; 4791 4792 /* 4793 * Allocate the cpi_cache_leaves array. The first element 4794 * references the regs for the corresponding leaf with %ecx set 4795 * to 0. This was gathered in cpuid_pass2(). 4796 */ 4797 if (size > 0) { 4798 cpi->cpi_cache_leaves = 4799 kmem_alloc(size * sizeof (cp), KM_SLEEP); 4800 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 4801 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4]; 4802 } else { 4803 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d]; 4804 } 4805 4806 /* 4807 * Allocate storage to hold the additional regs 4808 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size. 4809 * 4810 * The regs for the leaf, %ecx == 0 has already 4811 * been allocated as indicated above. 4812 */ 4813 for (i = 1; i < size; i++) { 4814 cp = cpi->cpi_cache_leaves[i] = 4815 kmem_zalloc(sizeof (regs), KM_SLEEP); 4816 cp->cp_eax = leaf; 4817 cp->cp_ecx = i; 4818 4819 (void) __cpuid_insn(cp); 4820 } 4821 } 4822 /* 4823 * Determine the number of bits needed to represent 4824 * the number of CPUs sharing the last level cache. 
		 *
		 * Shift off that number of bits from the APIC id to
		 * derive the cache id.
		 */
		shft = 0;
		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
			shft++;
		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
	}

	/*
	 * Now fixup the brand string
	 */
	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
		fabricate_brandstr(cpi);
	} else {

		/*
		 * If we successfully extracted a brand string from the cpuid
		 * instruction, clean it up by removing leading spaces and
		 * similar junk.
		 */
		if (cpi->cpi_brandstr[0]) {
			size_t maxlen = sizeof (cpi->cpi_brandstr);
			char *src, *dst;

			dst = src = (char *)cpi->cpi_brandstr;
			src[maxlen - 1] = '\0';
			/*
			 * strip leading spaces
			 */
			while (*src == ' ')
				src++;
			/*
			 * Remove any "Genuine" or "Authentic" prefixes
			 */
			if (strncmp(src, "Genuine ", 8) == 0)
				src += 8;
			if (strncmp(src, "Authentic ", 10) == 0)
				src += 10;

			/*
			 * Now do an in-place copy.
			 * Map (R) to (r) and (TM) to (tm).
			 * The era of teletypes is long gone, and there's
			 * -really- no need to shout.
			 */
			while (*src != '\0') {
				if (src[0] == '(') {
					if (strncmp(src + 1, "R)", 2) == 0) {
						(void) strncpy(dst, "(r)", 3);
						src += 3;
						dst += 3;
						continue;
					}
					if (strncmp(src + 1, "TM)", 3) == 0) {
						(void) strncpy(dst, "(tm)", 4);
						src += 4;
						dst += 4;
						continue;
					}
				}
				*dst++ = *src++;
			}
			*dst = '\0';

			/*
			 * Finally, remove any trailing spaces
			 */
			while (--dst > cpi->cpi_brandstr)
				if (*dst == ' ')
					*dst = '\0';
				else
					break;
		} else
			fabricate_brandstr(cpi);
	}
	cpi->cpi_pass = 3;
}

/*
 * This routine is called out of bind_hwcap() much later in the life
 * of the kernel (post_startup()). The job of this routine is to resolve
 * the hardware feature support and kernel support for those features into
 * what we're actually going to tell applications via the aux vector.
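 *
 * (As an example of the flow, not an exhaustive description of the hwcap
 * plumbing: if both the CPU and the kernel support SSE4.2, the
 * AV_386_SSE4_2 bit computed below lands in the AT_SUN_HWCAP aux vector
 * entry of each process, where userland can inspect it with getisax() or
 * via isainfo -x.)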
4910 */ 4911 void 4912 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out) 4913 { 4914 struct cpuid_info *cpi; 4915 uint_t hwcap_flags = 0, hwcap_flags_2 = 0; 4916 4917 if (cpu == NULL) 4918 cpu = CPU; 4919 cpi = cpu->cpu_m.mcpu_cpi; 4920 4921 ASSERT(cpi->cpi_pass == 3); 4922 4923 if (cpi->cpi_maxeax >= 1) { 4924 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES]; 4925 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES]; 4926 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES]; 4927 4928 *edx = CPI_FEATURES_EDX(cpi); 4929 *ecx = CPI_FEATURES_ECX(cpi); 4930 *ebx = CPI_FEATURES_7_0_EBX(cpi); 4931 4932 /* 4933 * [these require explicit kernel support] 4934 */ 4935 if (!is_x86_feature(x86_featureset, X86FSET_SEP)) 4936 *edx &= ~CPUID_INTC_EDX_SEP; 4937 4938 if (!is_x86_feature(x86_featureset, X86FSET_SSE)) 4939 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE); 4940 if (!is_x86_feature(x86_featureset, X86FSET_SSE2)) 4941 *edx &= ~CPUID_INTC_EDX_SSE2; 4942 4943 if (!is_x86_feature(x86_featureset, X86FSET_HTT)) 4944 *edx &= ~CPUID_INTC_EDX_HTT; 4945 4946 if (!is_x86_feature(x86_featureset, X86FSET_SSE3)) 4947 *ecx &= ~CPUID_INTC_ECX_SSE3; 4948 4949 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3)) 4950 *ecx &= ~CPUID_INTC_ECX_SSSE3; 4951 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1)) 4952 *ecx &= ~CPUID_INTC_ECX_SSE4_1; 4953 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2)) 4954 *ecx &= ~CPUID_INTC_ECX_SSE4_2; 4955 if (!is_x86_feature(x86_featureset, X86FSET_AES)) 4956 *ecx &= ~CPUID_INTC_ECX_AES; 4957 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ)) 4958 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ; 4959 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) 4960 *ecx &= ~(CPUID_INTC_ECX_XSAVE | 4961 CPUID_INTC_ECX_OSXSAVE); 4962 if (!is_x86_feature(x86_featureset, X86FSET_AVX)) 4963 *ecx &= ~CPUID_INTC_ECX_AVX; 4964 if (!is_x86_feature(x86_featureset, X86FSET_F16C)) 4965 *ecx &= ~CPUID_INTC_ECX_F16C; 4966 if (!is_x86_feature(x86_featureset, X86FSET_FMA)) 4967 *ecx &= ~CPUID_INTC_ECX_FMA; 4968 if (!is_x86_feature(x86_featureset, X86FSET_BMI1)) 4969 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 4970 if (!is_x86_feature(x86_featureset, X86FSET_BMI2)) 4971 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 4972 if (!is_x86_feature(x86_featureset, X86FSET_AVX2)) 4973 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 4974 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED)) 4975 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED; 4976 if (!is_x86_feature(x86_featureset, X86FSET_ADX)) 4977 *ebx &= ~CPUID_INTC_EBX_7_0_ADX; 4978 4979 /* 4980 * [no explicit support required beyond x87 fp context] 4981 */ 4982 if (!fpu_exists) 4983 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX); 4984 4985 /* 4986 * Now map the supported feature vector to things that we 4987 * think userland will care about. 
4988 */ 4989 if (*edx & CPUID_INTC_EDX_SEP) 4990 hwcap_flags |= AV_386_SEP; 4991 if (*edx & CPUID_INTC_EDX_SSE) 4992 hwcap_flags |= AV_386_FXSR | AV_386_SSE; 4993 if (*edx & CPUID_INTC_EDX_SSE2) 4994 hwcap_flags |= AV_386_SSE2; 4995 if (*ecx & CPUID_INTC_ECX_SSE3) 4996 hwcap_flags |= AV_386_SSE3; 4997 if (*ecx & CPUID_INTC_ECX_SSSE3) 4998 hwcap_flags |= AV_386_SSSE3; 4999 if (*ecx & CPUID_INTC_ECX_SSE4_1) 5000 hwcap_flags |= AV_386_SSE4_1; 5001 if (*ecx & CPUID_INTC_ECX_SSE4_2) 5002 hwcap_flags |= AV_386_SSE4_2; 5003 if (*ecx & CPUID_INTC_ECX_MOVBE) 5004 hwcap_flags |= AV_386_MOVBE; 5005 if (*ecx & CPUID_INTC_ECX_AES) 5006 hwcap_flags |= AV_386_AES; 5007 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ) 5008 hwcap_flags |= AV_386_PCLMULQDQ; 5009 if ((*ecx & CPUID_INTC_ECX_XSAVE) && 5010 (*ecx & CPUID_INTC_ECX_OSXSAVE)) { 5011 hwcap_flags |= AV_386_XSAVE; 5012 5013 if (*ecx & CPUID_INTC_ECX_AVX) { 5014 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi); 5015 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi); 5016 5017 hwcap_flags |= AV_386_AVX; 5018 if (*ecx & CPUID_INTC_ECX_F16C) 5019 hwcap_flags_2 |= AV_386_2_F16C; 5020 if (*ecx & CPUID_INTC_ECX_FMA) 5021 hwcap_flags_2 |= AV_386_2_FMA; 5022 5023 if (*ebx & CPUID_INTC_EBX_7_0_BMI1) 5024 hwcap_flags_2 |= AV_386_2_BMI1; 5025 if (*ebx & CPUID_INTC_EBX_7_0_BMI2) 5026 hwcap_flags_2 |= AV_386_2_BMI2; 5027 if (*ebx & CPUID_INTC_EBX_7_0_AVX2) 5028 hwcap_flags_2 |= AV_386_2_AVX2; 5029 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F) 5030 hwcap_flags_2 |= AV_386_2_AVX512F; 5031 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ) 5032 hwcap_flags_2 |= AV_386_2_AVX512DQ; 5033 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA) 5034 hwcap_flags_2 |= AV_386_2_AVX512IFMA; 5035 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF) 5036 hwcap_flags_2 |= AV_386_2_AVX512PF; 5037 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER) 5038 hwcap_flags_2 |= AV_386_2_AVX512ER; 5039 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD) 5040 hwcap_flags_2 |= AV_386_2_AVX512CD; 5041 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW) 5042 hwcap_flags_2 |= AV_386_2_AVX512BW; 5043 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL) 5044 hwcap_flags_2 |= AV_386_2_AVX512VL; 5045 5046 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI) 5047 hwcap_flags_2 |= AV_386_2_AVX512VBMI; 5048 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI) 5049 hwcap_flags_2 |= AV_386_2_AVX512_VNNI; 5050 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 5051 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ; 5052 5053 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW) 5054 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW; 5055 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS) 5056 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS; 5057 } 5058 } 5059 if (*ecx & CPUID_INTC_ECX_VMX) 5060 hwcap_flags |= AV_386_VMX; 5061 if (*ecx & CPUID_INTC_ECX_POPCNT) 5062 hwcap_flags |= AV_386_POPCNT; 5063 if (*edx & CPUID_INTC_EDX_FPU) 5064 hwcap_flags |= AV_386_FPU; 5065 if (*edx & CPUID_INTC_EDX_MMX) 5066 hwcap_flags |= AV_386_MMX; 5067 5068 if (*edx & CPUID_INTC_EDX_TSC) 5069 hwcap_flags |= AV_386_TSC; 5070 if (*edx & CPUID_INTC_EDX_CX8) 5071 hwcap_flags |= AV_386_CX8; 5072 if (*edx & CPUID_INTC_EDX_CMOV) 5073 hwcap_flags |= AV_386_CMOV; 5074 if (*ecx & CPUID_INTC_ECX_CX16) 5075 hwcap_flags |= AV_386_CX16; 5076 5077 if (*ecx & CPUID_INTC_ECX_RDRAND) 5078 hwcap_flags_2 |= AV_386_2_RDRAND; 5079 if (*ebx & CPUID_INTC_EBX_7_0_ADX) 5080 hwcap_flags_2 |= AV_386_2_ADX; 5081 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED) 5082 hwcap_flags_2 |= AV_386_2_RDSEED; 5083 if (*ebx & CPUID_INTC_EBX_7_0_SHA) 5084 hwcap_flags_2 |= AV_386_2_SHA; 5085 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 5086 hwcap_flags_2 |= 
AV_386_2_FSGSBASE;
5087 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5088 hwcap_flags_2 |= AV_386_2_CLWB;
5089 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5090 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5091
5092 }
5093 /*
5094 * Check a few miscellaneous features.
5095 */
5096 if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5097 hwcap_flags_2 |= AV_386_2_CLZERO;
5098
5099 if (cpi->cpi_xmaxeax < 0x80000001)
5100 goto pass4_done;
5101
5102 switch (cpi->cpi_vendor) {
5103 struct cpuid_regs cp;
5104 uint32_t *edx, *ecx;
5105
5106 case X86_VENDOR_Intel:
5107 /*
5108 * Seems like Intel just duplicated what was necessary
5109 * here to make the initial crop of 64-bit OSes work.
5110 * Hopefully, those are the only "extended" bits
5111 * they'll add.
5112 */
5113 /*FALLTHROUGH*/
5114
5115 case X86_VENDOR_AMD:
5116 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5117 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5118
5119 *edx = CPI_FEATURES_XTD_EDX(cpi);
5120 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5121
5122 /*
5123 * [these features require explicit kernel support]
5124 */
5125 switch (cpi->cpi_vendor) {
5126 case X86_VENDOR_Intel:
5127 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5128 *edx &= ~CPUID_AMD_EDX_TSCP;
5129 break;
5130
5131 case X86_VENDOR_AMD:
5132 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5133 *edx &= ~CPUID_AMD_EDX_TSCP;
5134 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5135 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5136 break;
5137
5138 default:
5139 break;
5140 }
5141
5142 /*
5143 * [no explicit support required beyond
5144 * x87 fp context and exception handlers]
5145 */
5146 if (!fpu_exists)
5147 *edx &= ~(CPUID_AMD_EDX_MMXamd |
5148 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5149
5150 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5151 *edx &= ~CPUID_AMD_EDX_NX;
5152 #if !defined(__amd64)
5153 *edx &= ~CPUID_AMD_EDX_LM;
5154 #endif
5155 /*
5156 * Now map the supported feature vector to
5157 * things that we think userland will care about.
5158 */
5159 #if defined(__amd64)
5160 if (*edx & CPUID_AMD_EDX_SYSC)
5161 hwcap_flags |= AV_386_AMD_SYSC;
5162 #endif
5163 if (*edx & CPUID_AMD_EDX_MMXamd)
5164 hwcap_flags |= AV_386_AMD_MMX;
5165 if (*edx & CPUID_AMD_EDX_3DNow)
5166 hwcap_flags |= AV_386_AMD_3DNow;
5167 if (*edx & CPUID_AMD_EDX_3DNowx)
5168 hwcap_flags |= AV_386_AMD_3DNowx;
5169 if (*ecx & CPUID_AMD_ECX_SVM)
5170 hwcap_flags |= AV_386_AMD_SVM;
5171
5172 switch (cpi->cpi_vendor) {
5173 case X86_VENDOR_AMD:
5174 if (*edx & CPUID_AMD_EDX_TSCP)
5175 hwcap_flags |= AV_386_TSCP;
5176 if (*ecx & CPUID_AMD_ECX_AHF64)
5177 hwcap_flags |= AV_386_AHF;
5178 if (*ecx & CPUID_AMD_ECX_SSE4A)
5179 hwcap_flags |= AV_386_AMD_SSE4A;
5180 if (*ecx & CPUID_AMD_ECX_LZCNT)
5181 hwcap_flags |= AV_386_AMD_LZCNT;
5182 if (*ecx & CPUID_AMD_ECX_MONITORX)
5183 hwcap_flags_2 |= AV_386_2_MONITORX;
5184 break;
5185
5186 case X86_VENDOR_Intel:
5187 if (*edx & CPUID_AMD_EDX_TSCP)
5188 hwcap_flags |= AV_386_TSCP;
5189 if (*ecx & CPUID_AMD_ECX_LZCNT)
5190 hwcap_flags |= AV_386_AMD_LZCNT;
5191 /*
5192 * Aarrgh.
5193 * Intel uses a different bit in the same word.
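 * (CPUID_AMD_ECX_AHF64 and CPUID_INTC_ECX_AHF64 name different bit
 * positions in the 0x80000001 %ecx word, hence the separate
 * vendor-specific check below.)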
5194 */ 5195 if (*ecx & CPUID_INTC_ECX_AHF64) 5196 hwcap_flags |= AV_386_AHF; 5197 break; 5198 5199 default: 5200 break; 5201 } 5202 break; 5203 5204 case X86_VENDOR_TM: 5205 cp.cp_eax = 0x80860001; 5206 (void) __cpuid_insn(&cp); 5207 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx; 5208 break; 5209 5210 default: 5211 break; 5212 } 5213 5214 pass4_done: 5215 cpi->cpi_pass = 4; 5216 if (hwcap_out != NULL) { 5217 hwcap_out[0] = hwcap_flags; 5218 hwcap_out[1] = hwcap_flags_2; 5219 } 5220 } 5221 5222 5223 /* 5224 * Simulate the cpuid instruction using the data we previously 5225 * captured about this CPU. We try our best to return the truth 5226 * about the hardware, independently of kernel support. 5227 */ 5228 uint32_t 5229 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp) 5230 { 5231 struct cpuid_info *cpi; 5232 struct cpuid_regs *xcp; 5233 5234 if (cpu == NULL) 5235 cpu = CPU; 5236 cpi = cpu->cpu_m.mcpu_cpi; 5237 5238 ASSERT(cpuid_checkpass(cpu, 3)); 5239 5240 /* 5241 * CPUID data is cached in two separate places: cpi_std for standard 5242 * CPUID leaves , and cpi_extd for extended CPUID leaves. 5243 */ 5244 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) { 5245 xcp = &cpi->cpi_std[cp->cp_eax]; 5246 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 && 5247 cp->cp_eax <= cpi->cpi_xmaxeax && 5248 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) { 5249 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0]; 5250 } else { 5251 /* 5252 * The caller is asking for data from an input parameter which 5253 * the kernel has not cached. In this case we go fetch from 5254 * the hardware and return the data directly to the user. 5255 */ 5256 return (__cpuid_insn(cp)); 5257 } 5258 5259 cp->cp_eax = xcp->cp_eax; 5260 cp->cp_ebx = xcp->cp_ebx; 5261 cp->cp_ecx = xcp->cp_ecx; 5262 cp->cp_edx = xcp->cp_edx; 5263 return (cp->cp_eax); 5264 } 5265 5266 int 5267 cpuid_checkpass(cpu_t *cpu, int pass) 5268 { 5269 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL && 5270 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass); 5271 } 5272 5273 int 5274 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n) 5275 { 5276 ASSERT(cpuid_checkpass(cpu, 3)); 5277 5278 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr)); 5279 } 5280 5281 int 5282 cpuid_is_cmt(cpu_t *cpu) 5283 { 5284 if (cpu == NULL) 5285 cpu = CPU; 5286 5287 ASSERT(cpuid_checkpass(cpu, 1)); 5288 5289 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0); 5290 } 5291 5292 /* 5293 * AMD and Intel both implement the 64-bit variant of the syscall 5294 * instruction (syscallq), so if there's -any- support for syscall, 5295 * cpuid currently says "yes, we support this". 5296 * 5297 * However, Intel decided to -not- implement the 32-bit variant of the 5298 * syscall instruction, so we provide a predicate to allow our caller 5299 * to test that subtlety here. 5300 * 5301 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor, 5302 * even in the case where the hardware would in fact support it. 5303 */ 5304 /*ARGSUSED*/ 5305 int 5306 cpuid_syscall32_insn(cpu_t *cpu) 5307 { 5308 ASSERT(cpuid_checkpass((cpu == NULL ? 
CPU : cpu), 1)); 5309 5310 #if !defined(__xpv) 5311 if (cpu == NULL) 5312 cpu = CPU; 5313 5314 /*CSTYLED*/ 5315 { 5316 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5317 5318 if (cpi->cpi_vendor == X86_VENDOR_AMD && 5319 cpi->cpi_xmaxeax >= 0x80000001 && 5320 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC)) 5321 return (1); 5322 } 5323 #endif 5324 return (0); 5325 } 5326 5327 int 5328 cpuid_getidstr(cpu_t *cpu, char *s, size_t n) 5329 { 5330 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5331 5332 static const char fmt[] = 5333 "x86 (%s %X family %d model %d step %d clock %d MHz)"; 5334 static const char fmt_ht[] = 5335 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)"; 5336 5337 ASSERT(cpuid_checkpass(cpu, 1)); 5338 5339 if (cpuid_is_cmt(cpu)) 5340 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid, 5341 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5342 cpi->cpi_family, cpi->cpi_model, 5343 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5344 return (snprintf(s, n, fmt, 5345 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5346 cpi->cpi_family, cpi->cpi_model, 5347 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5348 } 5349 5350 const char * 5351 cpuid_getvendorstr(cpu_t *cpu) 5352 { 5353 ASSERT(cpuid_checkpass(cpu, 1)); 5354 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr); 5355 } 5356 5357 uint_t 5358 cpuid_getvendor(cpu_t *cpu) 5359 { 5360 ASSERT(cpuid_checkpass(cpu, 1)); 5361 return (cpu->cpu_m.mcpu_cpi->cpi_vendor); 5362 } 5363 5364 uint_t 5365 cpuid_getfamily(cpu_t *cpu) 5366 { 5367 ASSERT(cpuid_checkpass(cpu, 1)); 5368 return (cpu->cpu_m.mcpu_cpi->cpi_family); 5369 } 5370 5371 uint_t 5372 cpuid_getmodel(cpu_t *cpu) 5373 { 5374 ASSERT(cpuid_checkpass(cpu, 1)); 5375 return (cpu->cpu_m.mcpu_cpi->cpi_model); 5376 } 5377 5378 uint_t 5379 cpuid_get_ncpu_per_chip(cpu_t *cpu) 5380 { 5381 ASSERT(cpuid_checkpass(cpu, 1)); 5382 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip); 5383 } 5384 5385 uint_t 5386 cpuid_get_ncore_per_chip(cpu_t *cpu) 5387 { 5388 ASSERT(cpuid_checkpass(cpu, 1)); 5389 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip); 5390 } 5391 5392 uint_t 5393 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu) 5394 { 5395 ASSERT(cpuid_checkpass(cpu, 2)); 5396 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache); 5397 } 5398 5399 id_t 5400 cpuid_get_last_lvl_cacheid(cpu_t *cpu) 5401 { 5402 ASSERT(cpuid_checkpass(cpu, 2)); 5403 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5404 } 5405 5406 uint_t 5407 cpuid_getstep(cpu_t *cpu) 5408 { 5409 ASSERT(cpuid_checkpass(cpu, 1)); 5410 return (cpu->cpu_m.mcpu_cpi->cpi_step); 5411 } 5412 5413 uint_t 5414 cpuid_getsig(struct cpu *cpu) 5415 { 5416 ASSERT(cpuid_checkpass(cpu, 1)); 5417 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax); 5418 } 5419 5420 uint32_t 5421 cpuid_getchiprev(struct cpu *cpu) 5422 { 5423 ASSERT(cpuid_checkpass(cpu, 1)); 5424 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev); 5425 } 5426 5427 const char * 5428 cpuid_getchiprevstr(struct cpu *cpu) 5429 { 5430 ASSERT(cpuid_checkpass(cpu, 1)); 5431 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr); 5432 } 5433 5434 uint32_t 5435 cpuid_getsockettype(struct cpu *cpu) 5436 { 5437 ASSERT(cpuid_checkpass(cpu, 1)); 5438 return (cpu->cpu_m.mcpu_cpi->cpi_socket); 5439 } 5440 5441 const char * 5442 cpuid_getsocketstr(cpu_t *cpu) 5443 { 5444 static const char *socketstr = NULL; 5445 struct cpuid_info *cpi; 5446 5447 ASSERT(cpuid_checkpass(cpu, 1)); 5448 cpi = cpu->cpu_m.mcpu_cpi; 5449 5450 /* Assume that socket types are the same across the system */ 5451 if (socketstr == NULL) 5452 socketstr 
= _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family, 5453 cpi->cpi_model, cpi->cpi_step); 5454 5455 5456 return (socketstr); 5457 } 5458 5459 int 5460 cpuid_get_chipid(cpu_t *cpu) 5461 { 5462 ASSERT(cpuid_checkpass(cpu, 1)); 5463 5464 if (cpuid_is_cmt(cpu)) 5465 return (cpu->cpu_m.mcpu_cpi->cpi_chipid); 5466 return (cpu->cpu_id); 5467 } 5468 5469 id_t 5470 cpuid_get_coreid(cpu_t *cpu) 5471 { 5472 ASSERT(cpuid_checkpass(cpu, 1)); 5473 return (cpu->cpu_m.mcpu_cpi->cpi_coreid); 5474 } 5475 5476 int 5477 cpuid_get_pkgcoreid(cpu_t *cpu) 5478 { 5479 ASSERT(cpuid_checkpass(cpu, 1)); 5480 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid); 5481 } 5482 5483 int 5484 cpuid_get_clogid(cpu_t *cpu) 5485 { 5486 ASSERT(cpuid_checkpass(cpu, 1)); 5487 return (cpu->cpu_m.mcpu_cpi->cpi_clogid); 5488 } 5489 5490 int 5491 cpuid_get_cacheid(cpu_t *cpu) 5492 { 5493 ASSERT(cpuid_checkpass(cpu, 1)); 5494 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5495 } 5496 5497 uint_t 5498 cpuid_get_procnodeid(cpu_t *cpu) 5499 { 5500 ASSERT(cpuid_checkpass(cpu, 1)); 5501 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid); 5502 } 5503 5504 uint_t 5505 cpuid_get_procnodes_per_pkg(cpu_t *cpu) 5506 { 5507 ASSERT(cpuid_checkpass(cpu, 1)); 5508 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg); 5509 } 5510 5511 uint_t 5512 cpuid_get_compunitid(cpu_t *cpu) 5513 { 5514 ASSERT(cpuid_checkpass(cpu, 1)); 5515 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid); 5516 } 5517 5518 uint_t 5519 cpuid_get_cores_per_compunit(cpu_t *cpu) 5520 { 5521 ASSERT(cpuid_checkpass(cpu, 1)); 5522 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit); 5523 } 5524 5525 /*ARGSUSED*/ 5526 int 5527 cpuid_have_cr8access(cpu_t *cpu) 5528 { 5529 #if defined(__amd64) 5530 return (1); 5531 #else 5532 struct cpuid_info *cpi; 5533 5534 ASSERT(cpu != NULL); 5535 cpi = cpu->cpu_m.mcpu_cpi; 5536 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 && 5537 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0) 5538 return (1); 5539 return (0); 5540 #endif 5541 } 5542 5543 uint32_t 5544 cpuid_get_apicid(cpu_t *cpu) 5545 { 5546 ASSERT(cpuid_checkpass(cpu, 1)); 5547 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) { 5548 return (UINT32_MAX); 5549 } else { 5550 return (cpu->cpu_m.mcpu_cpi->cpi_apicid); 5551 } 5552 } 5553 5554 void 5555 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits) 5556 { 5557 struct cpuid_info *cpi; 5558 5559 if (cpu == NULL) 5560 cpu = CPU; 5561 cpi = cpu->cpu_m.mcpu_cpi; 5562 5563 ASSERT(cpuid_checkpass(cpu, 1)); 5564 5565 if (pabits) 5566 *pabits = cpi->cpi_pabits; 5567 if (vabits) 5568 *vabits = cpi->cpi_vabits; 5569 } 5570 5571 size_t 5572 cpuid_get_xsave_size() 5573 { 5574 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size, 5575 sizeof (struct xsave_state))); 5576 } 5577 5578 /* 5579 * Return true if the CPUs on this system require 'pointer clearing' for the 5580 * floating point error pointer exception handling. In the past, this has been 5581 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to 5582 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO 5583 * feature bit and is reflected in the cpi_fp_amd_save member. 5584 */ 5585 boolean_t 5586 cpuid_need_fp_excp_handling() 5587 { 5588 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD && 5589 cpuid_info0.cpi_fp_amd_save != 0); 5590 } 5591 5592 /* 5593 * Returns the number of data TLB entries for a corresponding 5594 * pagesize. If it can't be computed, or isn't known, the 5595 * routine returns zero. 
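 * A minimal usage sketch (illustrative only):
 *
 *	uint_t nent = cpuid_get_dtlb_nent(CPU, 2 * 1024 * 1024);
 *	if (nent == 0)
 *		... no 2M d-TLB information is available ...
 *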
If you ask about an architecturally 5596 * impossible pagesize, the routine will panic (so that the 5597 * hat implementor knows that things are inconsistent.) 5598 */ 5599 uint_t 5600 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize) 5601 { 5602 struct cpuid_info *cpi; 5603 uint_t dtlb_nent = 0; 5604 5605 if (cpu == NULL) 5606 cpu = CPU; 5607 cpi = cpu->cpu_m.mcpu_cpi; 5608 5609 ASSERT(cpuid_checkpass(cpu, 1)); 5610 5611 /* 5612 * Check the L2 TLB info 5613 */ 5614 if (cpi->cpi_xmaxeax >= 0x80000006) { 5615 struct cpuid_regs *cp = &cpi->cpi_extd[6]; 5616 5617 switch (pagesize) { 5618 5619 case 4 * 1024: 5620 /* 5621 * All zero in the top 16 bits of the register 5622 * indicates a unified TLB. Size is in low 16 bits. 5623 */ 5624 if ((cp->cp_ebx & 0xffff0000) == 0) 5625 dtlb_nent = cp->cp_ebx & 0x0000ffff; 5626 else 5627 dtlb_nent = BITX(cp->cp_ebx, 27, 16); 5628 break; 5629 5630 case 2 * 1024 * 1024: 5631 if ((cp->cp_eax & 0xffff0000) == 0) 5632 dtlb_nent = cp->cp_eax & 0x0000ffff; 5633 else 5634 dtlb_nent = BITX(cp->cp_eax, 27, 16); 5635 break; 5636 5637 default: 5638 panic("unknown L2 pagesize"); 5639 /*NOTREACHED*/ 5640 } 5641 } 5642 5643 if (dtlb_nent != 0) 5644 return (dtlb_nent); 5645 5646 /* 5647 * No L2 TLB support for this size, try L1. 5648 */ 5649 if (cpi->cpi_xmaxeax >= 0x80000005) { 5650 struct cpuid_regs *cp = &cpi->cpi_extd[5]; 5651 5652 switch (pagesize) { 5653 case 4 * 1024: 5654 dtlb_nent = BITX(cp->cp_ebx, 23, 16); 5655 break; 5656 case 2 * 1024 * 1024: 5657 dtlb_nent = BITX(cp->cp_eax, 23, 16); 5658 break; 5659 default: 5660 panic("unknown L1 d-TLB pagesize"); 5661 /*NOTREACHED*/ 5662 } 5663 } 5664 5665 return (dtlb_nent); 5666 } 5667 5668 /* 5669 * Return 0 if the erratum is not present or not applicable, positive 5670 * if it is, and negative if the status of the erratum is unknown. 5671 * 5672 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm) 5673 * Processors" #25759, Rev 3.57, August 2005 5674 */ 5675 int 5676 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum) 5677 { 5678 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5679 uint_t eax; 5680 5681 /* 5682 * Bail out if this CPU isn't an AMD CPU, or if it's 5683 * a legacy (32-bit) AMD CPU. 
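 * Families 4, 5 and 6 are the older 32-bit parts; the errata handled
 * below only apply to the 64-bit (K8 and later) families, so those
 * CPUs are treated as unaffected.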
5684 */ 5685 if (cpi->cpi_vendor != X86_VENDOR_AMD || 5686 cpi->cpi_family == 4 || cpi->cpi_family == 5 || 5687 cpi->cpi_family == 6) { 5688 return (0); 5689 } 5690 5691 eax = cpi->cpi_std[1].cp_eax; 5692 5693 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50) 5694 #define SH_B3(eax) (eax == 0xf51) 5695 #define B(eax) (SH_B0(eax) || SH_B3(eax)) 5696 5697 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58) 5698 5699 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a) 5700 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0) 5701 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2) 5702 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax)) 5703 5704 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70) 5705 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0) 5706 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0) 5707 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax)) 5708 5709 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70) 5710 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */ 5711 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0) 5712 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71) 5713 #define BH_E4(eax) (eax == 0x20fb1) 5714 #define SH_E5(eax) (eax == 0x20f42) 5715 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2) 5716 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32) 5717 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \ 5718 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \ 5719 DH_E6(eax) || JH_E6(eax)) 5720 5721 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02) 5722 #define DR_B0(eax) (eax == 0x100f20) 5723 #define DR_B1(eax) (eax == 0x100f21) 5724 #define DR_BA(eax) (eax == 0x100f2a) 5725 #define DR_B2(eax) (eax == 0x100f22) 5726 #define DR_B3(eax) (eax == 0x100f23) 5727 #define RB_C0(eax) (eax == 0x100f40) 5728 5729 switch (erratum) { 5730 case 1: 5731 return (cpi->cpi_family < 0x10); 5732 case 51: /* what does the asterisk mean? 
*/ 5733 return (B(eax) || SH_C0(eax) || CG(eax)); 5734 case 52: 5735 return (B(eax)); 5736 case 57: 5737 return (cpi->cpi_family <= 0x11); 5738 case 58: 5739 return (B(eax)); 5740 case 60: 5741 return (cpi->cpi_family <= 0x11); 5742 case 61: 5743 case 62: 5744 case 63: 5745 case 64: 5746 case 65: 5747 case 66: 5748 case 68: 5749 case 69: 5750 case 70: 5751 case 71: 5752 return (B(eax)); 5753 case 72: 5754 return (SH_B0(eax)); 5755 case 74: 5756 return (B(eax)); 5757 case 75: 5758 return (cpi->cpi_family < 0x10); 5759 case 76: 5760 return (B(eax)); 5761 case 77: 5762 return (cpi->cpi_family <= 0x11); 5763 case 78: 5764 return (B(eax) || SH_C0(eax)); 5765 case 79: 5766 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 5767 case 80: 5768 case 81: 5769 case 82: 5770 return (B(eax)); 5771 case 83: 5772 return (B(eax) || SH_C0(eax) || CG(eax)); 5773 case 85: 5774 return (cpi->cpi_family < 0x10); 5775 case 86: 5776 return (SH_C0(eax) || CG(eax)); 5777 case 88: 5778 #if !defined(__amd64) 5779 return (0); 5780 #else 5781 return (B(eax) || SH_C0(eax)); 5782 #endif 5783 case 89: 5784 return (cpi->cpi_family < 0x10); 5785 case 90: 5786 return (B(eax) || SH_C0(eax) || CG(eax)); 5787 case 91: 5788 case 92: 5789 return (B(eax) || SH_C0(eax)); 5790 case 93: 5791 return (SH_C0(eax)); 5792 case 94: 5793 return (B(eax) || SH_C0(eax) || CG(eax)); 5794 case 95: 5795 #if !defined(__amd64) 5796 return (0); 5797 #else 5798 return (B(eax) || SH_C0(eax)); 5799 #endif 5800 case 96: 5801 return (B(eax) || SH_C0(eax) || CG(eax)); 5802 case 97: 5803 case 98: 5804 return (SH_C0(eax) || CG(eax)); 5805 case 99: 5806 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5807 case 100: 5808 return (B(eax) || SH_C0(eax)); 5809 case 101: 5810 case 103: 5811 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5812 case 104: 5813 return (SH_C0(eax) || CG(eax) || D0(eax)); 5814 case 105: 5815 case 106: 5816 case 107: 5817 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5818 case 108: 5819 return (DH_CG(eax)); 5820 case 109: 5821 return (SH_C0(eax) || CG(eax) || D0(eax)); 5822 case 110: 5823 return (D0(eax) || EX(eax)); 5824 case 111: 5825 return (CG(eax)); 5826 case 112: 5827 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 5828 case 113: 5829 return (eax == 0x20fc0); 5830 case 114: 5831 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 5832 case 115: 5833 return (SH_E0(eax) || JH_E1(eax)); 5834 case 116: 5835 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 5836 case 117: 5837 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5838 case 118: 5839 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) || 5840 JH_E6(eax)); 5841 case 121: 5842 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 5843 case 122: 5844 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11); 5845 case 123: 5846 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax)); 5847 case 131: 5848 return (cpi->cpi_family < 0x10); 5849 case 6336786: 5850 5851 /* 5852 * Test for AdvPowerMgmtInfo.TscPStateInvariant 5853 * if this is a K8 family or newer processor. We're testing for 5854 * this 'erratum' to determine whether or not we have a constant 5855 * TSC. 5856 * 5857 * Our current fix for this is to disable the C1-Clock ramping. 5858 * However, this doesn't work on newer processor families nor 5859 * does it work when virtualized as those devices don't exist. 
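 * The check below reads CPUID Fn8000_0007 and tests %edx bit 8 (the
 * TscPStateInvariant bit, mask 0x100); the erratum is reported as
 * present only when that bit is clear.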
5860 */
5861 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5862 return (0);
5863 }
5864
5865 if (CPI_FAMILY(cpi) == 0xf) {
5866 struct cpuid_regs regs;
5867 regs.cp_eax = 0x80000007;
5868 (void) __cpuid_insn(&regs);
5869 return (!(regs.cp_edx & 0x100));
5870 }
5871 return (0);
5872 case 6323525:
5873 /*
5874 * This erratum (K8 #147) is not present on family 10 and newer.
5875 */
5876 if (cpi->cpi_family >= 0x10) {
5877 return (0);
5878 }
5879 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5880 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5881
5882 case 6671130:
5883 /*
5884 * check for processors (pre-Shanghai) that do not provide
5885 * optimal management of 1gb ptes in their tlb.
5886 */
5887 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5888
5889 case 298:
5890 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5891 DR_B2(eax) || RB_C0(eax));
5892
5893 case 721:
5894 #if defined(__amd64)
5895 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5896 #else
5897 return (0);
5898 #endif
5899
5900 default:
5901 return (-1);
5902
5903 }
5904 }
5905
5906 /*
5907 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5908 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5909 */
5910 int
5911 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5912 {
5913 struct cpuid_info *cpi;
5914 uint_t osvwid;
5915 static int osvwfeature = -1;
5916 uint64_t osvwlength;
5917
5918
5919 cpi = cpu->cpu_m.mcpu_cpi;
5920
5921 /* confirm OSVW supported */
5922 if (osvwfeature == -1) {
5923 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5924 } else {
5925 /* assert that osvw feature setting is consistent on all cpus */
5926 ASSERT(osvwfeature ==
5927 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5928 }
5929 if (!osvwfeature)
5930 return (-1);
5931
5932 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5933
5934 switch (erratum) {
5935 case 298: /* osvwid is 0 */
5936 osvwid = 0;
5937 if (osvwlength <= (uint64_t)osvwid) {
5938 /* osvwid 0 is unknown */
5939 return (-1);
5940 }
5941
5942 /*
5943 * Check the OSVW STATUS MSR to determine the state
5944 * of the erratum where:
5945 * 0 - fixed by HW
5946 * 1 - BIOS has applied the workaround when BIOS
5947 * workaround is available. (Or for other errata,
5948 * OS workaround is required.)
5949 * For a value of 1, caller will confirm that the
5950 * erratum 298 workaround has indeed been applied by BIOS.
5951 *
5952 * A 1 may be set in cpus that have a HW fix
5953 * in a mixed cpu system. Regarding erratum 298:
5954 * In a multiprocessor platform, the workaround above
5955 * should be applied to all processors regardless of
5956 * silicon revision when an affected processor is
5957 * present.
5958 */
5959
5960 return (rdmsr(MSR_AMD_OSVW_STATUS +
5961 (osvwid / OSVW_ID_CNT_PER_MSR)) &
5962 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5963
5964 default:
5965 return (-1);
5966 }
5967 }
5968
5969 static const char assoc_str[] = "associativity";
5970 static const char line_str[] = "line-size";
5971 static const char size_str[] = "size";
5972
5973 static void
5974 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5975 uint32_t val)
5976 {
5977 char buf[128];
5978
5979 /*
5980 * ndi_prop_update_int() is used because it is desirable for
5981 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
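 * The resulting property name is simply "<label>-<type>"; for example, a
 * label of "l2-cache" and a type of "size" yield an "l2-cache-size"
 * property on the node.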
5982 */ 5983 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf)) 5984 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val); 5985 } 5986 5987 /* 5988 * Intel-style cache/tlb description 5989 * 5990 * Standard cpuid level 2 gives a randomly ordered 5991 * selection of tags that index into a table that describes 5992 * cache and tlb properties. 5993 */ 5994 5995 static const char l1_icache_str[] = "l1-icache"; 5996 static const char l1_dcache_str[] = "l1-dcache"; 5997 static const char l2_cache_str[] = "l2-cache"; 5998 static const char l3_cache_str[] = "l3-cache"; 5999 static const char itlb4k_str[] = "itlb-4K"; 6000 static const char dtlb4k_str[] = "dtlb-4K"; 6001 static const char itlb2M_str[] = "itlb-2M"; 6002 static const char itlb4M_str[] = "itlb-4M"; 6003 static const char dtlb4M_str[] = "dtlb-4M"; 6004 static const char dtlb24_str[] = "dtlb0-2M-4M"; 6005 static const char itlb424_str[] = "itlb-4K-2M-4M"; 6006 static const char itlb24_str[] = "itlb-2M-4M"; 6007 static const char dtlb44_str[] = "dtlb-4K-4M"; 6008 static const char sl1_dcache_str[] = "sectored-l1-dcache"; 6009 static const char sl2_cache_str[] = "sectored-l2-cache"; 6010 static const char itrace_str[] = "itrace-cache"; 6011 static const char sl3_cache_str[] = "sectored-l3-cache"; 6012 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k"; 6013 6014 static const struct cachetab { 6015 uint8_t ct_code; 6016 uint8_t ct_assoc; 6017 uint16_t ct_line_size; 6018 size_t ct_size; 6019 const char *ct_label; 6020 } intel_ctab[] = { 6021 /* 6022 * maintain descending order! 6023 * 6024 * Codes ignored - Reason 6025 * ---------------------- 6026 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache 6027 * f0H/f1H - Currently we do not interpret prefetch size by design 6028 */ 6029 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str}, 6030 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str}, 6031 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str}, 6032 { 0xde, 12, 64, 6*1024*1024, l3_cache_str}, 6033 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str}, 6034 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str}, 6035 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str}, 6036 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str}, 6037 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str}, 6038 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str}, 6039 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str}, 6040 { 0xd0, 4, 64, 512*1024, l3_cache_str}, 6041 { 0xca, 4, 0, 512, sh_l2_tlb4k_str}, 6042 { 0xc0, 4, 0, 8, dtlb44_str }, 6043 { 0xba, 4, 0, 64, dtlb4k_str }, 6044 { 0xb4, 4, 0, 256, dtlb4k_str }, 6045 { 0xb3, 4, 0, 128, dtlb4k_str }, 6046 { 0xb2, 4, 0, 64, itlb4k_str }, 6047 { 0xb0, 4, 0, 128, itlb4k_str }, 6048 { 0x87, 8, 64, 1024*1024, l2_cache_str}, 6049 { 0x86, 4, 64, 512*1024, l2_cache_str}, 6050 { 0x85, 8, 32, 2*1024*1024, l2_cache_str}, 6051 { 0x84, 8, 32, 1024*1024, l2_cache_str}, 6052 { 0x83, 8, 32, 512*1024, l2_cache_str}, 6053 { 0x82, 8, 32, 256*1024, l2_cache_str}, 6054 { 0x80, 8, 64, 512*1024, l2_cache_str}, 6055 { 0x7f, 2, 64, 512*1024, l2_cache_str}, 6056 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str}, 6057 { 0x7c, 8, 64, 1024*1024, sl2_cache_str}, 6058 { 0x7b, 8, 64, 512*1024, sl2_cache_str}, 6059 { 0x7a, 8, 64, 256*1024, sl2_cache_str}, 6060 { 0x79, 8, 64, 128*1024, sl2_cache_str}, 6061 { 0x78, 8, 64, 1024*1024, l2_cache_str}, 6062 { 0x73, 8, 0, 64*1024, itrace_str}, 6063 { 0x72, 8, 0, 32*1024, itrace_str}, 6064 { 0x71, 8, 0, 16*1024, itrace_str}, 6065 { 0x70, 8, 0, 12*1024, itrace_str}, 6066 { 0x68, 4, 64, 32*1024, sl1_dcache_str}, 6067 { 0x67, 4, 64, 16*1024, 
sl1_dcache_str}, 6068 { 0x66, 4, 64, 8*1024, sl1_dcache_str}, 6069 { 0x60, 8, 64, 16*1024, sl1_dcache_str}, 6070 { 0x5d, 0, 0, 256, dtlb44_str}, 6071 { 0x5c, 0, 0, 128, dtlb44_str}, 6072 { 0x5b, 0, 0, 64, dtlb44_str}, 6073 { 0x5a, 4, 0, 32, dtlb24_str}, 6074 { 0x59, 0, 0, 16, dtlb4k_str}, 6075 { 0x57, 4, 0, 16, dtlb4k_str}, 6076 { 0x56, 4, 0, 16, dtlb4M_str}, 6077 { 0x55, 0, 0, 7, itlb24_str}, 6078 { 0x52, 0, 0, 256, itlb424_str}, 6079 { 0x51, 0, 0, 128, itlb424_str}, 6080 { 0x50, 0, 0, 64, itlb424_str}, 6081 { 0x4f, 0, 0, 32, itlb4k_str}, 6082 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str}, 6083 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str}, 6084 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str}, 6085 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str}, 6086 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str}, 6087 { 0x49, 16, 64, 4*1024*1024, l3_cache_str}, 6088 { 0x48, 12, 64, 3*1024*1024, l2_cache_str}, 6089 { 0x47, 8, 64, 8*1024*1024, l3_cache_str}, 6090 { 0x46, 4, 64, 4*1024*1024, l3_cache_str}, 6091 { 0x45, 4, 32, 2*1024*1024, l2_cache_str}, 6092 { 0x44, 4, 32, 1024*1024, l2_cache_str}, 6093 { 0x43, 4, 32, 512*1024, l2_cache_str}, 6094 { 0x42, 4, 32, 256*1024, l2_cache_str}, 6095 { 0x41, 4, 32, 128*1024, l2_cache_str}, 6096 { 0x3e, 4, 64, 512*1024, sl2_cache_str}, 6097 { 0x3d, 6, 64, 384*1024, sl2_cache_str}, 6098 { 0x3c, 4, 64, 256*1024, sl2_cache_str}, 6099 { 0x3b, 2, 64, 128*1024, sl2_cache_str}, 6100 { 0x3a, 6, 64, 192*1024, sl2_cache_str}, 6101 { 0x39, 4, 64, 128*1024, sl2_cache_str}, 6102 { 0x30, 8, 64, 32*1024, l1_icache_str}, 6103 { 0x2c, 8, 64, 32*1024, l1_dcache_str}, 6104 { 0x29, 8, 64, 4096*1024, sl3_cache_str}, 6105 { 0x25, 8, 64, 2048*1024, sl3_cache_str}, 6106 { 0x23, 8, 64, 1024*1024, sl3_cache_str}, 6107 { 0x22, 4, 64, 512*1024, sl3_cache_str}, 6108 { 0x0e, 6, 64, 24*1024, l1_dcache_str}, 6109 { 0x0d, 4, 32, 16*1024, l1_dcache_str}, 6110 { 0x0c, 4, 32, 16*1024, l1_dcache_str}, 6111 { 0x0b, 4, 0, 4, itlb4M_str}, 6112 { 0x0a, 2, 32, 8*1024, l1_dcache_str}, 6113 { 0x08, 4, 32, 16*1024, l1_icache_str}, 6114 { 0x06, 4, 32, 8*1024, l1_icache_str}, 6115 { 0x05, 4, 0, 32, dtlb4M_str}, 6116 { 0x04, 4, 0, 8, dtlb4M_str}, 6117 { 0x03, 4, 0, 64, dtlb4k_str}, 6118 { 0x02, 4, 0, 2, itlb4M_str}, 6119 { 0x01, 4, 0, 32, itlb4k_str}, 6120 { 0 } 6121 }; 6122 6123 static const struct cachetab cyrix_ctab[] = { 6124 { 0x70, 4, 0, 32, "tlb-4K" }, 6125 { 0x80, 4, 16, 16*1024, "l1-cache" }, 6126 { 0 } 6127 }; 6128 6129 /* 6130 * Search a cache table for a matching entry 6131 */ 6132 static const struct cachetab * 6133 find_cacheent(const struct cachetab *ct, uint_t code) 6134 { 6135 if (code != 0) { 6136 for (; ct->ct_code != 0; ct++) 6137 if (ct->ct_code <= code) 6138 break; 6139 if (ct->ct_code == code) 6140 return (ct); 6141 } 6142 return (NULL); 6143 } 6144 6145 /* 6146 * Populate cachetab entry with L2 or L3 cache-information using 6147 * cpuid function 4. This function is called from intel_walk_cacheinfo() 6148 * when descriptor 0x49 is encountered. It returns 0 if no such cache 6149 * information is found. 
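 * The size computed below follows the deterministic cache parameters
 * formula:
 *
 *	size = ways * partitions * line size * sets
 *
 * where ways, partitions and line size are each reported minus one and
 * %ecx holds (sets - 1); e.g. 16 ways * 1 partition * 64-byte lines *
 * 8192 sets = 8MB.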
6150 */ 6151 static int 6152 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi) 6153 { 6154 uint32_t level, i; 6155 int ret = 0; 6156 6157 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) { 6158 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]); 6159 6160 if (level == 2 || level == 3) { 6161 ct->ct_assoc = 6162 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1; 6163 ct->ct_line_size = 6164 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1; 6165 ct->ct_size = ct->ct_assoc * 6166 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) * 6167 ct->ct_line_size * 6168 (cpi->cpi_cache_leaves[i]->cp_ecx + 1); 6169 6170 if (level == 2) { 6171 ct->ct_label = l2_cache_str; 6172 } else if (level == 3) { 6173 ct->ct_label = l3_cache_str; 6174 } 6175 ret = 1; 6176 } 6177 } 6178 6179 return (ret); 6180 } 6181 6182 /* 6183 * Walk the cacheinfo descriptor, applying 'func' to every valid element 6184 * The walk is terminated if the walker returns non-zero. 6185 */ 6186 static void 6187 intel_walk_cacheinfo(struct cpuid_info *cpi, 6188 void *arg, int (*func)(void *, const struct cachetab *)) 6189 { 6190 const struct cachetab *ct; 6191 struct cachetab des_49_ct, des_b1_ct; 6192 uint8_t *dp; 6193 int i; 6194 6195 if ((dp = cpi->cpi_cacheinfo) == NULL) 6196 return; 6197 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6198 /* 6199 * For overloaded descriptor 0x49 we use cpuid function 4 6200 * if supported by the current processor, to create 6201 * cache information. 6202 * For overloaded descriptor 0xb1 we use X86_PAE flag 6203 * to disambiguate the cache information. 6204 */ 6205 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 && 6206 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) { 6207 ct = &des_49_ct; 6208 } else if (*dp == 0xb1) { 6209 des_b1_ct.ct_code = 0xb1; 6210 des_b1_ct.ct_assoc = 4; 6211 des_b1_ct.ct_line_size = 0; 6212 if (is_x86_feature(x86_featureset, X86FSET_PAE)) { 6213 des_b1_ct.ct_size = 8; 6214 des_b1_ct.ct_label = itlb2M_str; 6215 } else { 6216 des_b1_ct.ct_size = 4; 6217 des_b1_ct.ct_label = itlb4M_str; 6218 } 6219 ct = &des_b1_ct; 6220 } else { 6221 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) { 6222 continue; 6223 } 6224 } 6225 6226 if (func(arg, ct) != 0) { 6227 break; 6228 } 6229 } 6230 } 6231 6232 /* 6233 * (Like the Intel one, except for Cyrix CPUs) 6234 */ 6235 static void 6236 cyrix_walk_cacheinfo(struct cpuid_info *cpi, 6237 void *arg, int (*func)(void *, const struct cachetab *)) 6238 { 6239 const struct cachetab *ct; 6240 uint8_t *dp; 6241 int i; 6242 6243 if ((dp = cpi->cpi_cacheinfo) == NULL) 6244 return; 6245 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6246 /* 6247 * Search Cyrix-specific descriptor table first .. 6248 */ 6249 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) { 6250 if (func(arg, ct) != 0) 6251 break; 6252 continue; 6253 } 6254 /* 6255 * .. else fall back to the Intel one 6256 */ 6257 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) { 6258 if (func(arg, ct) != 0) 6259 break; 6260 continue; 6261 } 6262 } 6263 } 6264 6265 /* 6266 * A cacheinfo walker that adds associativity, line-size, and size properties 6267 * to the devinfo node it is passed as an argument. 
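 * For example, descriptor 0x2c from the table above becomes the
 * properties "l1-dcache-associativity" = 8, "l1-dcache-line-size" = 64
 * and "l1-dcache-size" = 32768 on that node.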
6268 */ 6269 static int 6270 add_cacheent_props(void *arg, const struct cachetab *ct) 6271 { 6272 dev_info_t *devi = arg; 6273 6274 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc); 6275 if (ct->ct_line_size != 0) 6276 add_cache_prop(devi, ct->ct_label, line_str, 6277 ct->ct_line_size); 6278 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size); 6279 return (0); 6280 } 6281 6282 6283 static const char fully_assoc[] = "fully-associative?"; 6284 6285 /* 6286 * AMD style cache/tlb description 6287 * 6288 * Extended functions 5 and 6 directly describe properties of 6289 * tlbs and various cache levels. 6290 */ 6291 static void 6292 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6293 { 6294 switch (assoc) { 6295 case 0: /* reserved; ignore */ 6296 break; 6297 default: 6298 add_cache_prop(devi, label, assoc_str, assoc); 6299 break; 6300 case 0xff: 6301 add_cache_prop(devi, label, fully_assoc, 1); 6302 break; 6303 } 6304 } 6305 6306 static void 6307 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6308 { 6309 if (size == 0) 6310 return; 6311 add_cache_prop(devi, label, size_str, size); 6312 add_amd_assoc(devi, label, assoc); 6313 } 6314 6315 static void 6316 add_amd_cache(dev_info_t *devi, const char *label, 6317 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6318 { 6319 if (size == 0 || line_size == 0) 6320 return; 6321 add_amd_assoc(devi, label, assoc); 6322 /* 6323 * Most AMD parts have a sectored cache. Multiple cache lines are 6324 * associated with each tag. A sector consists of all cache lines 6325 * associated with a tag. For example, the AMD K6-III has a sector 6326 * size of 2 cache lines per tag. 6327 */ 6328 if (lines_per_tag != 0) 6329 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6330 add_cache_prop(devi, label, line_str, line_size); 6331 add_cache_prop(devi, label, size_str, size * 1024); 6332 } 6333 6334 static void 6335 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6336 { 6337 switch (assoc) { 6338 case 0: /* off */ 6339 break; 6340 case 1: 6341 case 2: 6342 case 4: 6343 add_cache_prop(devi, label, assoc_str, assoc); 6344 break; 6345 case 6: 6346 add_cache_prop(devi, label, assoc_str, 8); 6347 break; 6348 case 8: 6349 add_cache_prop(devi, label, assoc_str, 16); 6350 break; 6351 case 0xf: 6352 add_cache_prop(devi, label, fully_assoc, 1); 6353 break; 6354 default: /* reserved; ignore */ 6355 break; 6356 } 6357 } 6358 6359 static void 6360 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6361 { 6362 if (size == 0 || assoc == 0) 6363 return; 6364 add_amd_l2_assoc(devi, label, assoc); 6365 add_cache_prop(devi, label, size_str, size); 6366 } 6367 6368 static void 6369 add_amd_l2_cache(dev_info_t *devi, const char *label, 6370 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6371 { 6372 if (size == 0 || assoc == 0 || line_size == 0) 6373 return; 6374 add_amd_l2_assoc(devi, label, assoc); 6375 if (lines_per_tag != 0) 6376 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6377 add_cache_prop(devi, label, line_str, line_size); 6378 add_cache_prop(devi, label, size_str, size * 1024); 6379 } 6380 6381 static void 6382 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi) 6383 { 6384 struct cpuid_regs *cp; 6385 6386 if (cpi->cpi_xmaxeax < 0x80000005) 6387 return; 6388 cp = &cpi->cpi_extd[5]; 6389 6390 /* 6391 * 4M/2M L1 TLB configuration 6392 * 6393 * We report the size for 2M pages because AMD uses two 6394 * TLB 
entries for one 4M page. 6395 */ 6396 add_amd_tlb(devi, "dtlb-2M", 6397 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16)); 6398 add_amd_tlb(devi, "itlb-2M", 6399 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0)); 6400 6401 /* 6402 * 4K L1 TLB configuration 6403 */ 6404 6405 switch (cpi->cpi_vendor) { 6406 uint_t nentries; 6407 case X86_VENDOR_TM: 6408 if (cpi->cpi_family >= 5) { 6409 /* 6410 * Crusoe processors have 256 TLB entries, but 6411 * cpuid data format constrains them to only 6412 * reporting 255 of them. 6413 */ 6414 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255) 6415 nentries = 256; 6416 /* 6417 * Crusoe processors also have a unified TLB 6418 */ 6419 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24), 6420 nentries); 6421 break; 6422 } 6423 /*FALLTHROUGH*/ 6424 default: 6425 add_amd_tlb(devi, itlb4k_str, 6426 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16)); 6427 add_amd_tlb(devi, dtlb4k_str, 6428 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0)); 6429 break; 6430 } 6431 6432 /* 6433 * data L1 cache configuration 6434 */ 6435 6436 add_amd_cache(devi, l1_dcache_str, 6437 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16), 6438 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0)); 6439 6440 /* 6441 * code L1 cache configuration 6442 */ 6443 6444 add_amd_cache(devi, l1_icache_str, 6445 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16), 6446 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0)); 6447 6448 if (cpi->cpi_xmaxeax < 0x80000006) 6449 return; 6450 cp = &cpi->cpi_extd[6]; 6451 6452 /* Check for a unified L2 TLB for large pages */ 6453 6454 if (BITX(cp->cp_eax, 31, 16) == 0) 6455 add_amd_l2_tlb(devi, "l2-tlb-2M", 6456 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6457 else { 6458 add_amd_l2_tlb(devi, "l2-dtlb-2M", 6459 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6460 add_amd_l2_tlb(devi, "l2-itlb-2M", 6461 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6462 } 6463 6464 /* Check for a unified L2 TLB for 4K pages */ 6465 6466 if (BITX(cp->cp_ebx, 31, 16) == 0) { 6467 add_amd_l2_tlb(devi, "l2-tlb-4K", 6468 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6469 } else { 6470 add_amd_l2_tlb(devi, "l2-dtlb-4K", 6471 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6472 add_amd_l2_tlb(devi, "l2-itlb-4K", 6473 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6474 } 6475 6476 add_amd_l2_cache(devi, l2_cache_str, 6477 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12), 6478 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0)); 6479 } 6480 6481 /* 6482 * There are two basic ways that the x86 world describes it cache 6483 * and tlb architecture - Intel's way and AMD's way. 6484 * 6485 * Return which flavor of cache architecture we should use 6486 */ 6487 static int 6488 x86_which_cacheinfo(struct cpuid_info *cpi) 6489 { 6490 switch (cpi->cpi_vendor) { 6491 case X86_VENDOR_Intel: 6492 if (cpi->cpi_maxeax >= 2) 6493 return (X86_VENDOR_Intel); 6494 break; 6495 case X86_VENDOR_AMD: 6496 /* 6497 * The K5 model 1 was the first part from AMD that reported 6498 * cache sizes via extended cpuid functions. 6499 */ 6500 if (cpi->cpi_family > 5 || 6501 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 6502 return (X86_VENDOR_AMD); 6503 break; 6504 case X86_VENDOR_TM: 6505 if (cpi->cpi_family >= 5) 6506 return (X86_VENDOR_AMD); 6507 /*FALLTHROUGH*/ 6508 default: 6509 /* 6510 * If they have extended CPU data for 0x80000005 6511 * then we assume they have AMD-format cache 6512 * information. 
6513 * 6514 * If not, and the vendor happens to be Cyrix, 6515 * then try our-Cyrix specific handler. 6516 * 6517 * If we're not Cyrix, then assume we're using Intel's 6518 * table-driven format instead. 6519 */ 6520 if (cpi->cpi_xmaxeax >= 0x80000005) 6521 return (X86_VENDOR_AMD); 6522 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix) 6523 return (X86_VENDOR_Cyrix); 6524 else if (cpi->cpi_maxeax >= 2) 6525 return (X86_VENDOR_Intel); 6526 break; 6527 } 6528 return (-1); 6529 } 6530 6531 void 6532 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id, 6533 struct cpuid_info *cpi) 6534 { 6535 dev_info_t *cpu_devi; 6536 int create; 6537 6538 cpu_devi = (dev_info_t *)dip; 6539 6540 /* device_type */ 6541 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6542 "device_type", "cpu"); 6543 6544 /* reg */ 6545 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6546 "reg", cpu_id); 6547 6548 /* cpu-mhz, and clock-frequency */ 6549 if (cpu_freq > 0) { 6550 long long mul; 6551 6552 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6553 "cpu-mhz", cpu_freq); 6554 if ((mul = cpu_freq * 1000000LL) <= INT_MAX) 6555 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6556 "clock-frequency", (int)mul); 6557 } 6558 6559 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) { 6560 return; 6561 } 6562 6563 /* vendor-id */ 6564 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6565 "vendor-id", cpi->cpi_vendorstr); 6566 6567 if (cpi->cpi_maxeax == 0) { 6568 return; 6569 } 6570 6571 /* 6572 * family, model, and step 6573 */ 6574 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6575 "family", CPI_FAMILY(cpi)); 6576 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6577 "cpu-model", CPI_MODEL(cpi)); 6578 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6579 "stepping-id", CPI_STEP(cpi)); 6580 6581 /* type */ 6582 switch (cpi->cpi_vendor) { 6583 case X86_VENDOR_Intel: 6584 create = 1; 6585 break; 6586 default: 6587 create = 0; 6588 break; 6589 } 6590 if (create) 6591 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6592 "type", CPI_TYPE(cpi)); 6593 6594 /* ext-family */ 6595 switch (cpi->cpi_vendor) { 6596 case X86_VENDOR_Intel: 6597 case X86_VENDOR_AMD: 6598 create = cpi->cpi_family >= 0xf; 6599 break; 6600 default: 6601 create = 0; 6602 break; 6603 } 6604 if (create) 6605 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6606 "ext-family", CPI_FAMILY_XTD(cpi)); 6607 6608 /* ext-model */ 6609 switch (cpi->cpi_vendor) { 6610 case X86_VENDOR_Intel: 6611 create = IS_EXTENDED_MODEL_INTEL(cpi); 6612 break; 6613 case X86_VENDOR_AMD: 6614 create = CPI_FAMILY(cpi) == 0xf; 6615 break; 6616 default: 6617 create = 0; 6618 break; 6619 } 6620 if (create) 6621 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6622 "ext-model", CPI_MODEL_XTD(cpi)); 6623 6624 /* generation */ 6625 switch (cpi->cpi_vendor) { 6626 case X86_VENDOR_AMD: 6627 /* 6628 * AMD K5 model 1 was the first part to support this 6629 */ 6630 create = cpi->cpi_xmaxeax >= 0x80000001; 6631 break; 6632 default: 6633 create = 0; 6634 break; 6635 } 6636 if (create) 6637 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6638 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8)); 6639 6640 /* brand-id */ 6641 switch (cpi->cpi_vendor) { 6642 case X86_VENDOR_Intel: 6643 /* 6644 * brand id first appeared on Pentium III Xeon model 8, 6645 * and Celeron model 8 processors and Opteron 6646 */ 6647 create = cpi->cpi_family > 6 || 6648 (cpi->cpi_family == 6 && cpi->cpi_model >= 8); 6649 break; 6650 case X86_VENDOR_AMD: 6651 create = 
cpi->cpi_family >= 0xf;
6652 break;
6653 default:
6654 create = 0;
6655 break;
6656 }
6657 if (create && cpi->cpi_brandid != 0) {
6658 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6659 "brand-id", cpi->cpi_brandid);
6660 }
6661
6662 /* chunks, and apic-id */
6663 switch (cpi->cpi_vendor) {
6664 /*
6665 * first available on Pentium IV and Opteron (K8)
6666 */
6667 case X86_VENDOR_Intel:
6668 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6669 break;
6670 case X86_VENDOR_AMD:
6671 create = cpi->cpi_family >= 0xf;
6672 break;
6673 default:
6674 create = 0;
6675 break;
6676 }
6677 if (create) {
6678 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6679 "chunks", CPI_CHUNKS(cpi));
6680 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6681 "apic-id", cpi->cpi_apicid);
6682 if (cpi->cpi_chipid >= 0) {
6683 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6684 "chip#", cpi->cpi_chipid);
6685 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6686 "clog#", cpi->cpi_clogid);
6687 }
6688 }
6689
6690 /* cpuid-features */
6691 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6692 "cpuid-features", CPI_FEATURES_EDX(cpi));
6693
6694
6695 /* cpuid-features-ecx */
6696 switch (cpi->cpi_vendor) {
6697 case X86_VENDOR_Intel:
6698 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6699 break;
6700 case X86_VENDOR_AMD:
6701 create = cpi->cpi_family >= 0xf;
6702 break;
6703 default:
6704 create = 0;
6705 break;
6706 }
6707 if (create)
6708 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6709 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6710
6711 /* ext-cpuid-features */
6712 switch (cpi->cpi_vendor) {
6713 case X86_VENDOR_Intel:
6714 case X86_VENDOR_AMD:
6715 case X86_VENDOR_Cyrix:
6716 case X86_VENDOR_TM:
6717 case X86_VENDOR_Centaur:
6718 create = cpi->cpi_xmaxeax >= 0x80000001;
6719 break;
6720 default:
6721 create = 0;
6722 break;
6723 }
6724 if (create) {
6725 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6726 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6727 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6728 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6729 }
6730
6731 /*
6732 * Brand String first appeared in Intel Pentium IV, AMD K5
6733 * model 1, and Cyrix GXm. On earlier models we try to
6734 * simulate something similar .. so this string should always
6735 * say -something- about the processor, however lame.
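 * The string exported here is the one cleaned up at the end of pass 3
 * above: leading and trailing spaces stripped, "Genuine "/"Authentic "
 * prefixes removed, and (R)/(TM) folded to lower case.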
6736 */ 6737 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6738 "brand-string", cpi->cpi_brandstr); 6739 6740 /* 6741 * Finally, cache and tlb information 6742 */ 6743 switch (x86_which_cacheinfo(cpi)) { 6744 case X86_VENDOR_Intel: 6745 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props); 6746 break; 6747 case X86_VENDOR_Cyrix: 6748 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props); 6749 break; 6750 case X86_VENDOR_AMD: 6751 amd_cache_info(cpi, cpu_devi); 6752 break; 6753 default: 6754 break; 6755 } 6756 } 6757 6758 struct l2info { 6759 int *l2i_csz; 6760 int *l2i_lsz; 6761 int *l2i_assoc; 6762 int l2i_ret; 6763 }; 6764 6765 /* 6766 * A cacheinfo walker that fetches the size, line-size and associativity 6767 * of the L2 cache 6768 */ 6769 static int 6770 intel_l2cinfo(void *arg, const struct cachetab *ct) 6771 { 6772 struct l2info *l2i = arg; 6773 int *ip; 6774 6775 if (ct->ct_label != l2_cache_str && 6776 ct->ct_label != sl2_cache_str) 6777 return (0); /* not an L2 -- keep walking */ 6778 6779 if ((ip = l2i->l2i_csz) != NULL) 6780 *ip = ct->ct_size; 6781 if ((ip = l2i->l2i_lsz) != NULL) 6782 *ip = ct->ct_line_size; 6783 if ((ip = l2i->l2i_assoc) != NULL) 6784 *ip = ct->ct_assoc; 6785 l2i->l2i_ret = ct->ct_size; 6786 return (1); /* was an L2 -- terminate walk */ 6787 } 6788 6789 /* 6790 * AMD L2/L3 Cache and TLB Associativity Field Definition: 6791 * 6792 * Unlike the associativity for the L1 cache and tlb where the 8 bit 6793 * value is the associativity, the associativity for the L2 cache and 6794 * tlb is encoded in the following table. The 4 bit L2 value serves as 6795 * an index into the amd_afd[] array to determine the associativity. 6796 * -1 is undefined. 0 is fully associative. 6797 */ 6798 6799 static int amd_afd[] = 6800 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0}; 6801 6802 static void 6803 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i) 6804 { 6805 struct cpuid_regs *cp; 6806 uint_t size, assoc; 6807 int i; 6808 int *ip; 6809 6810 if (cpi->cpi_xmaxeax < 0x80000006) 6811 return; 6812 cp = &cpi->cpi_extd[6]; 6813 6814 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 && 6815 (size = BITX(cp->cp_ecx, 31, 16)) != 0) { 6816 uint_t cachesz = size * 1024; 6817 assoc = amd_afd[i]; 6818 6819 ASSERT(assoc != -1); 6820 6821 if ((ip = l2i->l2i_csz) != NULL) 6822 *ip = cachesz; 6823 if ((ip = l2i->l2i_lsz) != NULL) 6824 *ip = BITX(cp->cp_ecx, 7, 0); 6825 if ((ip = l2i->l2i_assoc) != NULL) 6826 *ip = assoc; 6827 l2i->l2i_ret = cachesz; 6828 } 6829 } 6830 6831 int 6832 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc) 6833 { 6834 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 6835 struct l2info __l2info, *l2i = &__l2info; 6836 6837 l2i->l2i_csz = csz; 6838 l2i->l2i_lsz = lsz; 6839 l2i->l2i_assoc = assoc; 6840 l2i->l2i_ret = -1; 6841 6842 switch (x86_which_cacheinfo(cpi)) { 6843 case X86_VENDOR_Intel: 6844 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo); 6845 break; 6846 case X86_VENDOR_Cyrix: 6847 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo); 6848 break; 6849 case X86_VENDOR_AMD: 6850 amd_l2cacheinfo(cpi, l2i); 6851 break; 6852 default: 6853 break; 6854 } 6855 return (l2i->l2i_ret); 6856 } 6857 6858 #if !defined(__xpv) 6859 6860 uint32_t * 6861 cpuid_mwait_alloc(cpu_t *cpu) 6862 { 6863 uint32_t *ret; 6864 size_t mwait_size; 6865 6866 ASSERT(cpuid_checkpass(CPU, 2)); 6867 6868 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max; 6869 if (mwait_size == 0) 6870 return (NULL); 6871 6872 /* 6873 * kmem_alloc() returns cache line size aligned data for 
mwait_size
6874 * allocations. mwait_size is currently cache line sized. Neither
6875 * of these implementation details are guaranteed to be true in the
6876 * future.
6877 *
6878 * First try allocating mwait_size as kmem_alloc() currently returns
6879 * correctly aligned memory. If kmem_alloc() does not return
6880 * mwait_size aligned memory, then use mwait_size ROUNDUP.
6881 *
6882 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6883 * decide to free this memory.
6884 */
6885 ret = kmem_zalloc(mwait_size, KM_SLEEP);
6886 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6887 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6888 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6889 *ret = MWAIT_RUNNING;
6890 return (ret);
6891 } else {
6892 kmem_free(ret, mwait_size);
6893 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6894 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6895 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6896 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6897 *ret = MWAIT_RUNNING;
6898 return (ret);
6899 }
6900 }
6901
6902 void
6903 cpuid_mwait_free(cpu_t *cpu)
6904 {
6905 if (cpu->cpu_m.mcpu_cpi == NULL) {
6906 return;
6907 }
6908
6909 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6910 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6911 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6912 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6913 }
6914
6915 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6916 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6917 }
6918
6919 void
6920 patch_tsc_read(int flag)
6921 {
6922 size_t cnt;
6923
6924 switch (flag) {
6925 case TSC_NONE:
6926 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6927 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6928 break;
6929 case TSC_RDTSC_MFENCE:
6930 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6931 (void) memcpy((void *)tsc_read,
6932 (void *)&_tsc_mfence_start, cnt);
6933 break;
6934 case TSC_RDTSC_LFENCE:
6935 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6936 (void) memcpy((void *)tsc_read,
6937 (void *)&_tsc_lfence_start, cnt);
6938 break;
6939 case TSC_TSCP:
6940 cnt = &_tscp_end - &_tscp_start;
6941 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6942 break;
6943 default:
6944 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6945 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
6946 break;
6947 }
6948 tsc_type = flag;
6949 }
6950
6951 int
6952 cpuid_deep_cstates_supported(void)
6953 {
6954 struct cpuid_info *cpi;
6955 struct cpuid_regs regs;
6956
6957 ASSERT(cpuid_checkpass(CPU, 1));
6958
6959 cpi = CPU->cpu_m.mcpu_cpi;
6960
6961 if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6962 return (0);
6963
6964 switch (cpi->cpi_vendor) {
6965 case X86_VENDOR_Intel:
6966 if (cpi->cpi_xmaxeax < 0x80000007)
6967 return (0);
6968
6969 /*
6970 * Does the TSC run at a constant rate in all ACPI C-states?
6971 */
6972 regs.cp_eax = 0x80000007;
6973 (void) __cpuid_insn(&regs);
6974 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6975
6976 default:
6977 return (0);
6978 }
6979 }
6980
6981 #endif /* !__xpv */
6982
6983 void
6984 post_startup_cpu_fixups(void)
6985 {
6986 #ifndef __xpv
6987 /*
6988 * Some AMD processors support C1E state. Entering this state will
6989 * cause the local APIC timer to stop, which we can't deal with at
6990 * this time.
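 * The workaround below guards the MSR access with on_trap() and then
 * clears the BIOS-enabled "action on compare-halt" bits
 * (AMD_ACTONCMPHALT_MASK) so that C1E is not entered.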
void
post_startup_cpu_fixups(void)
{
#ifndef __xpv
	/*
	 * Some AMD processors support C1E state. Entering this state will
	 * cause the local APIC timer to stop, which we can't deal with at
	 * this time.
	 */
	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
		on_trap_data_t otd;
		uint64_t reg;

		if (!on_trap(&otd, OT_DATA_ACCESS)) {
			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
			/* Disable C1E state if it is enabled by BIOS */
			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
			    AMD_ACTONCMPHALT_MASK) {
				reg &= ~(AMD_ACTONCMPHALT_MASK <<
				    AMD_ACTONCMPHALT_SHIFT);
				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
			}
		}
		no_trap();
	}
#endif	/* !__xpv */
}

void
enable_pcid(void)
{
	if (x86_use_pcid == -1)
		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);

	if (x86_use_invpcid == -1) {
		x86_use_invpcid = is_x86_feature(x86_featureset,
		    X86FSET_INVPCID);
	}

	if (!x86_use_pcid)
		return;

	/*
	 * Intel says that on setting PCIDE, it immediately starts using the
	 * PCID bits; better make sure there's nothing there.
	 */
	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);

	setcr4(getcr4() | CR4_PCIDE);
}

/*
 * Set up the necessary registers to enable the XSAVE feature on this
 * processor. This function needs to be called early enough, so that no
 * xsave/xrstor ops will execute on the processor before the MSRs are
 * properly set up.
 *
 * Current implementation has the following assumptions:
 * - cpuid_pass1() is done, so that X86 features are known.
 * - fpu_probe() is done, so that fp_save_mech is chosen.
 */
void
xsave_setup_msr(cpu_t *cpu)
{
	ASSERT(fp_save_mech == FP_XSAVE);
	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));

	/* Enable OSXSAVE in CR4. */
	setcr4(getcr4() | CR4_OSXSAVE);
	/*
	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
	 * the correct value.
	 */
	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
	setup_xfem();
}
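
/*
 * Illustrative sketch (editor's example, not wired into the build): the
 * read-modify-write pattern used by post_startup_cpu_fixups() above to clear
 * the ACTONCMPHALT field. Given a field described by a mask and a shift, the
 * field is tested and, if set, cleared without disturbing the other bits of
 * the register value. The names below are hypothetical.
 */
static uint64_t
example_clear_msr_field(uint64_t reg, uint64_t mask, uint_t shift)
{
	if ((reg >> shift) & mask)		/* field currently non-zero? */
		reg &= ~(mask << shift);	/* clear just that field */
	return (reg);
}
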
/*
 * Starting with the Westmere processor, the local APIC timer will continue
 * running in all C-states, including the deepest C-states.
 */
int
cpuid_arat_supported(void)
{
	struct cpuid_info *cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));
	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));

	cpi = CPU->cpu_m.mcpu_cpi;

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		/*
		 * Always-running Local APIC Timer is
		 * indicated by CPUID.6.EAX[2].
		 */
		if (cpi->cpi_maxeax >= 6) {
			regs.cp_eax = 6;
			(void) cpuid_insn(NULL, &regs);
			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
		} else {
			return (0);
		}
	default:
		return (0);
	}
}

/*
 * Check support for Intel ENERGY_PERF_BIAS feature
 */
int
cpuid_iepb_supported(struct cpu *cp)
{
	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(cp, 1));

	if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
	    !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
		return (0);
	}

	/*
	 * Intel ENERGY_PERF_BIAS MSR is indicated by
	 * capability bit CPUID.6.ECX.3
	 */
	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
		return (0);

	regs.cp_eax = 0x6;
	(void) cpuid_insn(NULL, &regs);
	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
}

/*
 * Check support for TSC deadline timer
 *
 * TSC deadline timer provides a superior software programming
 * model over local APIC timer that eliminates "time drifts".
 * Instead of specifying a relative time, software specifies an
 * absolute time as the target at which the processor should
 * generate a timer event.
 */
int
cpuid_deadline_tsc_supported(void)
{
	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));
	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		if (cpi->cpi_maxeax >= 1) {
			regs.cp_eax = 1;
			(void) cpuid_insn(NULL, &regs);
			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
		} else {
			return (0);
		}
	default:
		return (0);
	}
}

#if defined(__amd64) && !defined(__xpv)
/*
 * Patch in versions of bcopy for high performance Intel Nehalem (Nhm)
 * processors and later...
 */
void
patch_memops(uint_t vendor)
{
	size_t cnt, i;
	caddr_t to, from;

	if ((vendor == X86_VENDOR_Intel) &&
	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
		cnt = &bcopy_patch_end - &bcopy_patch_start;
		to = &bcopy_ck_size;
		from = &bcopy_patch_start;
		for (i = 0; i < cnt; i++) {
			*to++ = *from++;
		}
	}
}
#endif	/* __amd64 && !__xpv */

/*
 * We're being asked to tell the system how many bits are required to
 * represent the various thread and strand IDs. While it's tempting to derive
 * this based on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that
 * isn't quite correct. Instead, this needs to be based on the number of bits
 * that the APIC allows for these different configurations. We only update
 * these to a larger value if we find one.
 */
void
cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
{
	struct cpuid_info *cpi;

	VERIFY(cpuid_checkpass(CPU, 1));
	cpi = cpu->cpu_m.mcpu_cpi;

	if (cpi->cpi_ncore_bits > *core_nbits) {
		*core_nbits = cpi->cpi_ncore_bits;
	}

	if (cpi->cpi_nthread_bits > *strand_nbits) {
		*strand_nbits = cpi->cpi_nthread_bits;
	}
}
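
/*
 * Illustrative sketch (editor's example, not wired into the build): the
 * common pattern shared by cpuid_arat_supported(), cpuid_iepb_supported()
 * and cpuid_deadline_tsc_supported() above -- verify the leaf is implemented
 * via cpi_maxeax, issue the leaf, and test one capability bit. The function
 * name is hypothetical; 'bit' is a mask such as CPUID_INTC_ECX_PERFBIAS.
 */
static int
example_leaf_ecx_bit(struct cpuid_info *cpi, uint32_t leaf, uint32_t bit)
{
	struct cpuid_regs regs;

	if (cpi->cpi_vendor != X86_VENDOR_Intel || cpi->cpi_maxeax < leaf)
		return (0);

	regs.cp_eax = leaf;
	(void) cpuid_insn(NULL, &regs);
	return ((regs.cp_ecx & bit) != 0);
}
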
void
cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	struct cpuid_regs cp;

	/*
	 * Reread the CPUID portions that we need for various security
	 * information.
	 */
	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
		/*
		 * Check if we now have leaf 7 available to us.
		 */
		if (cpi->cpi_maxeax < 7) {
			bzero(&cp, sizeof (cp));
			cp.cp_eax = 0;
			cpi->cpi_maxeax = __cpuid_insn(&cp);
			if (cpi->cpi_maxeax < 7)
				return;
		}

		bzero(&cp, sizeof (cp));
		cp.cp_eax = 7;
		cp.cp_ecx = 0;
		(void) __cpuid_insn(&cp);
		cpi->cpi_std[7] = cp;
	} else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
		/* No xcpuid support */
		if (cpi->cpi_family < 5 ||
		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
			return;

		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
			bzero(&cp, sizeof (cp));
			cp.cp_eax = CPUID_LEAF_EXT_0;
			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
				return;
			}
		}

		bzero(&cp, sizeof (cp));
		cp.cp_eax = CPUID_LEAF_EXT_8;
		(void) __cpuid_insn(&cp);
		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
		cpi->cpi_extd[8] = cp;
	} else {
		/*
		 * Nothing to do here. Return an empty set which has already
		 * been zeroed for us.
		 */
		return;
	}
	cpuid_scan_security(cpu, fset);
}

/* ARGSUSED */
static int
cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
{
	uchar_t *fset;
	boolean_t first_pass = (boolean_t)arg1;

	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
	if (first_pass && CPU->cpu_id != 0)
		return (0);
	if (!first_pass && CPU->cpu_id == 0)
		return (0);
	cpuid_pass_ucode(CPU, fset);

	return (0);
}
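
/*
 * Illustrative sketch (editor's example, not wired into the build): how the
 * per-CPU slots in the argdata buffer handed to cpuid_post_ucodeadm_xc()
 * above are addressed. The buffer holds NCPU feature sets of
 * sizeof (x86_featureset) bytes each, so CPU n's slot starts at byte offset
 * n * sizeof (x86_featureset). The function name is hypothetical.
 */
static uchar_t *
example_ucode_fset_slot(void *argdata, uint_t cpun)
{
	return ((uchar_t *)((uintptr_t)argdata +
	    sizeof (x86_featureset) * cpun));
}
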
/*
 * After a microcode update where the version has changed, we need to rescan
 * CPUID. To do this we check every CPU to make sure that they have the same
 * microcode. Then we perform a cross call to all such CPUs. It's the
 * caller's job to make sure that no one else can end up doing an update
 * while this is going on.
 *
 * We assume that the system is microcode capable if we're called.
 */
void
cpuid_post_ucodeadm(void)
{
	uint32_t rev;
	int i;
	struct cpu *cpu;
	cpuset_t cpuset;
	void *argdata;
	uchar_t *f0;

	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);

	mutex_enter(&cpu_lock);
	cpu = cpu_get(0);
	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
	CPUSET_ONLY(cpuset, 0);
	for (i = 1; i < max_ncpus; i++) {
		if ((cpu = cpu_get(i)) == NULL)
			continue;

		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
			panic("post microcode update CPU %d has differing "
			    "microcode revision (%u) from CPU 0 (%u)",
			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
		}
		CPUSET_ADD(cpuset, i);
	}

	/*
	 * We do the cross calls in two passes. The first pass is only for the
	 * boot CPU. The second pass is for all of the other CPUs. This allows
	 * the boot CPU to go through and change behavior related to patching
	 * or whether or not Enhanced IBRS needs to be enabled, and then
	 * allows all other CPUs to follow suit.
	 */
	kpreempt_disable();
	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	kpreempt_enable();

	/*
	 * OK, now look at each CPU and see if their feature sets are equal.
	 */
	f0 = argdata;
	for (i = 1; i < max_ncpus; i++) {
		uchar_t *fset;
		if (!CPU_IN_SET(cpuset, i))
			continue;

		fset = (uchar_t *)((uintptr_t)argdata +
		    sizeof (x86_featureset) * i);

		if (!compare_x86_featureset(f0, fset)) {
			panic("Post microcode update CPU %d has "
			    "differing security feature (%p) set from CPU 0 "
			    "(%p), not appending to feature set", i,
			    (void *)fset, (void *)f0);
		}
	}

	mutex_exit(&cpu_lock);

	for (i = 0; i < NUM_X86_FEATURES; i++) {
		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
		    x86_feature_names[i]);
		if (is_x86_feature(f0, i)) {
			add_x86_feature(x86_featureset, i);
		}
	}
	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
}