1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net> 26 */ 27 /* 28 * Copyright (c) 2010, Intel Corporation. 29 * All rights reserved. 30 */ 31 /* 32 * Portions Copyright 2009 Advanced Micro Devices, Inc. 33 */ 34 /* 35 * Copyright 2019 Joyent, Inc. 36 */ 37 38 /* 39 * CPU Identification logic 40 * 41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal 42 * with the identification of CPUs, their features, and their topologies. More 43 * specifically, this file helps drive the following: 44 * 45 * 1. Enumeration of features of the processor which are used by the kernel to 46 * determine what features to enable or disable. These may be instruction set 47 * enhancements or features that we use. 48 * 49 * 2. Enumeration of instruction set architecture (ISA) additions that userland 50 * will be told about through the auxiliary vector. 51 * 52 * 3. Understanding the physical topology of the CPU such as the number of 53 * caches, how many cores it has, whether or not it supports symmetric 54 * multi-processing (SMT), etc. 55 * 56 * ------------------------ 57 * CPUID History and Basics 58 * ------------------------ 59 * 60 * The cpuid instruction was added by Intel roughly around the time that the 61 * original Pentium was introduced. The purpose of cpuid was to tell in a 62 * programmatic fashion information about the CPU that previously was guessed 63 * at. For example, an important part of cpuid is that we can know what 64 * extensions to the ISA exist. If you use an invalid opcode you would get a 65 * #UD, so this method allows a program (whether a user program or the kernel) 66 * to determine what exists without crashing or getting a SIGILL. Of course, 67 * this was also during the era of the clones and the AMD Am5x86. The vendor 68 * name shows up first in cpuid for a reason. 69 * 70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts 71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has 72 * its own meaning. The different leaves are broken down into different regions: 73 * 74 * [ 0, 7fffffff ] This region is called the 'basic' 75 * region. This region is generally defined 76 * by Intel, though some of the original 77 * portions have different meanings based 78 * on the manufacturer. These days, Intel 79 * adds most new features to this region. 80 * AMD adds non-Intel compatible 81 * information in the third, extended 82 * region. 
Intel uses this for everything 83 * including ISA extensions, CPU 84 * features, cache information, topology, 85 * and more. 86 * 87 * There is a hole carved out of this 88 * region which is reserved for 89 * hypervisors. 90 * 91 * [ 40000000, 4fffffff ] This region, which is found in the 92 * middle of the previous region, is 93 * explicitly promised to never be used by 94 * CPUs. Instead, it is used by hypervisors 95 * to communicate information about 96 * themselves to the operating system. The 97 * values and details are unique for each 98 * hypervisor. 99 * 100 * [ 80000000, ffffffff ] This region is called the 'extended' 101 * region. Some of the low leaves mirror 102 * parts of the basic leaves. This region 103 * has generally been used by AMD for 104 * various extensions. For example, AMD- 105 * specific information about caches, 106 * features, and topology are found in this 107 * region. 108 * 109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx, 110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of 111 * the ranges, one of the primary things returned is the maximum valid leaf in 112 * that range. This allows for discovery of what range of CPUID is valid. 113 * 114 * The CPUs have potentially surprising behavior when using an invalid leaf or 115 * unimplemented leaf. If the requested leaf is within the valid basic or 116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be 117 * set to zero. However, if you specify a leaf that is outside of a valid range, 118 * then instead it will be filled with the last valid _basic_ leaf. For example, 119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or 120 * an invalid extended leaf will return the information for leaf 3. 121 * 122 * Some leaves are broken down into sub-leaves. This means that the value 123 * depends on both the leaf asked for in %eax and a secondary register. For 124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get 125 * additional information. Or when getting topology information in leaf 0xb, the 126 * initial value in %ecx changes which level of the topology that you are 127 * getting information about. 128 * 129 * cpuid values are always kept to 32 bits regardless of whether or not the 130 * program is in 64-bit mode. When executing in 64-bit mode, the upper 131 * 32 bits of the register are always set to zero so that way the values are the 132 * same regardless of execution mode. 133 * 134 * ---------------------- 135 * Identifying Processors 136 * ---------------------- 137 * 138 * We can identify a processor in two steps. The first step looks at cpuid leaf 139 * 0. Leaf 0 contains the processor's vendor information. This is done by 140 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is 141 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'. 142 * 143 * From there, a processor is identified by a combination of three different 144 * values: 145 * 146 * 1. Family 147 * 2. Model 148 * 3. Stepping 149 * 150 * Each vendor uses the family and model to uniquely identify a processor. The 151 * way that family and model are changed depends on the vendor. For example, 152 * Intel has been using family 0x6 for almost all of their processor since the 153 * Pentium Pro/Pentium II era, often called the P6. The model is used to 154 * identify the exact processor. Different models are often used for the client 155 * (consumer) and server parts. 
 * Even though these processors often have major architectural differences,
 * they are still considered the same family by Intel.
 *
 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer is family 0x15, and Zen is
 * family 0x17. Within a family, the model number is used to help identify
 * specific processors.
 *
 * The stepping is used to refer to a revision of a specific microprocessor. The
 * term comes from equipment used to produce masks that are used to create
 * integrated circuits.
 *
 * The information is present in leaf 1, %eax. In technical documentation you
 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. If the values in either
 * are 0xf, then one is to consult the extended model and extended family, which
 * take previously reserved bits and allow for a larger number of models and add
 * 0xf to them.
 *
 * When we process this information, we store the full family, model, and
 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
 * cpi_step, respectively. Whenever you are performing comparisons with the
 * family, model, and stepping, you should use these members and not the raw
 * values from cpuid. If you must use the raw values from cpuid directly, you
 * must make sure that you add the extended model and family to the base model
 * and family.
 *
 * In general, we do not use information about the family, model, and stepping
 * to determine whether or not a feature is present; that is generally driven by
 * specific leaves. However, when something we care about on the processor is
 * not considered 'architectural', meaning that it is specific to a set of
 * processors and not promised in the architecture model to be consistent from
 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata
 * in the processor, are dealing with processor-specific features such as CPU
 * performance counters, or want to provide additional information for things
 * such as fault management.
 *
 * While processors also have a brand string, which is the name that people are
 * familiar with when buying the processor, it is not meant for programmatic
 * consumption. That is what the family, model, and stepping are for.
 *
 * ------------
 * CPUID Passes
 * ------------
 *
 * As part of performing feature detection, we break this into several different
 * passes. The passes are as follows:
 *
 * Pass 0 This is a primordial pass done in locore.s to deal with Cyrix CPUs
 * that don't support cpuid. The reality is that we likely don't run on them
 * any more, but there is still logic for handling them.
 *
 * Pass 1 This is the primary pass and is responsible for doing a large number
 * of different things:
 *
 * 1. Determining which vendor manufactured the CPU and the family, model,
 * and stepping information.
 *
 * 2. Gathering a large number of feature flags to determine which features
 * the CPU supports and which indicate things that we need to do other work in
 * the OS to enable. Features detected this way are added to the
 * x86_featureset, which can be queried to determine what we should do.
This includes processing 222 * all of the basic and extended CPU features that we care 223 * about. 224 * 225 * 3. Determining the CPU's topology. This includes 226 * information about how many cores and threads are present 227 * in the package. It also is responsible for figuring out 228 * which logical CPUs are potentially part of the same core 229 * and what other resources they might share. For more 230 * information see the 'Topology' section. 231 * 232 * 4. Determining the set of CPU security-specific features 233 * that we need to worry about and determine the 234 * appropriate set of workarounds. 235 * 236 * Pass 1 on the boot CPU occurs before KMDB is started. 237 * 238 * Pass 2 The second pass is done after startup(). Here, we check 239 * other miscellaneous features. Most of this is gathering 240 * additional basic and extended features that we'll use in 241 * later passes or for debugging support. 242 * 243 * Pass 3 The third pass occurs after the kernel memory allocator 244 * has been fully initialized. This gathers information 245 * where we might need dynamic memory available for our 246 * uses. This includes several varying width leaves that 247 * have cache information and the processor's brand string. 248 * 249 * Pass 4 The fourth and final normal pass is performed after the 250 * kernel has brought most everything online. This is 251 * invoked from post_startup(). In this pass, we go through 252 * the set of features that we have enabled and turn that 253 * into the hardware auxiliary vector features that 254 * userland receives. This is used by userland, primarily 255 * by the run-time link-editor (RTLD), though userland 256 * software could also refer to it directly. 257 * 258 * Microcode After a microcode update, we do a selective rescan of 259 * the cpuid leaves to determine what features have 260 * changed. Microcode updates can provide more details 261 * about security related features to deal with issues like 262 * Spectre and L1TF. On occasion, vendors have violated 263 * their contract and removed bits. However, we don't try 264 * to detect that because that puts us in a situation that 265 * we really can't deal with. As such, the only thing we 266 * rescan are security related features today. See 267 * cpuid_pass_ucode(). 268 * 269 * All of the passes (except pass 0) are run on all CPUs. However, for the most 270 * part we only care about what the boot CPU says about this information and use 271 * the other CPUs as a rough guide to sanity check that we have the same feature 272 * set. 273 * 274 * We do not support running multiple logical CPUs with disjoint, let alone 275 * different, feature sets. 276 * 277 * ------------------ 278 * Processor Topology 279 * ------------------ 280 * 281 * One of the important things that we need to do is to understand the topology 282 * of the underlying processor. When we say topology in this case, we're trying 283 * to understand the relationship between the logical CPUs that the operating 284 * system sees and the underlying physical layout. Different logical CPUs may 285 * share different resources which can have important consequences for the 286 * performance of the system. For example, they may share caches, execution 287 * units, and more. 288 * 289 * The topology of the processor changes from generation to generation and 290 * vendor to vendor. Along with that, different vendors use different 291 * terminology, and the operating system itself uses occasionally overlapping 292 * terminology. 
 * It's important to understand what this topology looks like so one can
 * understand the different things that we try to calculate and determine.
 *
 * To get started, let's talk about a little bit of terminology that we've used
 * so far, that is used throughout this file, and that is fairly generic across
 * multiple vendors:
 *
 * CPU
 * A central processing unit (CPU) refers to a logical and/or virtual entity
 * that the operating system can execute instructions on. The underlying
 * resources for this CPU may be shared between multiple entities; however, to
 * the operating system it is a discrete unit.
 *
 * PROCESSOR and PACKAGE
 *
 * Generally, when we use the term 'processor' on its own, we are referring to
 * the physical entity that one buys and plugs into a board. However, because
 * processor has been overloaded and one might see it used to mean multiple
 * different levels, we will instead use the term 'package' for the rest of
 * this file. The term package comes from the electrical engineering side and
 * refers to the physical entity that encloses the electronics inside. Strictly
 * speaking, the package can contain more than just the CPU; for example, on
 * many processors it may also have what's called an 'integrated graphics
 * processing unit (GPU)'. Because the package can encapsulate multiple units,
 * it is the largest physical unit that we refer to.
 *
 * SOCKET
 *
 * A socket refers to a unit on a system board (generally the motherboard)
 * that can receive a package. A single package, or processor, is plugged into
 * a single socket. A system may have multiple sockets. Oftentimes, the term
 * socket is used interchangeably with package and refers to the electrical
 * component that is plugged in, and not the receptacle itself.
 *
 * CORE
 *
 * A core refers to the physical instantiation of a CPU, generally, with a
 * full set of hardware resources available to it. A package may contain
 * multiple cores inside of it or it may just have a single one. A processor
 * with more than one core is often referred to as 'multi-core'. In illumos,
 * we will use the feature X86FSET_CMP to refer to a system that has
 * 'multi-core' processors.
 *
 * A core may expose a single logical CPU to the operating system, or it may
 * expose multiple CPUs, which we call threads, defined below.
 *
 * Some resources may still be shared by cores in the same package. For
 * example, many processors will share the level 3 cache between cores. Some
 * AMD generations share hardware resources between cores. For more
 * information on that see the section 'AMD Topology'.
 *
 * THREAD and STRAND
 *
 * In this file, a thread generally refers to a hardware resource and not the
 * operating system's logical abstraction. A thread is always exposed as an
 * independent logical CPU to the operating system. A thread belongs to a
 * specific core. A core may have more than one thread. When that is the case,
 * the threads that are part of the same core are often referred to as
 * 'siblings'.
 *
 * When multiple threads exist, this is generally referred to as simultaneous
 * multi-threading (SMT). When Intel introduced this in their processors they
 * called it hyper-threading (HT). When multiple threads are active in a core,
 * they split the resources of the core.
For example, 358 * two threads may share the same set of hardware execution units. 359 * 360 * The operating system often uses the term 'strand' to refer to a thread. 361 * This helps disambiguate it from the software concept. 362 * 363 * CHIP 364 * 365 * Unfortunately, the term 'chip' is dramatically overloaded. At its most 366 * base meaning, it is used to refer to a single integrated circuit, which 367 * may or may not be the only thing in the package. In illumos, when you 368 * see the term 'chip' it is almost always referring to the same thing as 369 * the 'package'. However, many vendors may use chip to refer to one of 370 * many integrated circuits that have been placed in the package. As an 371 * example, see the subsequent definition. 372 * 373 * To try and keep things consistent, we will only use chip when referring 374 * to the entire integrated circuit package, with the exception of the 375 * definition of multi-chip module (because it is in the name) and use the 376 * term 'die' when we want the more general, potential sub-component 377 * definition. 378 * 379 * DIE 380 * 381 * A die refers to an integrated circuit. Inside of the package there may 382 * be a single die or multiple dies. This is sometimes called a 'chip' in 383 * vendor's parlance, but in this file, we use the term die to refer to a 384 * subcomponent. 385 * 386 * MULTI-CHIP MODULE 387 * 388 * A multi-chip module (MCM) refers to putting multiple distinct chips that 389 * are connected together in the same package. When a multi-chip design is 390 * used, generally each chip is manufactured independently and then joined 391 * together in the package. For example, on AMD's Zen microarchitecture 392 * (family 0x17), the package contains several dies (the second meaning of 393 * chip from above) that are connected together. 394 * 395 * CACHE 396 * 397 * A cache is a part of the processor that maintains copies of recently 398 * accessed memory. Caches are split into levels and then into types. 399 * Commonly there are one to three levels, called level one, two, and 400 * three. The lower the level, the smaller it is, the closer it is to the 401 * execution units of the CPU, and the faster it is to access. The layout 402 * and design of the cache come in many different flavors, consult other 403 * resources for a discussion of those. 404 * 405 * Caches are generally split into two types, the instruction and data 406 * cache. The caches contain what their names suggest, the instruction 407 * cache has executable program text, while the data cache has all other 408 * memory that the processor accesses. As of this writing, data is kept 409 * coherent between all of the caches on x86, so if one modifies program 410 * text before it is executed, that will be in the data cache, and the 411 * instruction cache will be synchronized with that change when the 412 * processor actually executes those instructions. This coherency also 413 * covers the fact that data could show up in multiple caches. 414 * 415 * Generally, the lowest level caches are specific to a core. However, the 416 * last layer cache is shared between some number of cores. The number of 417 * CPUs sharing this last level cache is important. This has implications 418 * for the choices that the scheduler makes, as accessing memory that might 419 * be in a remote cache after thread migration can be quite expensive. 420 * 421 * Sometimes, the word cache is abbreviated with a '$', because in US 422 * English the word cache is pronounced the same as cash. 
So L1D$ refers to 423 * the L1 data cache, and L2$ would be the L2 cache. This will not be used 424 * in the rest of this theory statement for clarity. 425 * 426 * MEMORY CONTROLLER 427 * 428 * The memory controller is a component that provides access to DRAM. Each 429 * memory controller can access a set number of DRAM channels. Each channel 430 * can have a number of DIMMs (sticks of memory) associated with it. A 431 * given package may have more than one memory controller. The association 432 * of the memory controller to a group of cores is important as it is 433 * cheaper to access memory on the controller that you are associated with. 434 * 435 * NUMA 436 * 437 * NUMA or non-uniform memory access, describes a way that systems are 438 * built. On x86, any processor core can address all of the memory in the 439 * system. However, When using multiple sockets or possibly within a 440 * multi-chip module, some of that memory is physically closer and some of 441 * it is further. Memory that is further away is more expensive to access. 442 * Consider the following image of multiple sockets with memory: 443 * 444 * +--------+ +--------+ 445 * | DIMM A | +----------+ +----------+ | DIMM D | 446 * +--------+-+ | | | | +-+------+-+ 447 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E | 448 * +--------+-+ | | | | +-+------+-+ 449 * | DIMM C | +----------+ +----------+ | DIMM F | 450 * +--------+ +--------+ 451 * 452 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is 453 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to 454 * access DIMMs A-C and more expensive to access D-F as it has to go 455 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs 456 * D-F are cheaper than A-C. While the socket form is the most common, when 457 * using multi-chip modules, this can also sometimes occur. For another 458 * example of this that's more involved, see the AMD topology section. 459 * 460 * 461 * Intel Topology 462 * -------------- 463 * 464 * Most Intel processors since Nehalem, (as of this writing the current gen 465 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of 466 * the package is a single monolithic die. MCMs currently aren't used. Most 467 * parts have three levels of caches, with the L3 cache being shared between 468 * all of the cores on the package. The L1/L2 cache is generally specific to 469 * an individual core. The following image shows at a simplified level what 470 * this looks like. The memory controller is commonly part of something called 471 * the 'Uncore', that used to be separate physical chips that were not a part of 472 * the package, but are now part of the same chip. 
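 *
 * As a brief aside before the package diagram below: throughout this comment
 * we talk about placing a leaf in %eax (and, for sub-leaves, a value in %ecx)
 * and then issuing the cpuid instruction. A minimal, hedged sketch of what
 * that looks like from C with inline assembly follows. This is an
 * illustration only, not a routine used by this file, and the helper name
 * cpuid_subleaf_sketch is made up:
 *
 *        static inline void
 *        cpuid_subleaf_sketch(uint32_t leaf, uint32_t subleaf,
 *            uint32_t regs[4])
 *        {
 *                __asm__ __volatile__("cpuid"
 *                    : "=a" (regs[0]), "=b" (regs[1]),
 *                      "=c" (regs[2]), "=d" (regs[3])
 *                    : "a" (leaf), "c" (subleaf));
 *        }
 *
 * The instruction takes its inputs in %eax and %ecx and overwrites all four
 * registers with its outputs; passing zero in %ecx covers the plain,
 * sub-leaf-less case described earlier. Note that this sketch assumes the
 * compiler is free to use %ebx as an output here, which is not true of every
 * build environment.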
473 * 474 * +-----------------------------------------------------------------------+ 475 * | Package | 476 * | +-------------------+ +-------------------+ +-------------------+ | 477 * | | Core | | Core | | Core | | 478 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | 479 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | | 480 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | | 481 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | | 482 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | | 483 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | 484 * | | +--------------+ | | +--------------+ | | +--------------+ | | 485 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | | 486 * | | +--------------+ | | +--------------+ | | +--------------+ | | 487 * | +-------------------+ +-------------------+ +-------------------+ | 488 * | +-------------------------------------------------------------------+ | 489 * | | Shared L3 Cache | | 490 * | +-------------------------------------------------------------------+ | 491 * | +-------------------------------------------------------------------+ | 492 * | | Memory Controller | | 493 * | +-------------------------------------------------------------------+ | 494 * +-----------------------------------------------------------------------+ 495 * 496 * A side effect of this current architecture is that what we care about from a 497 * scheduling and topology perspective, is simplified. In general we care about 498 * understanding which logical CPUs are part of the same core and socket. 499 * 500 * To determine the relationship between threads and cores, Intel initially used 501 * the identifier in the advanced programmable interrupt controller (APIC). They 502 * also added cpuid leaf 4 to give additional information about the number of 503 * threads and CPUs in the processor. With the addition of x2apic (which 504 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an 505 * additional cpuid topology leaf 0xB was added. 506 * 507 * AMD Topology 508 * ------------ 509 * 510 * When discussing AMD topology, we want to break this into three distinct 511 * generations of topology. There's the basic topology that has been used in 512 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced 513 * with family 0x15 (Bulldozer), and there's the topology that was introduced 514 * with family 0x17 (Zen). AMD also has some additional terminology that's worth 515 * talking about. 516 * 517 * Until the introduction of family 0x17 (Zen), AMD did not implement something 518 * that they considered SMT. Whether or not the AMD processors have SMT 519 * influences many things including scheduling and reliability, availability, 520 * and serviceability (RAS) features. 521 * 522 * NODE 523 * 524 * AMD uses the term node to refer to a die that contains a number of cores 525 * and I/O resources. Depending on the processor family and model, more 526 * than one node can be present in the package. When there is more than one 527 * node this indicates a multi-chip module. Usually each node has its own 528 * access to memory and I/O devices. This is important and generally 529 * different from the corresponding Intel Nehalem-Skylake+ processors. As a 530 * result, we track this relationship in the operating system. 
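 *
 * As a hedged sketch of how that relationship can be discovered on such
 * parts, one might pull the node information out of %ecx of leaf 0x8000001E
 * (described further in the CPUID LEAVES section below), here using the
 * hypothetical cpuid_subleaf_sketch() helper shown earlier. The bit layout
 * comes from AMD's documentation and is an assumption of this sketch rather
 * than something this comment otherwise relies on:
 *
 *        uint32_t regs[4], node_id, nodes_per_pkg;
 *
 *        cpuid_subleaf_sketch(0x8000001E, 0, regs);
 *        node_id = regs[2] & 0xff;
 *        nodes_per_pkg = ((regs[2] >> 8) & 0x7) + 1;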
531 * 532 * In processors with an L3 cache, the L3 cache is generally shared across 533 * the entire node, though the way this is carved up varies from generation 534 * to generation. 535 * 536 * BULLDOZER 537 * 538 * Starting with the Bulldozer family (0x15) and continuing until the 539 * introduction of the Zen microarchitecture, AMD introduced the idea of a 540 * compute unit. In a compute unit, two traditional cores share a number of 541 * hardware resources. Critically, they share the FPU, L1 instruction 542 * cache, and the L2 cache. Several compute units were then combined inside 543 * of a single node. Because the integer execution units, L1 data cache, 544 * and some other resources were not shared between the cores, AMD never 545 * considered this to be SMT. 546 * 547 * ZEN 548 * 549 * The Zen family (0x17) uses a multi-chip module (MCM) design, the module 550 * is called Zeppelin. These modules are similar to the idea of nodes used 551 * previously. Each of these nodes has two DRAM channels which all of the 552 * cores in the node can access uniformly. These nodes are linked together 553 * in the package, creating a NUMA environment. 554 * 555 * The Zeppelin die itself contains two different 'core complexes'. Each 556 * core complex consists of four cores which each have two threads, for a 557 * total of 8 logical CPUs per complex. Unlike other generations, 558 * where all the logical CPUs in a given node share the L3 cache, here each 559 * core complex has its own shared L3 cache. 560 * 561 * A further thing that we need to consider is that in some configurations, 562 * particularly with the Threadripper line of processors, not every die 563 * actually has its memory controllers wired up to actual memory channels. 564 * This means that some cores have memory attached to them and others 565 * don't. 566 * 567 * To put Zen in perspective, consider the following images: 568 * 569 * +--------------------------------------------------------+ 570 * | Core Complex | 571 * | +-------------------+ +-------------------+ +---+ | 572 * | | Core +----+ | | Core +----+ | | | | 573 * | | +--------+ | L2 | | | +--------+ | L2 | | | | | 574 * | | | Thread | +----+ | | | Thread | +----+ | | | | 575 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | | 576 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | | 577 * | | +--------+ +--+ | | +--------+ +--+ | | | | 578 * | +-------------------+ +-------------------+ | C | | 579 * | +-------------------+ +-------------------+ | a | | 580 * | | Core +----+ | | Core +----+ | | c | | 581 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | | 582 * | | | Thread | +----+ | | | Thread | +----+ | | e | | 583 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | | 584 * | | | Thread | |L1| | | | Thread | |L1| | | | | 585 * | | +--------+ +--+ | | +--------+ +--+ | | | | 586 * | +-------------------+ +-------------------+ +---+ | 587 * | | 588 * +--------------------------------------------------------+ 589 * 590 * This first image represents a single Zen core complex that consists of four 591 * cores. 592 * 593 * 594 * +--------------------------------------------------------+ 595 * | Zeppelin Die | 596 * | +--------------------------------------------------+ | 597 * | | I/O Units (PCIe, SATA, USB, etc.) 
| | 598 * | +--------------------------------------------------+ | 599 * | HH | 600 * | +-----------+ HH +-----------+ | 601 * | | | HH | | | 602 * | | Core |==========| Core | | 603 * | | Complex |==========| Complex | | 604 * | | | HH | | | 605 * | +-----------+ HH +-----------+ | 606 * | HH | 607 * | +--------------------------------------------------+ | 608 * | | Memory Controller | | 609 * | +--------------------------------------------------+ | 610 * | | 611 * +--------------------------------------------------------+ 612 * 613 * This image represents a single Zeppelin Die. Note how both cores are 614 * connected to the same memory controller and I/O units. While each core 615 * complex has its own L3 cache as seen in the first image, they both have 616 * uniform access to memory. 617 * 618 * 619 * PP PP 620 * PP PP 621 * +----------PP---------------------PP---------+ 622 * | PP PP | 623 * | +-----------+ +-----------+ | 624 * | | | | | | 625 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM 626 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM 627 * | | | | | | 628 * | +-----------+ooo ...+-----------+ | 629 * | HH ooo ... HH | 630 * | HH oo.. HH | 631 * | HH ..oo HH | 632 * | HH ... ooo HH | 633 * | +-----------+... ooo+-----------+ | 634 * | | | | | | 635 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM 636 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM 637 * | | | | | | 638 * | +-----------+ +-----------+ | 639 * | PP PP | 640 * +----------PP---------------------PP---------+ 641 * PP PP 642 * PP PP 643 * 644 * This image represents a single Zen package. In this example, it has four 645 * Zeppelin dies, though some configurations only have a single one. In this 646 * example, each die is directly connected to the next. Also, each die is 647 * represented as being connected to memory by the 'M' character and connected 648 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin 649 * die is made up of two core complexes, we have multiple different NUMA 650 * domains that we care about for these systems. 651 * 652 * CPUID LEAVES 653 * 654 * There are a few different CPUID leaves that we can use to try and understand 655 * the actual state of the world. As part of the introduction of family 0xf, AMD 656 * added CPUID leaf 0x80000008. This leaf tells us the number of logical 657 * processors that are in the system. Because families before Zen didn't have 658 * SMT, this was always the number of cores that were in the system. However, it 659 * should always be thought of as the number of logical threads to be consistent 660 * between generations. In addition we also get the size of the APIC ID that is 661 * used to represent the number of logical processors. This is important for 662 * deriving topology information. 663 * 664 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a 665 * bit between Bulldozer and later families, but it is quite useful in 666 * determining the topology information. Because this information has changed 667 * across family generations, it's worth calling out what these mean 668 * explicitly. The registers have the following meanings: 669 * 670 * %eax The APIC ID. The entire register is defined to have a 32-bit 671 * APIC ID, even though on systems without x2apic support, it will 672 * be limited to 8 bits. 673 * 674 * %ebx On Bulldozer-era systems this contains information about the 675 * number of cores that are in a compute unit (cores that share 676 * resources). 
 * It also contains a per-package compute unit ID that identifies which
 * compute unit the logical CPU is a part of.
 *
 * On Zen-era systems this instead contains the number of threads per core and
 * the ID of the core that the logical CPU is a part of. Note that this ID is
 * unique only to the package; it is not globally unique across the entire
 * system.
 *
 * %ecx This contains the number of nodes that exist in the package. It also
 * contains an ID that identifies which node the logical CPU is a part of.
 *
 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
 * cache layout to determine which logical CPUs are sharing which caches.
 *
 * illumos Topology
 * ----------------
 *
 * Based on the above, we synthesize the information into several different
 * variables that we store in the 'struct cpuid_info'. We'll go into the
 * details of what each member is supposed to represent and how unique it needs
 * to be. In general, there are two levels of uniqueness that we care about. We
 * care about an ID that is globally unique. That means that it will be unique
 * across all entities in the system. For example, the default logical CPU ID
 * is globally unique. On the other hand, there is some information that we
 * only care about being unique within the context of a single package /
 * socket. Here are the variables that we keep track of and their meaning.
 *
 * Several of the values below that ask for an identifier are allowed to be
 * synthetic, with the exception of cpi_apicid.
 *
 * cpi_apicid
 *
 * This is the value of the CPU's APIC ID. This should be the full 32-bit ID
 * if the CPU is using the x2apic. Otherwise, it should be the 8-bit APIC ID.
 * This value is globally unique between all logical CPUs across all packages.
 * This is usually required by the APIC.
 *
 * cpi_chipid
 *
 * This value indicates the ID of the package that the logical CPU is a part
 * of. This value is allowed to be synthetic. It is usually derived by taking
 * the CPU's APIC ID and determining how many bits are used to represent CPU
 * cores in the package. All logical CPUs that are part of the same package
 * must have the same value.
 *
 * cpi_coreid
 *
 * This represents the ID of a CPU core. Two logical CPUs should only have the
 * same cpi_coreid value if they are part of the same core. These values may
 * be synthetic. On systems that support SMT, this value is usually derived
 * from the APIC ID; otherwise, it is often synthetic and just set to the value
 * of the cpu_id in the cpu_t.
 *
 * cpi_pkgcoreid
 *
 * This is similar to the cpi_coreid in that logical CPUs that are part of the
 * same core should have the same ID. The main difference is that these values
 * are only required to be unique to a given socket.
 *
 * cpi_clogid
 *
 * This represents the logical ID of a logical CPU. This value should be
 * unique within a given socket for each logical CPU. This is allowed to be
 * synthetic, though it is usually based on the CPU's APIC ID. The broader
 * system expects that logical CPUs that are part of the same core have
 * contiguous numbers. Concretely, with two threads per core, the cpi_clogid
 * values of a core's two logical CPUs divided by two should be equal, and
 * modulo two they should be zero and one respectively.
For example, IDs 4 and 5 746 * indicate two logical CPUs that are part of the same core. But IDs 5 and 747 * 6 represent two logical CPUs that are part of different cores. 748 * 749 * While it is common for the cpi_coreid and the cpi_clogid to be derived 750 * from the same source, strictly speaking, they don't have to be and the 751 * two values should be considered logically independent. One should not 752 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine 753 * some kind of relationship. While this is tempting, we've seen cases on 754 * AMD family 0xf where the system's cpu id is not related to its APIC ID. 755 * 756 * cpi_ncpu_per_chip 757 * 758 * This value indicates the total number of logical CPUs that exist in the 759 * physical package. Critically, this is not the number of logical CPUs 760 * that exist for just the single core. 761 * 762 * This value should be the same for all logical CPUs in the same package. 763 * 764 * cpi_ncore_per_chip 765 * 766 * This value indicates the total number of physical CPU cores that exist 767 * in the package. The system compares this value with cpi_ncpu_per_chip to 768 * determine if simultaneous multi-threading (SMT) is enabled. When 769 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and 770 * the X86FSET_HTT feature is not set. If this value is greater than one, 771 * than we consider the processor to have the feature X86FSET_CMP, to 772 * indicate that there is support for more than one core. 773 * 774 * This value should be the same for all logical CPUs in the same package. 775 * 776 * cpi_procnodes_per_pkg 777 * 778 * This value indicates the number of 'nodes' that exist in the package. 779 * When processors are actually a multi-chip module, this represents the 780 * number of such modules that exist in the package. Currently, on Intel 781 * based systems this member is always set to 1. 782 * 783 * This value should be the same for all logical CPUs in the same package. 784 * 785 * cpi_procnodeid 786 * 787 * This value indicates the ID of the node that the logical CPU is a part 788 * of. All logical CPUs that are in the same node must have the same value 789 * here. This value must be unique across all of the packages in the 790 * system. On Intel based systems, this is currently set to the value in 791 * cpi_chipid because there is only one node. 792 * 793 * cpi_cores_per_compunit 794 * 795 * This value indicates the number of cores that are part of a compute 796 * unit. See the AMD topology section for this. This member only has real 797 * meaning currently for AMD Bulldozer family processors. For all other 798 * processors, this should currently be set to 1. 799 * 800 * cpi_compunitid 801 * 802 * This indicates the compute unit that the logical CPU belongs to. For 803 * processors without AMD Bulldozer-style compute units this should be set 804 * to the value of cpi_coreid. 805 * 806 * cpi_ncpu_shr_last_cache 807 * 808 * This indicates the number of logical CPUs that are sharing the same last 809 * level cache. This value should be the same for all CPUs that are sharing 810 * that cache. The last cache refers to the cache that is closest to memory 811 * and furthest away from the CPU. 812 * 813 * cpi_last_lvl_cacheid 814 * 815 * This indicates the ID of the last cache that the logical CPU uses. This 816 * cache is often shared between multiple logical CPUs and is the cache 817 * that is closest to memory and furthest away from the CPU. 
This value 818 * should be the same for a group of logical CPUs only if they actually 819 * share the same last level cache. IDs should not overlap between 820 * packages. 821 * 822 * cpi_ncore_bits 823 * 824 * This indicates the number of bits that are required to represent all of 825 * the cores in the system. As cores are derived based on their APIC IDs, 826 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for 827 * this value to be larger than the actual number of IDs that are present 828 * in the system. This is used to size tables by the CMI framework. It is 829 * only filled in for Intel and AMD CPUs. 830 * 831 * cpi_nthread_bits 832 * 833 * This indicates the number of bits required to represent all of the IDs 834 * that cover the logical CPUs that exist on a given core. It's OK for this 835 * value to be larger than the actual number of IDs that are present in the 836 * system. This is used to size tables by the CMI framework. It is 837 * only filled in for Intel and AMD CPUs. 838 * 839 * ----------- 840 * Hypervisors 841 * ----------- 842 * 843 * If trying to manage the differences between vendors wasn't bad enough, it can 844 * get worse thanks to our friend hardware virtualization. Hypervisors are given 845 * the ability to interpose on all cpuid instructions and change them to suit 846 * their purposes. In general, this is necessary as the hypervisor wants to be 847 * able to present a more uniform set of features or not necessarily give the 848 * guest operating system kernel knowledge of all features so it can be 849 * more easily migrated between systems. 850 * 851 * When it comes to trying to determine topology information, this can be a 852 * double edged sword. When a hypervisor doesn't actually implement a cpuid 853 * leaf, it'll often return all zeros. Because of that, you'll often see various 854 * checks scattered about fields being non-zero before we assume we can use 855 * them. 856 * 857 * When it comes to topology information, the hypervisor is often incentivized 858 * to lie to you about topology. This is because it doesn't always actually 859 * guarantee that topology at all. The topology path we take in the system 860 * depends on how the CPU advertises itself. If it advertises itself as an Intel 861 * or AMD CPU, then we basically do our normal path. However, when they don't 862 * use an actual vendor, then that usually turns into multiple one-core CPUs 863 * that we enumerate that are often on different sockets. The actual behavior 864 * depends greatly on what the hypervisor actually exposes to us. 865 * 866 * -------------------- 867 * Exposing Information 868 * -------------------- 869 * 870 * We expose CPUID information in three different forms in the system. 871 * 872 * The first is through the x86_featureset variable. This is used in conjunction 873 * with the is_x86_feature() function. This is queried by x86-specific functions 874 * to determine which features are or aren't present in the system and to make 875 * decisions based upon them. For example, users of this include everything from 876 * parts of the system dedicated to reliability, availability, and 877 * serviceability (RAS), to making decisions about how to handle security 878 * mitigations, to various x86-specific drivers. General purpose or 879 * architecture independent drivers should never be calling this function. 880 * 881 * The second means is through the auxiliary vector. 
The auxiliary vector is a 882 * series of tagged data that the kernel passes down to a user program when it 883 * begins executing. This information is used to indicate to programs what 884 * instruction set extensions are present. For example, information about the 885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down 886 * since user programs cannot make use of it. However, things like the AVX 887 * instruction sets are. Programs use this information to make run-time 888 * decisions about what features they should use. As an example, the run-time 889 * link-editor (rtld) can relocate different functions depending on the hardware 890 * support available. 891 * 892 * The final form is through a series of accessor functions that all have the 893 * form cpuid_get*. This is used by a number of different subsystems in the 894 * kernel to determine more detailed information about what we're running on, 895 * topology information, etc. Some of these subsystems include processor groups 896 * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI, 897 * microcode, and performance monitoring. These functions all ASSERT that the 898 * CPU they're being called on has reached a certain cpuid pass. If the passes 899 * are rearranged, then this needs to be adjusted. 900 * 901 * ----------------------------------------------- 902 * Speculative Execution CPU Side Channel Security 903 * ----------------------------------------------- 904 * 905 * With the advent of the Spectre and Meltdown attacks which exploit speculative 906 * execution in the CPU to create side channels there have been a number of 907 * different attacks and corresponding issues that the operating system needs to 908 * mitigate against. The following list is some of the common, but not 909 * exhaustive, set of issues that we know about and have done some or need to do 910 * more work in the system to mitigate against: 911 * 912 * - Spectre v1 913 * - Spectre v2 914 * - Meltdown (Spectre v3) 915 * - Rogue Register Read (Spectre v3a) 916 * - Speculative Store Bypass (Spectre v4) 917 * - ret2spec, SpectreRSB 918 * - L1 Terminal Fault (L1TF) 919 * - Microarchitectural Data Sampling (MDS) 920 * 921 * Each of these requires different sets of mitigations and has different attack 922 * surfaces. For the most part, this discussion is about protecting the kernel 923 * from non-kernel executing environments such as user processes and hardware 924 * virtual machines. Unfortunately, there are a number of user vs. user 925 * scenarios that exist with these. The rest of this section will describe the 926 * overall approach that the system has taken to address these as well as their 927 * shortcomings. Unfortunately, not all of the above have been handled today. 928 * 929 * SPECTRE FAMILY (Spectre v2, ret2spec, SpectreRSB) 930 * 931 * The second variant of the spectre attack focuses on performing branch target 932 * injection. This generally impacts indirect call instructions in the system. 933 * There are three different ways to mitigate this issue that are commonly 934 * described today: 935 * 936 * 1. Using Indirect Branch Restricted Speculation (IBRS). 937 * 2. Using Retpolines and RSB Stuffing 938 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS) 939 * 940 * IBRS uses a feature added to microcode to restrict speculation, among other 941 * things. 
This form of mitigation has not been used as it has been generally 942 * seen as too expensive and requires reactivation upon various transitions in 943 * the system. 944 * 945 * As a less impactful alternative to IBRS, retpolines were developed by 946 * Google. These basically require one to replace indirect calls with a specific 947 * trampoline that will cause speculation to fail and break the attack. 948 * Retpolines require compiler support. We always build with retpolines in the 949 * external thunk mode. This means that a traditional indirect call is replaced 950 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect 951 * of this is that all indirect function calls are performed through a register. 952 * 953 * We have to use a common external location of the thunk and not inline it into 954 * the callsite so that way we can have a single place to patch these functions. 955 * As it turns out, we actually have three different forms of retpolines that 956 * exist in the system: 957 * 958 * 1. A full retpoline 959 * 2. An AMD-specific optimized retpoline 960 * 3. A no-op version 961 * 962 * The first one is used in the general case. The second one is used if we can 963 * determine that we're on an AMD system and we can successfully toggle the 964 * lfence serializing MSR that exists on the platform. Basically with this 965 * present, an lfence is sufficient and we don't need to do anywhere near as 966 * complicated a dance to successfully use retpolines. 967 * 968 * The third form described above is the most curious. It turns out that the way 969 * that retpolines are implemented is that they rely on how speculation is 970 * performed on a 'ret' instruction. Intel has continued to optimize this 971 * process (which is partly why we need to have return stack buffer stuffing, 972 * but more on that in a bit) and in processors starting with Cascade Lake 973 * on the server side, it's dangerous to rely on retpolines. Instead, a new 974 * mechanism has been introduced called Enhanced IBRS (EIBRS). 975 * 976 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each 977 * physical core. However, if this is the case, we don't want to use retpolines 978 * any more. Therefore if EIBRS is present, we end up turning each retpoline 979 * function (called a thunk) into a jmp instruction. This means that we're still 980 * paying the cost of an extra jump to the external thunk, but it gives us 981 * flexibility and the ability to have a single kernel image that works across a 982 * wide variety of systems and hardware features. 983 * 984 * Unfortunately, this alone is insufficient. First, Skylake systems have 985 * additional speculation for the Return Stack Buffer (RSB) which is used to 986 * return from call instructions which retpolines take advantage of. However, 987 * this problem is not just limited to Skylake and is actually more pernicious. 988 * The SpectreRSB paper introduces several more problems that can arise with 989 * dealing with this. The RSB can be poisoned just like the indirect branch 990 * predictor. This means that one needs to clear the RSB when transitioning 991 * between two different privilege domains. Some examples include: 992 * 993 * - Switching between two different user processes 994 * - Going between user land and the kernel 995 * - Returning to the kernel from a hardware virtual machine 996 * 997 * Mitigating this involves combining a couple of different things. 
The first is 998 * SMEP (supervisor mode execution protection) which was introduced in Ivy 999 * Bridge. When an RSB entry refers to a user address and we're executing in the 1000 * kernel, speculation through it will be stopped when SMEP is enabled. This 1001 * protects against a number of the different cases that we would normally be 1002 * worried about such as when we enter the kernel from user land. 1003 * 1004 * To prevent against additional manipulation of the RSB from other contexts 1005 * such as a non-root VMX context attacking the kernel we first look to enhanced 1006 * IBRS. When EIBRS is present and enabled, then there is nothing else that we 1007 * need to do to protect the kernel at this time. 1008 * 1009 * On CPUs without EIBRS we need to manually overwrite the contents of the 1010 * return stack buffer. We do this through the x86_rsb_stuff() function. 1011 * Currently this is employed on context switch. The x86_rsb_stuff() function is 1012 * disabled when enhanced IBRS is present because Intel claims on such systems 1013 * it will be ineffective. Stuffing the RSB in context switch helps prevent user 1014 * to user attacks via the RSB. 1015 * 1016 * If SMEP is not present, then we would have to stuff the RSB every time we 1017 * transitioned from user mode to the kernel, which isn't very practical right 1018 * now. 1019 * 1020 * To fully protect user to user and vmx to vmx attacks from these classes of 1021 * issues, we would also need to allow them to opt into performing an Indirect 1022 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up. 1023 * 1024 * By default, the system will enable RSB stuffing and the required variant of 1025 * retpolines and store that information in the x86_spectrev2_mitigation value. 1026 * This will be evaluated after a microcode update as well, though it is 1027 * expected that microcode updates will not take away features. This may mean 1028 * that a late loaded microcode may not end up in the optimal configuration 1029 * (though this should be rare). 1030 * 1031 * Currently we do not build kmdb with retpolines or perform any additional side 1032 * channel security mitigations for it. One complication with kmdb is that it 1033 * requires its own retpoline thunks and it would need to adjust itself based on 1034 * what the kernel does. The threat model of kmdb is more limited and therefore 1035 * it may make more sense to investigate using prediction barriers as the whole 1036 * system is only executing a single instruction at a time while in kmdb. 1037 * 1038 * SPECTRE FAMILY (v1, v4) 1039 * 1040 * The v1 and v4 variants of spectre are not currently mitigated in the 1041 * system and require other classes of changes to occur in the code. 1042 * 1043 * MELTDOWN 1044 * 1045 * Meltdown, or spectre v3, allowed a user process to read any data in their 1046 * address space regardless of whether or not the page tables in question 1047 * allowed the user to have the ability to read them. The solution to meltdown 1048 * is kernel page table isolation. In this world, there are two page tables that 1049 * are used for a process, one in user land and one in the kernel. To implement 1050 * this we use per-CPU page tables and switch between the user and kernel 1051 * variants when entering and exiting the kernel. 
For more information about 1052 * this process and how the trampolines work, please see the big theory 1053 * statements and additional comments in: 1054 * 1055 * - uts/i86pc/ml/kpti_trampolines.s 1056 * - uts/i86pc/vm/hat_i86.c 1057 * 1058 * While Meltdown only impacted Intel systems and there are also Intel systems 1059 * that have Meltdown fixed (called Rogue Data Cache Load), we always have 1060 * kernel page table isolation enabled. While this may at first seem weird, an 1061 * important thing to remember is that you can't speculatively read an address 1062 * if it's never in your page table at all. Having user processes without kernel 1063 * pages present provides us with an important layer of defense in the kernel 1064 * against any other side channel attacks that exist and have yet to be 1065 * discovered. As such, kernel page table isolation (KPTI) is always enabled by 1066 * default, no matter the x86 system. 1067 * 1068 * L1 TERMINAL FAULT 1069 * 1070 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative 1071 * execution uses page table entries. Effectively, it is two different problems. 1072 * The first is that it ignores the not present bit in the page table entries 1073 * when performing speculative execution. This means that something can 1074 * speculatively read the listed physical address if it's present in the L1 1075 * cache under certain conditions (see Intel's documentation for the full set of 1076 * conditions). Secondly, this can be used to bypass hardware virtualization 1077 * extended page tables (EPT) that are part of Intel's hardware virtual machine 1078 * instructions. 1079 * 1080 * For the non-hardware virtualized case, this is relatively easy to deal with. 1081 * We must make sure that all unmapped pages have an address of zero. This means 1082 * that they could read the first 4k of physical memory; however, we never use 1083 * that first page in the operating system and always skip putting it in our 1084 * memory map, even if firmware tells us we can use it in our memory map. While 1085 * other systems try to put extra metadata in the address and reserved bits, 1086 * which led to this being problematic in those cases, we do not. 1087 * 1088 * For hardware virtual machines things are more complicated. Because they can 1089 * construct their own page tables, it isn't hard for them to perform this 1090 * attack against any physical address. The one wrinkle is that this physical 1091 * address must be in the L1 data cache. Thus Intel added an MSR that we can use 1092 * to flush the L1 data cache. We wrap this up in the function 1093 * spec_uarch_flush(). This function is also used in the mitigation of 1094 * microarchitectural data sampling (MDS) discussed later on. Kernel based 1095 * hypervisors such as KVM or bhyve are responsible for performing this before 1096 * entering the guest. 1097 * 1098 * Because this attack takes place in the L1 cache, there's another wrinkle 1099 * here. The L1 cache is shared between all logical CPUs in a core in most Intel 1100 * designs. This means that when a thread enters a hardware virtualized context 1101 * and flushes the L1 data cache, the other thread on the processor may then go 1102 * ahead and put new data in it that can be potentially attacked. While one 1103 * solution is to disable SMT on the system, another option that is available is 1104 * to use a feature for hardware virtualization called 'SMT exclusion'. 
This 1105 * goes through and makes sure that if an HVM is being scheduled on one thread, 1106 * then whatever is on the other thread is from the same hardware virtual machine. 1107 * If an interrupt comes in or the guest exits to the broader system, then the 1108 * other SMT thread will be kicked out. 1109 * 1110 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the 1111 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not 1112 * perform L1TF related mitigations. 1113 * 1114 * MICROARCHITECTURAL DATA SAMPLING 1115 * 1116 * Microarchitectural data sampling (MDS) is a combination of four discrete 1117 * vulnerabilities that are similar issues affecting various parts of the CPU's 1118 * microarchitectural implementation around load, store, and fill buffers. 1119 * Specifically it is made up of the following subcomponents: 1120 * 1121 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS) 1122 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS) 1123 * 3. Microarchitectural Load Port Data Sampling (MLPDS) 1124 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM) 1125 * 1126 * To begin addressing these, Intel has introduced another feature in microcode 1127 * called MD_CLEAR. This changes the verw instruction to operate in a different 1128 * way. This allows us to execute the verw instruction in a particular way to 1129 * flush the state of the affected parts. The L1TF L1D flush mechanism is also 1130 * updated when this microcode is present to flush this state. 1131 * 1132 * Primarily we need to flush this state whenever we transition from the kernel 1133 * to a less privileged context such as user mode or an HVM guest. MSBDS is a 1134 * little bit different. Here the structures are statically sized when a logical 1135 * CPU is in use and resized when it goes to sleep. Therefore, we also need to 1136 * flush the microarchitectural state before the CPU goes idle by calling hlt, 1137 * mwait, or another ACPI method. To perform these flushes, we call 1138 * x86_md_clear() at all of these transition points. 1139 * 1140 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF, 1141 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If 1142 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes 1143 * a no-op. 1144 * 1145 * Unfortunately, with this issue hyperthreading rears its ugly head. In 1146 * particular, everything we've discussed above is only valid for a single 1147 * thread executing on a core. In the case where you have hyper-threading 1148 * present, this attack can be performed between threads. The theoretical fix 1149 * for this is to ensure that both threads are always in the same security 1150 * domain. This means that they are executing in the same ring and mutually 1151 * trust each other. Practically speaking, this would mean that a system call 1152 * would have to issue an inter-processor interrupt (IPI) to the other thread. 1153 * Rather than implement this, we recommend disabling hyper-threading 1154 * through the use of psradm -aS.
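 *
 * As a rough sketch of how the transition points above compose (the helper
 * below is hypothetical and purely illustrative; the real invocations live
 * in the kernel's return-to-user, idle, and guest-entry paths), every edge
 * out of the kernel to a less privileged or idle context does something
 * like:
 *
 *	static void
 *	example_leave_kernel(boolean_t entering_guest)
 *	{
 *		if (entering_guest)
 *			spec_uarch_flush();
 *		else
 *			x86_md_clear();
 *	}
 *
 * where spec_uarch_flush() performs the L1D flush (and, with the MD_CLEAR
 * microcode, covers the MDS state as well) before VM entry, and
 * x86_md_clear() covers the MDS state on the way back to user mode or into
 * an idle state.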
1155 * 1156 * SUMMARY 1157 * 1158 * The following table attempts to summarize the mitigations for various issues 1159 * and what's done in various places: 1160 * 1161 * - Spectre v1: Not currently mitigated 1162 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support 1163 * - Meltdown: Kernel Page Table Isolation 1164 * - Spectre v3a: Updated CPU microcode 1165 * - Spectre v4: Not currently mitigated 1166 * - SpectreRSB: SMEP and RSB Stuffing 1167 * - L1TF: spec_uarch_flush, smt exclusion, requires microcode 1168 * - MDS: x86_md_clear, requires microcode, disabling hyper threading 1169 * 1170 * The following table indicates the x86 feature set bits that indicate that a 1171 * given problem has been solved or a notable feature is present: 1172 * 1173 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS 1174 * - MDS_NO: All forms of MDS 1175 */ 1176 1177 #include <sys/types.h> 1178 #include <sys/archsystm.h> 1179 #include <sys/x86_archext.h> 1180 #include <sys/kmem.h> 1181 #include <sys/systm.h> 1182 #include <sys/cmn_err.h> 1183 #include <sys/sunddi.h> 1184 #include <sys/sunndi.h> 1185 #include <sys/cpuvar.h> 1186 #include <sys/processor.h> 1187 #include <sys/sysmacros.h> 1188 #include <sys/pg.h> 1189 #include <sys/fp.h> 1190 #include <sys/controlregs.h> 1191 #include <sys/bitmap.h> 1192 #include <sys/auxv_386.h> 1193 #include <sys/memnode.h> 1194 #include <sys/pci_cfgspace.h> 1195 #include <sys/comm_page.h> 1196 #include <sys/mach_mmu.h> 1197 #include <sys/ucode.h> 1198 #include <sys/tsc.h> 1199 #include <sys/kobj.h> 1200 #include <sys/asm_misc.h> 1201 1202 #ifdef __xpv 1203 #include <sys/hypervisor.h> 1204 #else 1205 #include <sys/ontrap.h> 1206 #endif 1207 1208 uint_t x86_vendor = X86_VENDOR_IntelClone; 1209 uint_t x86_type = X86_TYPE_OTHER; 1210 uint_t x86_clflush_size = 0; 1211 1212 #if defined(__xpv) 1213 int x86_use_pcid = 0; 1214 int x86_use_invpcid = 0; 1215 #else 1216 int x86_use_pcid = -1; 1217 int x86_use_invpcid = -1; 1218 #endif 1219 1220 typedef enum { 1221 X86_SPECTREV2_RETPOLINE, 1222 X86_SPECTREV2_RETPOLINE_AMD, 1223 X86_SPECTREV2_ENHANCED_IBRS, 1224 X86_SPECTREV2_DISABLED 1225 } x86_spectrev2_mitigation_t; 1226 1227 uint_t x86_disable_spectrev2 = 0; 1228 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation = 1229 X86_SPECTREV2_RETPOLINE; 1230 1231 uint_t pentiumpro_bug4046376; 1232 1233 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)]; 1234 1235 static char *x86_feature_names[NUM_X86_FEATURES] = { 1236 "lgpg", 1237 "tsc", 1238 "msr", 1239 "mtrr", 1240 "pge", 1241 "de", 1242 "cmov", 1243 "mmx", 1244 "mca", 1245 "pae", 1246 "cv8", 1247 "pat", 1248 "sep", 1249 "sse", 1250 "sse2", 1251 "htt", 1252 "asysc", 1253 "nx", 1254 "sse3", 1255 "cx16", 1256 "cmp", 1257 "tscp", 1258 "mwait", 1259 "sse4a", 1260 "cpuid", 1261 "ssse3", 1262 "sse4_1", 1263 "sse4_2", 1264 "1gpg", 1265 "clfsh", 1266 "64", 1267 "aes", 1268 "pclmulqdq", 1269 "xsave", 1270 "avx", 1271 "vmx", 1272 "svm", 1273 "topoext", 1274 "f16c", 1275 "rdrand", 1276 "x2apic", 1277 "avx2", 1278 "bmi1", 1279 "bmi2", 1280 "fma", 1281 "smep", 1282 "smap", 1283 "adx", 1284 "rdseed", 1285 "mpx", 1286 "avx512f", 1287 "avx512dq", 1288 "avx512pf", 1289 "avx512er", 1290 "avx512cd", 1291 "avx512bw", 1292 "avx512vl", 1293 "avx512fma", 1294 "avx512vbmi", 1295 "avx512_vpopcntdq", 1296 "avx512_4vnniw", 1297 "avx512_4fmaps", 1298 "xsaveopt", 1299 "xsavec", 1300 "xsaves", 1301 "sha", 1302 "umip", 1303 "pku", 1304 "ospke", 1305 "pcid", 1306 "invpcid", 1307 "ibrs", 1308 "ibpb", 1309 "stibp", 1310 "ssbd", 1311 "ssbd_virt", 1312 
"rdcl_no", 1313 "ibrs_all", 1314 "rsba", 1315 "ssb_no", 1316 "stibp_all", 1317 "flush_cmd", 1318 "l1d_vmentry_no", 1319 "fsgsbase", 1320 "clflushopt", 1321 "clwb", 1322 "monitorx", 1323 "clzero", 1324 "xop", 1325 "fma4", 1326 "tbm", 1327 "avx512_vnni", 1328 "amd_pcec", 1329 "mb_clear", 1330 "mds_no", 1331 "core_thermal", 1332 "pkg_thermal" 1333 }; 1334 1335 boolean_t 1336 is_x86_feature(void *featureset, uint_t feature) 1337 { 1338 ASSERT(feature < NUM_X86_FEATURES); 1339 return (BT_TEST((ulong_t *)featureset, feature)); 1340 } 1341 1342 void 1343 add_x86_feature(void *featureset, uint_t feature) 1344 { 1345 ASSERT(feature < NUM_X86_FEATURES); 1346 BT_SET((ulong_t *)featureset, feature); 1347 } 1348 1349 void 1350 remove_x86_feature(void *featureset, uint_t feature) 1351 { 1352 ASSERT(feature < NUM_X86_FEATURES); 1353 BT_CLEAR((ulong_t *)featureset, feature); 1354 } 1355 1356 boolean_t 1357 compare_x86_featureset(void *setA, void *setB) 1358 { 1359 /* 1360 * We assume that the unused bits of the bitmap are always zero. 1361 */ 1362 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) { 1363 return (B_TRUE); 1364 } else { 1365 return (B_FALSE); 1366 } 1367 } 1368 1369 void 1370 print_x86_featureset(void *featureset) 1371 { 1372 uint_t i; 1373 1374 for (i = 0; i < NUM_X86_FEATURES; i++) { 1375 if (is_x86_feature(featureset, i)) { 1376 cmn_err(CE_CONT, "?x86_feature: %s\n", 1377 x86_feature_names[i]); 1378 } 1379 } 1380 } 1381 1382 /* Note: This is the maximum size for the CPU, not the size of the structure. */ 1383 static size_t xsave_state_size = 0; 1384 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE); 1385 boolean_t xsave_force_disable = B_FALSE; 1386 extern int disable_smap; 1387 1388 /* 1389 * This is set to platform type we are running on. 1390 */ 1391 static int platform_type = -1; 1392 1393 #if !defined(__xpv) 1394 /* 1395 * Variable to patch if hypervisor platform detection needs to be 1396 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0). 1397 */ 1398 int enable_platform_detection = 1; 1399 #endif 1400 1401 /* 1402 * monitor/mwait info. 1403 * 1404 * size_actual and buf_actual are the real address and size allocated to get 1405 * proper mwait_buf alignement. buf_actual and size_actual should be passed 1406 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use 1407 * processor cache-line alignment, but this is not guarantied in the furture. 1408 */ 1409 struct mwait_info { 1410 size_t mon_min; /* min size to avoid missed wakeups */ 1411 size_t mon_max; /* size to avoid false wakeups */ 1412 size_t size_actual; /* size actually allocated */ 1413 void *buf_actual; /* memory actually allocated */ 1414 uint32_t support; /* processor support of monitor/mwait */ 1415 }; 1416 1417 /* 1418 * xsave/xrestor info. 1419 * 1420 * This structure contains HW feature bits and the size of the xsave save area. 1421 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure 1422 * (xsave_state) to describe the xsave layout. However, at runtime the 1423 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The 1424 * xsave_state structure simply represents the legacy layout of the beginning 1425 * of the xsave area. 
1426 */ 1427 struct xsave_info { 1428 uint32_t xsav_hw_features_low; /* Supported HW features */ 1429 uint32_t xsav_hw_features_high; /* Supported HW features */ 1430 size_t xsav_max_size; /* max size save area for HW features */ 1431 size_t ymm_size; /* AVX: size of ymm save area */ 1432 size_t ymm_offset; /* AVX: offset for ymm save area */ 1433 size_t bndregs_size; /* MPX: size of bndregs save area */ 1434 size_t bndregs_offset; /* MPX: offset for bndregs save area */ 1435 size_t bndcsr_size; /* MPX: size of bndcsr save area */ 1436 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */ 1437 size_t opmask_size; /* AVX512: size of opmask save */ 1438 size_t opmask_offset; /* AVX512: offset for opmask save */ 1439 size_t zmmlo_size; /* AVX512: size of zmm 256 save */ 1440 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */ 1441 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */ 1442 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */ 1443 }; 1444 1445 1446 /* 1447 * These constants determine how many of the elements of the 1448 * cpuid we cache in the cpuid_info data structure; the 1449 * remaining elements are accessible via the cpuid instruction. 1450 */ 1451 1452 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */ 1453 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */ 1454 1455 /* 1456 * See the big theory statement for a more detailed explanation of what some of 1457 * these members mean. 1458 */ 1459 struct cpuid_info { 1460 uint_t cpi_pass; /* last pass completed */ 1461 /* 1462 * standard function information 1463 */ 1464 uint_t cpi_maxeax; /* fn 0: %eax */ 1465 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */ 1466 uint_t cpi_vendor; /* enum of cpi_vendorstr */ 1467 1468 uint_t cpi_family; /* fn 1: extended family */ 1469 uint_t cpi_model; /* fn 1: extended model */ 1470 uint_t cpi_step; /* fn 1: stepping */ 1471 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */ 1472 /* AMD: package/socket # */ 1473 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */ 1474 int cpi_clogid; /* fn 1: %ebx: thread # */ 1475 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */ 1476 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */ 1477 uint_t cpi_ncache; /* fn 2: number of elements */ 1478 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */ 1479 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */ 1480 uint_t cpi_cache_leaf_size; /* Number of cache elements */ 1481 /* Intel fn: 4, AMD fn: 8000001d */ 1482 struct cpuid_regs **cpi_cache_leaves; /* Acual leaves from above */ 1483 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */ 1484 /* 1485 * extended function information 1486 */ 1487 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */ 1488 char cpi_brandstr[49]; /* fn 0x8000000[234] */ 1489 uint8_t cpi_pabits; /* fn 0x80000006: %eax */ 1490 uint8_t cpi_vabits; /* fn 0x80000006: %eax */ 1491 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */ 1492 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */ 1493 1494 id_t cpi_coreid; /* same coreid => strands share core */ 1495 int cpi_pkgcoreid; /* core number within single package */ 1496 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */ 1497 /* Intel: fn 4: %eax[31-26] */ 1498 1499 /* 1500 * These values represent the number of bits that are required to store 1501 * information about the number of cores and threads. 
1502 */ 1503 uint_t cpi_ncore_bits; 1504 uint_t cpi_nthread_bits; 1505 /* 1506 * supported feature information 1507 */ 1508 uint32_t cpi_support[6]; 1509 #define STD_EDX_FEATURES 0 1510 #define AMD_EDX_FEATURES 1 1511 #define TM_EDX_FEATURES 2 1512 #define STD_ECX_FEATURES 3 1513 #define AMD_ECX_FEATURES 4 1514 #define STD_EBX_FEATURES 5 1515 /* 1516 * Synthesized information, where known. 1517 */ 1518 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */ 1519 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */ 1520 uint32_t cpi_socket; /* Chip package/socket type */ 1521 1522 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */ 1523 uint32_t cpi_apicid; 1524 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */ 1525 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */ 1526 /* Intel: 1 */ 1527 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */ 1528 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */ 1529 1530 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */ 1531 }; 1532 1533 1534 static struct cpuid_info cpuid_info0; 1535 1536 /* 1537 * These bit fields are defined by the Intel Application Note AP-485 1538 * "Intel Processor Identification and the CPUID Instruction" 1539 */ 1540 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20) 1541 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16) 1542 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12) 1543 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8) 1544 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0) 1545 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4) 1546 1547 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx) 1548 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx) 1549 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx) 1550 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx) 1551 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx) 1552 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx) 1553 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx) 1554 1555 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0) 1556 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7) 1557 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16) 1558 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24) 1559 1560 #define CPI_MAXEAX_MAX 0x100 /* sanity control */ 1561 #define CPI_XMAXEAX_MAX 0x80000100 1562 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */ 1563 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */ 1564 1565 /* 1566 * Function 4 (Deterministic Cache Parameters) macros 1567 * Defined by Intel Application Note AP-485 1568 */ 1569 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26) 1570 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14) 1571 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9) 1572 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8) 1573 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5) 1574 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0) 1575 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8) 1576 1577 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22) 1578 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12) 1579 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0) 1580 1581 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0) 1582 1583 #define CPI_PREFCH_STRIDE(regs) 
BITX((regs)->cp_edx, 9, 0) 1584 1585 1586 /* 1587 * A couple of shorthand macros to identify "later" P6-family chips 1588 * like the Pentium M and Core. First, the "older" P6-based stuff 1589 * (loosely defined as "pre-Pentium-4"): 1590 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon 1591 */ 1592 #define IS_LEGACY_P6(cpi) ( \ 1593 cpi->cpi_family == 6 && \ 1594 (cpi->cpi_model == 1 || \ 1595 cpi->cpi_model == 3 || \ 1596 cpi->cpi_model == 5 || \ 1597 cpi->cpi_model == 6 || \ 1598 cpi->cpi_model == 7 || \ 1599 cpi->cpi_model == 8 || \ 1600 cpi->cpi_model == 0xA || \ 1601 cpi->cpi_model == 0xB) \ 1602 ) 1603 1604 /* A "new F6" is everything with family 6 that's not the above */ 1605 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi)) 1606 1607 /* Extended family/model support */ 1608 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \ 1609 cpi->cpi_family >= 0xf) 1610 1611 /* 1612 * Info for monitor/mwait idle loop. 1613 * 1614 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's 1615 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November 1616 * 2006. 1617 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual 1618 * Documentation Updates" #33633, Rev 2.05, December 2006. 1619 */ 1620 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */ 1621 #define MWAIT_EXTENSIONS (0x00000002) /* extenstion supported */ 1622 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */ 1623 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON) 1624 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2) 1625 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1) 1626 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0) 1627 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0) 1628 /* 1629 * Number of sub-cstates for a given c-state. 1630 */ 1631 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \ 1632 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state) 1633 1634 /* 1635 * XSAVE leaf 0xD enumeration 1636 */ 1637 #define CPUID_LEAFD_2_YMM_OFFSET 576 1638 #define CPUID_LEAFD_2_YMM_SIZE 256 1639 1640 /* 1641 * Common extended leaf names to cut down on typos. 1642 */ 1643 #define CPUID_LEAF_EXT_0 0x80000000 1644 #define CPUID_LEAF_EXT_8 0x80000008 1645 #define CPUID_LEAF_EXT_1d 0x8000001d 1646 #define CPUID_LEAF_EXT_1e 0x8000001e 1647 1648 /* 1649 * Functions we consune from cpuid_subr.c; don't publish these in a header 1650 * file to try and keep people using the expected cpuid_* interfaces. 1651 */ 1652 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t); 1653 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t); 1654 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t); 1655 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t); 1656 extern uint_t _cpuid_vendorstr_to_vendorcode(char *); 1657 1658 /* 1659 * Apply up various platform-dependent restrictions where the 1660 * underlying platform restrictions mean the CPU can be marked 1661 * as less capable than its cpuid instruction would imply. 1662 */ 1663 #if defined(__xpv) 1664 static void 1665 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp) 1666 { 1667 switch (eax) { 1668 case 1: { 1669 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ? 
1670 0 : CPUID_INTC_EDX_MCA; 1671 cp->cp_edx &= 1672 ~(mcamask | 1673 CPUID_INTC_EDX_PSE | 1674 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1675 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR | 1676 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT | 1677 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1678 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT); 1679 break; 1680 } 1681 1682 case 0x80000001: 1683 cp->cp_edx &= 1684 ~(CPUID_AMD_EDX_PSE | 1685 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1686 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE | 1687 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 | 1688 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1689 CPUID_AMD_EDX_TSCP); 1690 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY; 1691 break; 1692 default: 1693 break; 1694 } 1695 1696 switch (vendor) { 1697 case X86_VENDOR_Intel: 1698 switch (eax) { 1699 case 4: 1700 /* 1701 * Zero out the (ncores-per-chip - 1) field 1702 */ 1703 cp->cp_eax &= 0x03fffffff; 1704 break; 1705 default: 1706 break; 1707 } 1708 break; 1709 case X86_VENDOR_AMD: 1710 switch (eax) { 1711 1712 case 0x80000001: 1713 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D; 1714 break; 1715 1716 case CPUID_LEAF_EXT_8: 1717 /* 1718 * Zero out the (ncores-per-chip - 1) field 1719 */ 1720 cp->cp_ecx &= 0xffffff00; 1721 break; 1722 default: 1723 break; 1724 } 1725 break; 1726 default: 1727 break; 1728 } 1729 } 1730 #else 1731 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */ 1732 #endif 1733 1734 /* 1735 * Some undocumented ways of patching the results of the cpuid 1736 * instruction to permit running Solaris 10 on future cpus that 1737 * we don't currently support. Could be set to non-zero values 1738 * via settings in eeprom. 1739 */ 1740 1741 uint32_t cpuid_feature_ecx_include; 1742 uint32_t cpuid_feature_ecx_exclude; 1743 uint32_t cpuid_feature_edx_include; 1744 uint32_t cpuid_feature_edx_exclude; 1745 1746 /* 1747 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs. 1748 */ 1749 void 1750 cpuid_alloc_space(cpu_t *cpu) 1751 { 1752 /* 1753 * By convention, cpu0 is the boot cpu, which is set up 1754 * before memory allocation is available. All other cpus get 1755 * their cpuid_info struct allocated here. 1756 */ 1757 ASSERT(cpu->cpu_id != 0); 1758 ASSERT(cpu->cpu_m.mcpu_cpi == NULL); 1759 cpu->cpu_m.mcpu_cpi = 1760 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP); 1761 } 1762 1763 void 1764 cpuid_free_space(cpu_t *cpu) 1765 { 1766 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 1767 int i; 1768 1769 ASSERT(cpi != NULL); 1770 ASSERT(cpi != &cpuid_info0); 1771 1772 /* 1773 * Free up any cache leaf related dynamic storage. The first entry was 1774 * cached from the standard cpuid storage, so we should not free it. 1775 */ 1776 for (i = 1; i < cpi->cpi_cache_leaf_size; i++) 1777 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs)); 1778 if (cpi->cpi_cache_leaf_size > 0) 1779 kmem_free(cpi->cpi_cache_leaves, 1780 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *)); 1781 1782 kmem_free(cpi, sizeof (*cpi)); 1783 cpu->cpu_m.mcpu_cpi = NULL; 1784 } 1785 1786 #if !defined(__xpv) 1787 /* 1788 * Determine the type of the underlying platform. This is used to customize 1789 * initialization of various subsystems (e.g. TSC). determine_platform() must 1790 * only ever be called once to prevent two processors from seeing different 1791 * values of platform_type. Must be called before cpuid_pass1(), the earliest 1792 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv). 
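 *
 * As a brief illustration of the hypervisor check performed below: leaf
 * 0x40000000 returns a 12-byte vendor signature in %ebx:%ecx:%edx and the
 * maximum hypervisor leaf in %eax, so the detection is roughly equivalent
 * to the following sketch (not a separate interface):
 *
 *	struct cpuid_regs cp = { 0 };
 *	char sig[13];
 *
 *	cp.cp_eax = 0x40000000;
 *	(void) __cpuid_insn(&cp);
 *	bcopy(&cp.cp_ebx, &sig[0], 4);
 *	bcopy(&cp.cp_ecx, &sig[4], 4);
 *	bcopy(&cp.cp_edx, &sig[8], 4);
 *	sig[12] = '\0';
 *
 * after which sig is compared against the HVSIG_* strings (for example, KVM
 * advertises "KVMKVMKVM" and VMware "VMwareVMware").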
1793 */ 1794 void 1795 determine_platform(void) 1796 { 1797 struct cpuid_regs cp; 1798 uint32_t base; 1799 uint32_t regs[4]; 1800 char *hvstr = (char *)regs; 1801 1802 ASSERT(platform_type == -1); 1803 1804 platform_type = HW_NATIVE; 1805 1806 if (!enable_platform_detection) 1807 return; 1808 1809 /* 1810 * If Hypervisor CPUID bit is set, try to determine hypervisor 1811 * vendor signature, and set platform type accordingly. 1812 * 1813 * References: 1814 * http://lkml.org/lkml/2008/10/1/246 1815 * http://kb.vmware.com/kb/1009458 1816 */ 1817 cp.cp_eax = 0x1; 1818 (void) __cpuid_insn(&cp); 1819 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) { 1820 cp.cp_eax = 0x40000000; 1821 (void) __cpuid_insn(&cp); 1822 regs[0] = cp.cp_ebx; 1823 regs[1] = cp.cp_ecx; 1824 regs[2] = cp.cp_edx; 1825 regs[3] = 0; 1826 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) { 1827 platform_type = HW_XEN_HVM; 1828 return; 1829 } 1830 if (strcmp(hvstr, HVSIG_VMWARE) == 0) { 1831 platform_type = HW_VMWARE; 1832 return; 1833 } 1834 if (strcmp(hvstr, HVSIG_KVM) == 0) { 1835 platform_type = HW_KVM; 1836 return; 1837 } 1838 if (strcmp(hvstr, HVSIG_BHYVE) == 0) { 1839 platform_type = HW_BHYVE; 1840 return; 1841 } 1842 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) 1843 platform_type = HW_MICROSOFT; 1844 } else { 1845 /* 1846 * Check older VMware hardware versions. VMware hypervisor is 1847 * detected by performing an IN operation to VMware hypervisor 1848 * port and checking that value returned in %ebx is VMware 1849 * hypervisor magic value. 1850 * 1851 * References: http://kb.vmware.com/kb/1009458 1852 */ 1853 vmware_port(VMWARE_HVCMD_GETVERSION, regs); 1854 if (regs[1] == VMWARE_HVMAGIC) { 1855 platform_type = HW_VMWARE; 1856 return; 1857 } 1858 } 1859 1860 /* 1861 * Check Xen hypervisor. In a fully virtualized domain, 1862 * Xen's pseudo-cpuid function returns a string representing the 1863 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum 1864 * supported cpuid function. We need at least a (base + 2) leaf value 1865 * to do what we want to do. Try different base values, since the 1866 * hypervisor might use a different one depending on whether Hyper-V 1867 * emulation is switched on by default or not. 1868 */ 1869 for (base = 0x40000000; base < 0x40010000; base += 0x100) { 1870 cp.cp_eax = base; 1871 (void) __cpuid_insn(&cp); 1872 regs[0] = cp.cp_ebx; 1873 regs[1] = cp.cp_ecx; 1874 regs[2] = cp.cp_edx; 1875 regs[3] = 0; 1876 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 && 1877 cp.cp_eax >= (base + 2)) { 1878 platform_type &= ~HW_NATIVE; 1879 platform_type |= HW_XEN_HVM; 1880 return; 1881 } 1882 } 1883 } 1884 1885 int 1886 get_hwenv(void) 1887 { 1888 ASSERT(platform_type != -1); 1889 return (platform_type); 1890 } 1891 1892 int 1893 is_controldom(void) 1894 { 1895 return (0); 1896 } 1897 1898 #else 1899 1900 int 1901 get_hwenv(void) 1902 { 1903 return (HW_XEN_PV); 1904 } 1905 1906 int 1907 is_controldom(void) 1908 { 1909 return (DOMAIN_IS_INITDOMAIN(xen_info)); 1910 } 1911 1912 #endif /* __xpv */ 1913 1914 /* 1915 * Make sure that we have gathered all of the CPUID leaves that we might need to 1916 * determine topology. We assume that the standard leaf 1 has already been done 1917 * and that xmaxeax has already been calculated. 
1918 */ 1919 static void 1920 cpuid_gather_amd_topology_leaves(cpu_t *cpu) 1921 { 1922 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 1923 1924 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 1925 struct cpuid_regs *cp; 1926 1927 cp = &cpi->cpi_extd[8]; 1928 cp->cp_eax = CPUID_LEAF_EXT_8; 1929 (void) __cpuid_insn(cp); 1930 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp); 1931 } 1932 1933 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 1934 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 1935 struct cpuid_regs *cp; 1936 1937 cp = &cpi->cpi_extd[0x1e]; 1938 cp->cp_eax = CPUID_LEAF_EXT_1e; 1939 (void) __cpuid_insn(cp); 1940 } 1941 } 1942 1943 /* 1944 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer 1945 * it to everything else. If not, and we're on an AMD system where 8000001e is 1946 * valid, then we use that. Otherwise, we fall back to the default value for the 1947 * APIC ID in leaf 1. 1948 */ 1949 static uint32_t 1950 cpuid_gather_apicid(struct cpuid_info *cpi) 1951 { 1952 /* 1953 * Leaf B changes based on the arguments to it. Because we don't cache 1954 * it, we need to gather it again. 1955 */ 1956 if (cpi->cpi_maxeax >= 0xB) { 1957 struct cpuid_regs regs; 1958 struct cpuid_regs *cp; 1959 1960 cp = &regs; 1961 cp->cp_eax = 0xB; 1962 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 1963 (void) __cpuid_insn(cp); 1964 1965 if (cp->cp_ebx != 0) { 1966 return (cp->cp_edx); 1967 } 1968 } 1969 1970 if (cpi->cpi_vendor == X86_VENDOR_AMD && 1971 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 1972 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 1973 return (cpi->cpi_extd[0x1e].cp_eax); 1974 } 1975 1976 return (CPI_APIC_ID(cpi)); 1977 } 1978 1979 /* 1980 * For AMD processors, attempt to calculate the number of chips and cores that 1981 * exist. The way that we do this varies based on the generation, because the 1982 * generations themselves have changed dramatically. 1983 * 1984 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores. 1985 * However, with the advent of family 17h (Zen) it actually tells us the number 1986 * of threads, so we need to look at leaf 0x8000001e if available to determine 1987 * its value. Otherwise, for all prior families, the number of enabled cores is 1988 * the same as threads. 1989 * 1990 * If we do not have leaf 0x80000008, then we assume that this processor does 1991 * not have anything. AMD's older CPUID specification says there's no reason to 1992 * fall back to leaf 1. 1993 * 1994 * In some virtualization cases we will not have leaf 8000001e or it will be 1995 * zero. When that happens we assume the number of threads is one. 1996 */ 1997 static void 1998 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 1999 { 2000 uint_t nthreads, nthread_per_core; 2001 2002 nthreads = nthread_per_core = 1; 2003 2004 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2005 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1; 2006 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2007 nthreads = CPI_CPU_COUNT(cpi); 2008 } 2009 2010 /* 2011 * For us to have threads, and know about it, we have to be at least at 2012 * family 17h and have the cpuid bit that says we have extended 2013 * topology.
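 *
 * For example (a sketch assuming a 2-thread-per-core Zen part): leaf
 * 0x80000008 %ecx[7:0] reports 15 on an 8-core/16-thread chip, giving
 * nthreads = 16, while leaf 0x8000001e %ebx[15:8] reports ThreadsPerCore
 * minus one, i.e. 1, giving nthread_per_core = 2 and thus
 * ncores = 16 / 2 = 8.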
2014 */ 2015 if (cpi->cpi_family >= 0x17 && 2016 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2017 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2018 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2019 } 2020 2021 *ncpus = nthreads; 2022 *ncores = nthreads / nthread_per_core; 2023 } 2024 2025 /* 2026 * Seed the initial values for the cores and threads for an Intel based 2027 * processor. These values will be overwritten if we detect that the processor 2028 * supports CPUID leaf 0xb. 2029 */ 2030 static void 2031 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2032 { 2033 /* 2034 * Only seed the number of physical cores from the first level leaf 4 2035 * information. The number of threads there indicates how many share the 2036 * L1 cache, which may or may not have anything to do with the number of 2037 * logical CPUs per core. 2038 */ 2039 if (cpi->cpi_maxeax >= 4) { 2040 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1; 2041 } else { 2042 *ncores = 1; 2043 } 2044 2045 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2046 *ncpus = CPI_CPU_COUNT(cpi); 2047 } else { 2048 *ncpus = *ncores; 2049 } 2050 } 2051 2052 static boolean_t 2053 cpuid_leafB_getids(cpu_t *cpu) 2054 { 2055 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2056 struct cpuid_regs regs; 2057 struct cpuid_regs *cp; 2058 2059 if (cpi->cpi_maxeax < 0xB) 2060 return (B_FALSE); 2061 2062 cp = &regs; 2063 cp->cp_eax = 0xB; 2064 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 2065 2066 (void) __cpuid_insn(cp); 2067 2068 /* 2069 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which 2070 * indicates that the extended topology enumeration leaf is 2071 * available. 2072 */ 2073 if (cp->cp_ebx != 0) { 2074 uint32_t x2apic_id = 0; 2075 uint_t coreid_shift = 0; 2076 uint_t ncpu_per_core = 1; 2077 uint_t chipid_shift = 0; 2078 uint_t ncpu_per_chip = 1; 2079 uint_t i; 2080 uint_t level; 2081 2082 for (i = 0; i < CPI_FNB_ECX_MAX; i++) { 2083 cp->cp_eax = 0xB; 2084 cp->cp_ecx = i; 2085 2086 (void) __cpuid_insn(cp); 2087 level = CPI_CPU_LEVEL_TYPE(cp); 2088 2089 if (level == 1) { 2090 x2apic_id = cp->cp_edx; 2091 coreid_shift = BITX(cp->cp_eax, 4, 0); 2092 ncpu_per_core = BITX(cp->cp_ebx, 15, 0); 2093 } else if (level == 2) { 2094 x2apic_id = cp->cp_edx; 2095 chipid_shift = BITX(cp->cp_eax, 4, 0); 2096 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0); 2097 } 2098 } 2099 2100 /* 2101 * cpi_apicid is taken care of in cpuid_gather_apicid. 2102 */ 2103 cpi->cpi_ncpu_per_chip = ncpu_per_chip; 2104 cpi->cpi_ncore_per_chip = ncpu_per_chip / 2105 ncpu_per_core; 2106 cpi->cpi_chipid = x2apic_id >> chipid_shift; 2107 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1); 2108 cpi->cpi_coreid = x2apic_id >> coreid_shift; 2109 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2110 cpi->cpi_procnodeid = cpi->cpi_chipid; 2111 cpi->cpi_compunitid = cpi->cpi_coreid; 2112 2113 if (coreid_shift > 0 && chipid_shift > coreid_shift) { 2114 cpi->cpi_nthread_bits = coreid_shift; 2115 cpi->cpi_ncore_bits = chipid_shift - coreid_shift; 2116 } 2117 2118 return (B_TRUE); 2119 } else { 2120 return (B_FALSE); 2121 } 2122 } 2123 2124 static void 2125 cpuid_intel_getids(cpu_t *cpu, void *feature) 2126 { 2127 uint_t i; 2128 uint_t chipid_shift = 0; 2129 uint_t coreid_shift = 0; 2130 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2131 2132 /* 2133 * There are no compute units or processor nodes currently on Intel. 2134 * Always set these to one.
2135 */ 2136 cpi->cpi_procnodes_per_pkg = 1; 2137 cpi->cpi_cores_per_compunit = 1; 2138 2139 /* 2140 * If cpuid Leaf B is present, use that to try and get this information. 2141 * It will be the most accurate for Intel CPUs. 2142 */ 2143 if (cpuid_leafB_getids(cpu)) 2144 return; 2145 2146 /* 2147 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip 2148 * and ncore_per_chip. These represent the largest power of two values 2149 * that we need to cover all of the IDs in the system. Therefore, we use 2150 * those values to seed the number of bits needed to cover information 2151 * in the case when leaf B is not available. These values will probably 2152 * be larger than required, but that's OK. 2153 */ 2154 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip); 2155 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip); 2156 2157 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1) 2158 chipid_shift++; 2159 2160 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift; 2161 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1); 2162 2163 if (is_x86_feature(feature, X86FSET_CMP)) { 2164 /* 2165 * Multi-core (and possibly multi-threaded) 2166 * processors. 2167 */ 2168 uint_t ncpu_per_core; 2169 if (cpi->cpi_ncore_per_chip == 1) 2170 ncpu_per_core = cpi->cpi_ncpu_per_chip; 2171 else if (cpi->cpi_ncore_per_chip > 1) 2172 ncpu_per_core = cpi->cpi_ncpu_per_chip / 2173 cpi->cpi_ncore_per_chip; 2174 /* 2175 * 8bit APIC IDs on dual core Pentiums 2176 * look like this: 2177 * 2178 * +-----------------------+------+------+ 2179 * | Physical Package ID | MC | HT | 2180 * +-----------------------+------+------+ 2181 * <------- chipid --------> 2182 * <------- coreid ---------------> 2183 * <--- clogid --> 2184 * <------> 2185 * pkgcoreid 2186 * 2187 * Where the number of bits necessary to 2188 * represent MC and HT fields together equals 2189 * to the minimum number of bits necessary to 2190 * store the value of cpi->cpi_ncpu_per_chip. 2191 * Of those bits, the MC part uses the number 2192 * of bits necessary to store the value of 2193 * cpi->cpi_ncore_per_chip. 2194 */ 2195 for (i = 1; i < ncpu_per_core; i <<= 1) 2196 coreid_shift++; 2197 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift; 2198 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2199 } else if (is_x86_feature(feature, X86FSET_HTT)) { 2200 /* 2201 * Single-core multi-threaded processors. 2202 */ 2203 cpi->cpi_coreid = cpi->cpi_chipid; 2204 cpi->cpi_pkgcoreid = 0; 2205 } else { 2206 /* 2207 * Single-core single-thread processors. 2208 */ 2209 cpi->cpi_coreid = cpu->cpu_id; 2210 cpi->cpi_pkgcoreid = 0; 2211 } 2212 cpi->cpi_procnodeid = cpi->cpi_chipid; 2213 cpi->cpi_compunitid = cpi->cpi_coreid; 2214 } 2215 2216 /* 2217 * Historically, AMD has had CMP chips with only a single thread per core. 2218 * However, starting in family 17h (Zen), this has changed and they now have 2219 * multiple threads. Our internal core id needs to be a unique value. 2220 * 2221 * To determine the core id of an AMD system, if we're from a family before 17h, 2222 * then we just use the cpu id, as that gives us a good value that will be 2223 * unique for each core. If instead, we're on family 17h or later, then we need 2224 * to do something more complicated. CPUID leaf 0x8000001e can tell us 2225 * how many threads are in the system. Based on that, we'll shift the APIC ID. 2226 * We can't use the normal core id in that leaf as it's only unique within the 2227 * socket, which is perfect for cpi_pkgcoreid, but not us. 
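 *
 * As a concrete (hypothetical) example: on a family 17h part with two
 * threads per core, a CPU whose APIC ID is 0x12 gets core id 0x12 >> 1 =
 * 0x9, and its SMT sibling at APIC ID 0x13 maps to that same core id. On
 * anything earlier we simply use cpu->cpu_id.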
2228 */ 2229 static id_t 2230 cpuid_amd_get_coreid(cpu_t *cpu) 2231 { 2232 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2233 2234 if (cpi->cpi_family >= 0x17 && 2235 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2236 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2237 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2238 if (nthreads > 1) { 2239 VERIFY3U(nthreads, ==, 2); 2240 return (cpi->cpi_apicid >> 1); 2241 } 2242 } 2243 2244 return (cpu->cpu_id); 2245 } 2246 2247 /* 2248 * IDs on AMD is a more challenging task. This is notable because of the 2249 * following two facts: 2250 * 2251 * 1. Before family 0x17 (Zen), there was no support for SMT and there was 2252 * also no way to get an actual unique core id from the system. As such, we 2253 * synthesize this case by using cpu->cpu_id. This scheme does not, 2254 * however, guarantee that sibling cores of a chip will have sequential 2255 * coreids starting at a multiple of the number of cores per chip - that is 2256 * usually the case, but if the ACPI MADT table is presented in a different 2257 * order then we need to perform a few more gymnastics for the pkgcoreid. 2258 * 2259 * 2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups 2260 * called compute units. These compute units share the L1I cache, L2 cache, 2261 * and the FPU. To deal with this, a new topology leaf was added in 2262 * 0x8000001e. However, parts of this leaf have different meanings 2263 * once we get to family 0x17. 2264 */ 2265 2266 static void 2267 cpuid_amd_getids(cpu_t *cpu, uchar_t *features) 2268 { 2269 int i, first_half, coreidsz; 2270 uint32_t nb_caps_reg; 2271 uint_t node2_1; 2272 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2273 struct cpuid_regs *cp; 2274 2275 /* 2276 * Calculate the core id (this comes from hardware in family 0x17 if it 2277 * hasn't been stripped by virtualization). We always set the compute 2278 * unit id to the same value. Also, initialize the default number of 2279 * cores per compute unit and nodes per package. This will be 2280 * overwritten when we know information about a particular family. 2281 */ 2282 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu); 2283 cpi->cpi_compunitid = cpi->cpi_coreid; 2284 cpi->cpi_cores_per_compunit = 1; 2285 cpi->cpi_procnodes_per_pkg = 1; 2286 2287 /* 2288 * To construct the logical ID, we need to determine how many APIC IDs 2289 * are dedicated to the cores and threads. This is provided for us in 2290 * 0x80000008. However, if it's not present (say due to virtualization), 2291 * then we assume it's one. This should be present on all 64-bit AMD 2292 * processors. It was added in family 0xf (Hammer). 2293 */ 2294 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2295 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12); 2296 2297 /* 2298 * In AMD parlance chip is really a node while illumos 2299 * uses chip as equivalent to socket/package. 2300 */ 2301 if (coreidsz == 0) { 2302 /* Use legacy method */ 2303 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1) 2304 coreidsz++; 2305 if (coreidsz == 0) 2306 coreidsz = 1; 2307 } 2308 } else { 2309 /* Assume single-core part */ 2310 coreidsz = 1; 2311 } 2312 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1); 2313 2314 /* 2315 * The package core ID varies depending on the family. While it may be 2316 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately, 2317 * this value is the core id in the given node. 
For non-virtualized 2318 * family 17h, we need to take the logical core id and shift off the 2319 * threads like we do when getting the core id. Otherwise, we can use 2320 * the clogid as is. When family 17h is virtualized, the clogid should 2321 * be sufficient as if we don't have valid data in the leaf, then we 2322 * won't think we have SMT, in which case the cpi_clogid should be 2323 * sufficient. 2324 */ 2325 if (cpi->cpi_family >= 0x17 && 2326 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2327 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e && 2328 cpi->cpi_extd[0x1e].cp_ebx != 0) { 2329 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2330 if (nthreads > 1) { 2331 VERIFY3U(nthreads, ==, 2); 2332 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1; 2333 } else { 2334 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2335 } 2336 } else { 2337 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2338 } 2339 2340 /* 2341 * Obtain the node ID and compute unit IDs. If we're on family 0x15 2342 * (bulldozer) or newer, then we can derive all of this from leaf 2343 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family. 2344 */ 2345 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2346 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2347 cp = &cpi->cpi_extd[0x1e]; 2348 2349 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1; 2350 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0); 2351 2352 /* 2353 * For Bulldozer-era CPUs, recalculate the compute unit 2354 * information. 2355 */ 2356 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) { 2357 cpi->cpi_cores_per_compunit = 2358 BITX(cp->cp_ebx, 15, 8) + 1; 2359 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) + 2360 (cpi->cpi_ncore_per_chip / 2361 cpi->cpi_cores_per_compunit) * 2362 (cpi->cpi_procnodeid / 2363 cpi->cpi_procnodes_per_pkg); 2364 } 2365 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) { 2366 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7; 2367 } else if (cpi->cpi_family == 0x10) { 2368 /* 2369 * See if we are a multi-node processor. 2370 * All processors in the system have the same number of nodes 2371 */ 2372 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8); 2373 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) { 2374 /* Single-node */ 2375 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5, 2376 coreidsz); 2377 } else { 2378 2379 /* 2380 * Multi-node revision D (2 nodes per package 2381 * are supported) 2382 */ 2383 cpi->cpi_procnodes_per_pkg = 2; 2384 2385 first_half = (cpi->cpi_pkgcoreid <= 2386 (cpi->cpi_ncore_per_chip/2 - 1)); 2387 2388 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) { 2389 /* We are BSP */ 2390 cpi->cpi_procnodeid = (first_half ? 
0 : 1); 2391 } else { 2392 2393 /* We are AP */ 2394 /* NodeId[2:1] bits to use for reading F3xe8 */ 2395 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1; 2396 2397 nb_caps_reg = 2398 pci_getl_func(0, 24 + node2_1, 3, 0xe8); 2399 2400 /* 2401 * Check IntNodeNum bit (31:30, but bit 31 is 2402 * always 0 on dual-node processors) 2403 */ 2404 if (BITX(nb_caps_reg, 30, 30) == 0) 2405 cpi->cpi_procnodeid = node2_1 + 2406 !first_half; 2407 else 2408 cpi->cpi_procnodeid = node2_1 + 2409 first_half; 2410 } 2411 } 2412 } else { 2413 cpi->cpi_procnodeid = 0; 2414 } 2415 2416 cpi->cpi_chipid = 2417 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg; 2418 2419 cpi->cpi_ncore_bits = coreidsz; 2420 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip / 2421 cpi->cpi_ncore_per_chip); 2422 } 2423 2424 static void 2425 spec_uarch_flush_noop(void) 2426 { 2427 } 2428 2429 /* 2430 * When microcode is present that mitigates MDS, this wrmsr will also flush the 2431 * MDS-related micro-architectural state that would normally happen by calling 2432 * x86_md_clear(). 2433 */ 2434 static void 2435 spec_uarch_flush_msr(void) 2436 { 2437 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); 2438 } 2439 2440 /* 2441 * This function points to a function that will flush certain 2442 * micro-architectural state on the processor. This flush is used to mitigate 2443 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This 2444 * function can point to one of three functions: 2445 * 2446 * - A noop which is done because we either are vulnerable, but do not have 2447 * microcode available to help deal with a fix, or because we aren't 2448 * vulnerable. 2449 * 2450 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to 2451 * mitigate MDS is present, also perform the equivalent of the MDS flush; 2452 * however, it only flushes the MDS related micro-architectural state on the 2453 * current hyperthread, it does not do anything for the twin. 2454 * 2455 * - x86_md_clear which will flush the MDS related state. This is done when we 2456 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF 2457 * (RDCL_NO is set). 2458 */ 2459 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop; 2460 2461 static void 2462 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset) 2463 { 2464 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2465 2466 /* 2467 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS) 2468 * has been fixed in hardware, it doesn't cover everything related to 2469 * MDS. Therefore we can only rely on MDS_NO to determine that we don't 2470 * need to mitigate this. 2471 */ 2472 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2473 is_x86_feature(featureset, X86FSET_MDS_NO)) { 2474 return; 2475 } 2476 2477 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2478 const uint8_t nop = NOP_INSTR; 2479 uint8_t *md = (uint8_t *)x86_md_clear; 2480 2481 *md = nop; 2482 } 2483 2484 membar_producer(); 2485 } 2486 2487 static void 2488 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset) 2489 { 2490 boolean_t need_l1d, need_mds; 2491 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2492 2493 /* 2494 * If we're not on Intel or we've mitigated both RDCL and MDS in 2495 * hardware, then there's nothing left for us to do for enabling the 2496 * flush. We can also go ahead and say that SMT exclusion is 2497 * unnecessary. 
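 *
 * Otherwise, as a rough summary of the selection made below:
 *
 *	L1D flush needed (!RDCL_NO && FLUSH_CMD && !L1D_VM_NO)
 *						-> spec_uarch_flush_msr
 *	else MDS flush needed (!MDS_NO && MD_CLEAR)
 *						-> x86_md_clear
 *	else					-> spec_uarch_flush_noop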
2498 */ 2499 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2500 (is_x86_feature(featureset, X86FSET_RDCL_NO) && 2501 is_x86_feature(featureset, X86FSET_MDS_NO))) { 2502 extern int smt_exclusion; 2503 smt_exclusion = 0; 2504 spec_uarch_flush = spec_uarch_flush_noop; 2505 membar_producer(); 2506 return; 2507 } 2508 2509 /* 2510 * The locations where we need to perform an L1D flush are required both 2511 * for mitigating L1TF and MDS. When verw support is present in 2512 * microcode, then the L1D flush will take care of doing that as well. 2513 * However, if we have a system where RDCL_NO is present, but we don't 2514 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full 2515 * L1D flush. 2516 */ 2517 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) && 2518 is_x86_feature(featureset, X86FSET_FLUSH_CMD) && 2519 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) { 2520 need_l1d = B_TRUE; 2521 } else { 2522 need_l1d = B_FALSE; 2523 } 2524 2525 if (!is_x86_feature(featureset, X86FSET_MDS_NO) && 2526 is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2527 need_mds = B_TRUE; 2528 } else { 2529 need_mds = B_FALSE; 2530 } 2531 2532 if (need_l1d) { 2533 spec_uarch_flush = spec_uarch_flush_msr; 2534 } else if (need_mds) { 2535 spec_uarch_flush = x86_md_clear; 2536 } else { 2537 /* 2538 * We have no hardware mitigations available to us. 2539 */ 2540 spec_uarch_flush = spec_uarch_flush_noop; 2541 } 2542 membar_producer(); 2543 } 2544 2545 /* 2546 * We default to enabling RSB mitigations. 2547 */ 2548 static void 2549 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit) 2550 { 2551 const uint8_t ret = RET_INSTR; 2552 uint8_t *stuff = (uint8_t *)x86_rsb_stuff; 2553 2554 switch (mit) { 2555 case X86_SPECTREV2_ENHANCED_IBRS: 2556 case X86_SPECTREV2_DISABLED: 2557 *stuff = ret; 2558 break; 2559 default: 2560 break; 2561 } 2562 } 2563 2564 static void 2565 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit) 2566 { 2567 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi", 2568 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13", 2569 "_r14", "_r15" }; 2570 const uint_t nthunks = ARRAY_SIZE(thunks); 2571 const char *type; 2572 uint_t i; 2573 2574 if (mit == x86_spectrev2_mitigation) 2575 return; 2576 2577 switch (mit) { 2578 case X86_SPECTREV2_RETPOLINE: 2579 type = "gen"; 2580 break; 2581 case X86_SPECTREV2_RETPOLINE_AMD: 2582 type = "amd"; 2583 break; 2584 case X86_SPECTREV2_ENHANCED_IBRS: 2585 case X86_SPECTREV2_DISABLED: 2586 type = "jmp"; 2587 break; 2588 default: 2589 panic("asked to updated retpoline state with unknown state!"); 2590 } 2591 2592 for (i = 0; i < nthunks; i++) { 2593 uintptr_t source, dest; 2594 int ssize, dsize; 2595 char sourcebuf[64], destbuf[64]; 2596 size_t len; 2597 2598 (void) snprintf(destbuf, sizeof (destbuf), 2599 "__x86_indirect_thunk%s", thunks[i]); 2600 (void) snprintf(sourcebuf, sizeof (sourcebuf), 2601 "__x86_indirect_thunk_%s%s", type, thunks[i]); 2602 2603 source = kobj_getelfsym(sourcebuf, NULL, &ssize); 2604 dest = kobj_getelfsym(destbuf, NULL, &dsize); 2605 VERIFY3U(source, !=, 0); 2606 VERIFY3U(dest, !=, 0); 2607 VERIFY3S(dsize, >=, ssize); 2608 bcopy((void *)source, (void *)dest, ssize); 2609 } 2610 } 2611 2612 static void 2613 cpuid_enable_enhanced_ibrs(void) 2614 { 2615 uint64_t val; 2616 2617 val = rdmsr(MSR_IA32_SPEC_CTRL); 2618 val |= IA32_SPEC_CTRL_IBRS; 2619 wrmsr(MSR_IA32_SPEC_CTRL, val); 2620 } 2621 2622 #ifndef __xpv 2623 /* 2624 * Determine whether or not we can use the AMD optimized retpoline 2625 * functionality. 
We use this when we know we're on an AMD system and we can 2626 * successfully verify that lfence is dispatch serializing. 2627 */ 2628 static boolean_t 2629 cpuid_use_amd_retpoline(struct cpuid_info *cpi) 2630 { 2631 uint64_t val; 2632 on_trap_data_t otd; 2633 2634 if (cpi->cpi_vendor != X86_VENDOR_AMD) 2635 return (B_FALSE); 2636 2637 /* 2638 * We need to determine whether or not lfence is serializing. It always 2639 * is on families 0xf and 0x11. On others, it's controlled by 2640 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a 2641 * crazy old family, don't try and do anything. 2642 */ 2643 if (cpi->cpi_family < 0xf) 2644 return (B_FALSE); 2645 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) 2646 return (B_TRUE); 2647 2648 /* 2649 * While it may be tempting to use get_hwenv(), there are no promises 2650 * that a hypervisor will actually declare themselves to be so in a 2651 * friendly way. As such, try to read and set the MSR. If we can then 2652 * read back the value we set (it wasn't just set to zero), then we go 2653 * for it. 2654 */ 2655 if (!on_trap(&otd, OT_DATA_ACCESS)) { 2656 val = rdmsr(MSR_AMD_DECODE_CONFIG); 2657 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH; 2658 wrmsr(MSR_AMD_DECODE_CONFIG, val); 2659 val = rdmsr(MSR_AMD_DECODE_CONFIG); 2660 } else { 2661 val = 0; 2662 } 2663 no_trap(); 2664 2665 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0) 2666 return (B_TRUE); 2667 return (B_FALSE); 2668 } 2669 #endif /* !__xpv */ 2670 2671 static void 2672 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) 2673 { 2674 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2675 x86_spectrev2_mitigation_t v2mit; 2676 2677 if (cpi->cpi_vendor == X86_VENDOR_AMD && 2678 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2679 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB) 2680 add_x86_feature(featureset, X86FSET_IBPB); 2681 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS) 2682 add_x86_feature(featureset, X86FSET_IBRS); 2683 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP) 2684 add_x86_feature(featureset, X86FSET_STIBP); 2685 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL) 2686 add_x86_feature(featureset, X86FSET_STIBP_ALL); 2687 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD) 2688 add_x86_feature(featureset, X86FSET_SSBD); 2689 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD) 2690 add_x86_feature(featureset, X86FSET_SSBD_VIRT); 2691 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO) 2692 add_x86_feature(featureset, X86FSET_SSB_NO); 2693 /* 2694 * Don't enable enhanced IBRS unless we're told that we should 2695 * prefer it and it has the same semantics as Intel. This is 2696 * split into two bits rather than a single one. 2697 */ 2698 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) && 2699 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) { 2700 add_x86_feature(featureset, X86FSET_IBRS_ALL); 2701 } 2702 2703 } else if (cpi->cpi_vendor == X86_VENDOR_Intel && 2704 cpi->cpi_maxeax >= 7) { 2705 struct cpuid_regs *ecp; 2706 ecp = &cpi->cpi_std[7]; 2707 2708 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) { 2709 add_x86_feature(featureset, X86FSET_MD_CLEAR); 2710 } 2711 2712 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) { 2713 add_x86_feature(featureset, X86FSET_IBRS); 2714 add_x86_feature(featureset, X86FSET_IBPB); 2715 } 2716 2717 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) { 2718 add_x86_feature(featureset, X86FSET_STIBP); 2719 } 2720 2721 /* 2722 * Don't read the arch caps MSR on xpv where we lack the 2723 * on_trap(). 
2724 */ 2725 #ifndef __xpv 2726 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) { 2727 on_trap_data_t otd; 2728 2729 /* 2730 * Be paranoid and assume we'll get a #GP. 2731 */ 2732 if (!on_trap(&otd, OT_DATA_ACCESS)) { 2733 uint64_t reg; 2734 2735 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES); 2736 if (reg & IA32_ARCH_CAP_RDCL_NO) { 2737 add_x86_feature(featureset, 2738 X86FSET_RDCL_NO); 2739 } 2740 if (reg & IA32_ARCH_CAP_IBRS_ALL) { 2741 add_x86_feature(featureset, 2742 X86FSET_IBRS_ALL); 2743 } 2744 if (reg & IA32_ARCH_CAP_RSBA) { 2745 add_x86_feature(featureset, 2746 X86FSET_RSBA); 2747 } 2748 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) { 2749 add_x86_feature(featureset, 2750 X86FSET_L1D_VM_NO); 2751 } 2752 if (reg & IA32_ARCH_CAP_SSB_NO) { 2753 add_x86_feature(featureset, 2754 X86FSET_SSB_NO); 2755 } 2756 if (reg & IA32_ARCH_CAP_MDS_NO) { 2757 add_x86_feature(featureset, 2758 X86FSET_MDS_NO); 2759 } 2760 } 2761 no_trap(); 2762 } 2763 #endif /* !__xpv */ 2764 2765 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD) 2766 add_x86_feature(featureset, X86FSET_SSBD); 2767 2768 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD) 2769 add_x86_feature(featureset, X86FSET_FLUSH_CMD); 2770 } 2771 2772 if (cpu->cpu_id != 0) { 2773 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) { 2774 cpuid_enable_enhanced_ibrs(); 2775 } 2776 return; 2777 } 2778 2779 /* 2780 * Go through and initialize various security mechanisms that we should 2781 * only do on a single CPU. This includes Spectre V2, L1TF, and MDS. 2782 */ 2783 2784 /* 2785 * By default we've come in with retpolines enabled. Check whether we 2786 * should disable them or enable enhanced IBRS. RSB stuffing is enabled 2787 * by default, but disabled if we are using enhanced IBRS. 2788 */ 2789 if (x86_disable_spectrev2 != 0) { 2790 v2mit = X86_SPECTREV2_DISABLED; 2791 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) { 2792 cpuid_enable_enhanced_ibrs(); 2793 v2mit = X86_SPECTREV2_ENHANCED_IBRS; 2794 #ifndef __xpv 2795 } else if (cpuid_use_amd_retpoline(cpi)) { 2796 v2mit = X86_SPECTREV2_RETPOLINE_AMD; 2797 #endif /* !__xpv */ 2798 } else { 2799 v2mit = X86_SPECTREV2_RETPOLINE; 2800 } 2801 2802 cpuid_patch_retpolines(v2mit); 2803 cpuid_patch_rsb(v2mit); 2804 x86_spectrev2_mitigation = v2mit; 2805 membar_producer(); 2806 2807 /* 2808 * We need to determine what changes are required for mitigating L1TF 2809 * and MDS. If the CPU suffers from either of them, then SMT exclusion 2810 * is required. 2811 * 2812 * If any of these are present, then we need to flush u-arch state at 2813 * various points. For MDS, we need to do so whenever we change to a 2814 * lesser privilege level or we are halting the CPU. For L1TF we need to 2815 * flush the L1D cache at VM entry. When we have microcode that handles 2816 * MDS, the L1D flush also clears the other u-arch state that the 2817 * md_clear does. 2818 */ 2819 2820 /* 2821 * Update whether or not we need to be taking explicit action against 2822 * MDS. 2823 */ 2824 cpuid_update_md_clear(cpu, featureset); 2825 2826 /* 2827 * Determine whether SMT exclusion is required and whether or not we 2828 * need to perform an l1d flush. 2829 */ 2830 cpuid_update_l1d_flush(cpu, featureset); 2831 } 2832 2833 /* 2834 * Setup XFeature_Enabled_Mask register. Required by xsave feature. 
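 *
 * For example, on a CPU with x87, SSE, and AVX but no AVX-512, the value
 * computed below is XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX,
 * i.e. bits 0, 1, and 2 of XCR0 (0x7), written via
 * set_xcr(XFEATURE_ENABLED_MASK, flags) and mirrored into xsave_bv_all.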
2835 */ 2836 void 2837 setup_xfem(void) 2838 { 2839 uint64_t flags = XFEATURE_LEGACY_FP; 2840 2841 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE)); 2842 2843 if (is_x86_feature(x86_featureset, X86FSET_SSE)) 2844 flags |= XFEATURE_SSE; 2845 2846 if (is_x86_feature(x86_featureset, X86FSET_AVX)) 2847 flags |= XFEATURE_AVX; 2848 2849 if (is_x86_feature(x86_featureset, X86FSET_AVX512F)) 2850 flags |= XFEATURE_AVX512; 2851 2852 set_xcr(XFEATURE_ENABLED_MASK, flags); 2853 2854 xsave_bv_all = flags; 2855 } 2856 2857 static void 2858 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset) 2859 { 2860 struct cpuid_info *cpi; 2861 2862 cpi = cpu->cpu_m.mcpu_cpi; 2863 2864 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 2865 cpuid_gather_amd_topology_leaves(cpu); 2866 } 2867 2868 cpi->cpi_apicid = cpuid_gather_apicid(cpi); 2869 2870 /* 2871 * Before we can calculate the IDs that we should assign to this 2872 * processor, we need to understand how many cores and threads it has. 2873 */ 2874 switch (cpi->cpi_vendor) { 2875 case X86_VENDOR_Intel: 2876 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip, 2877 &cpi->cpi_ncore_per_chip); 2878 break; 2879 case X86_VENDOR_AMD: 2880 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip, 2881 &cpi->cpi_ncore_per_chip); 2882 break; 2883 default: 2884 /* 2885 * If we have some other x86 compatible chip, it's not clear how 2886 * they would behave. The most common case is virtualization 2887 * today, though there are also 64-bit VIA chips. Assume that 2888 * all we can get is the basic Leaf 1 HTT information. 2889 */ 2890 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2891 cpi->cpi_ncore_per_chip = 1; 2892 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi); 2893 } 2894 break; 2895 } 2896 2897 /* 2898 * Based on the calculated number of threads and cores, potentially 2899 * assign the HTT and CMT features. 2900 */ 2901 if (cpi->cpi_ncore_per_chip > 1) { 2902 add_x86_feature(featureset, X86FSET_CMP); 2903 } 2904 2905 if (cpi->cpi_ncpu_per_chip > 1 && 2906 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) { 2907 add_x86_feature(featureset, X86FSET_HTT); 2908 } 2909 2910 /* 2911 * Now that has been set up, we need to go through and calculate all of 2912 * the rest of the parameters that exist. If we think the CPU doesn't 2913 * have either SMT (HTT) or CMP, then we basically go through and fake 2914 * up information in some way. The most likely case for this is 2915 * virtualization where we have a lot of partial topology information. 2916 */ 2917 if (!is_x86_feature(featureset, X86FSET_HTT) && 2918 !is_x86_feature(featureset, X86FSET_CMP)) { 2919 /* 2920 * This is a single core, single-threaded processor. 2921 */ 2922 cpi->cpi_procnodes_per_pkg = 1; 2923 cpi->cpi_cores_per_compunit = 1; 2924 cpi->cpi_compunitid = 0; 2925 cpi->cpi_chipid = -1; 2926 cpi->cpi_clogid = 0; 2927 cpi->cpi_coreid = cpu->cpu_id; 2928 cpi->cpi_pkgcoreid = 0; 2929 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 2930 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0); 2931 } else { 2932 cpi->cpi_procnodeid = cpi->cpi_chipid; 2933 } 2934 } else { 2935 switch (cpi->cpi_vendor) { 2936 case X86_VENDOR_Intel: 2937 cpuid_intel_getids(cpu, featureset); 2938 break; 2939 case X86_VENDOR_AMD: 2940 cpuid_amd_getids(cpu, featureset); 2941 break; 2942 default: 2943 /* 2944 * In this case, it's hard to say what we should do. 2945 * We're going to model them to the OS as single core 2946 * threads. We don't have a good identifier for them, so 2947 * we're just going to use the cpu id all on a single 2948 * chip. 
2949 * 2950 * This case has historically been different from the 2951 * case above where we don't have HTT or CMP. While they 2952 * could be combined, we've opted to keep it separate to 2953 * minimize the risk of topology changes in weird cases. 2954 */ 2955 cpi->cpi_procnodes_per_pkg = 1; 2956 cpi->cpi_cores_per_compunit = 1; 2957 cpi->cpi_chipid = 0; 2958 cpi->cpi_coreid = cpu->cpu_id; 2959 cpi->cpi_clogid = cpu->cpu_id; 2960 cpi->cpi_pkgcoreid = cpu->cpu_id; 2961 cpi->cpi_procnodeid = cpi->cpi_chipid; 2962 cpi->cpi_compunitid = cpi->cpi_coreid; 2963 break; 2964 } 2965 } 2966 } 2967 2968 /* 2969 * Gather relevant CPU features from leaf 6 which covers thermal information. We 2970 * always gather leaf 6 if it's supported; however, we only look for features on 2971 * Intel systems as AMD does not currently define any of the features we look 2972 * for below. 2973 */ 2974 static void 2975 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset) 2976 { 2977 struct cpuid_regs *cp; 2978 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2979 2980 if (cpi->cpi_maxeax < 6) { 2981 return; 2982 } 2983 2984 cp = &cpi->cpi_std[6]; 2985 cp->cp_eax = 6; 2986 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0; 2987 (void) __cpuid_insn(cp); 2988 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp); 2989 2990 if (cpi->cpi_vendor != X86_VENDOR_Intel) { 2991 return; 2992 } 2993 2994 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) { 2995 add_x86_feature(featureset, X86FSET_CORE_THERMAL); 2996 } 2997 2998 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) { 2999 add_x86_feature(featureset, X86FSET_PKG_THERMAL); 3000 } 3001 } 3002 3003 void 3004 cpuid_pass1(cpu_t *cpu, uchar_t *featureset) 3005 { 3006 uint32_t mask_ecx, mask_edx; 3007 struct cpuid_info *cpi; 3008 struct cpuid_regs *cp; 3009 int xcpuid; 3010 #if !defined(__xpv) 3011 extern int idle_cpu_prefer_mwait; 3012 #endif 3013 3014 /* 3015 * Space statically allocated for BSP, ensure pointer is set 3016 */ 3017 if (cpu->cpu_id == 0) { 3018 if (cpu->cpu_m.mcpu_cpi == NULL) 3019 cpu->cpu_m.mcpu_cpi = &cpuid_info0; 3020 } 3021 3022 add_x86_feature(featureset, X86FSET_CPUID); 3023 3024 cpi = cpu->cpu_m.mcpu_cpi; 3025 ASSERT(cpi != NULL); 3026 cp = &cpi->cpi_std[0]; 3027 cp->cp_eax = 0; 3028 cpi->cpi_maxeax = __cpuid_insn(cp); 3029 { 3030 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr; 3031 *iptr++ = cp->cp_ebx; 3032 *iptr++ = cp->cp_edx; 3033 *iptr++ = cp->cp_ecx; 3034 *(char *)&cpi->cpi_vendorstr[12] = '\0'; 3035 } 3036 3037 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr); 3038 x86_vendor = cpi->cpi_vendor; /* for compatibility */ 3039 3040 /* 3041 * Limit the range in case of weird hardware 3042 */ 3043 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX) 3044 cpi->cpi_maxeax = CPI_MAXEAX_MAX; 3045 if (cpi->cpi_maxeax < 1) 3046 goto pass1_done; 3047 3048 cp = &cpi->cpi_std[1]; 3049 cp->cp_eax = 1; 3050 (void) __cpuid_insn(cp); 3051 3052 /* 3053 * Extract identifying constants for easy access. 3054 */ 3055 cpi->cpi_model = CPI_MODEL(cpi); 3056 cpi->cpi_family = CPI_FAMILY(cpi); 3057 3058 if (cpi->cpi_family == 0xf) 3059 cpi->cpi_family += CPI_FAMILY_XTD(cpi); 3060 3061 /* 3062 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf. 3063 * Intel, and presumably everyone else, uses model == 0xf, as 3064 * one would expect (max value means possible overflow). Sigh. 
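 *
 * (Worked example, not part of the original comment: an AMD part
 * reporting base family 0xf, extended family 0x8, extended model 0x7
 * and base model 0x1 is therefore identified as family 0x17,
 * model 0x71.)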
3065 */ 3066 3067 switch (cpi->cpi_vendor) { 3068 case X86_VENDOR_Intel: 3069 if (IS_EXTENDED_MODEL_INTEL(cpi)) 3070 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3071 break; 3072 case X86_VENDOR_AMD: 3073 if (CPI_FAMILY(cpi) == 0xf) 3074 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3075 break; 3076 default: 3077 if (cpi->cpi_model == 0xf) 3078 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3079 break; 3080 } 3081 3082 cpi->cpi_step = CPI_STEP(cpi); 3083 cpi->cpi_brandid = CPI_BRANDID(cpi); 3084 3085 /* 3086 * *default* assumptions: 3087 * - believe %edx feature word 3088 * - ignore %ecx feature word 3089 * - 32-bit virtual and physical addressing 3090 */ 3091 mask_edx = 0xffffffff; 3092 mask_ecx = 0; 3093 3094 cpi->cpi_pabits = cpi->cpi_vabits = 32; 3095 3096 switch (cpi->cpi_vendor) { 3097 case X86_VENDOR_Intel: 3098 if (cpi->cpi_family == 5) 3099 x86_type = X86_TYPE_P5; 3100 else if (IS_LEGACY_P6(cpi)) { 3101 x86_type = X86_TYPE_P6; 3102 pentiumpro_bug4046376 = 1; 3103 /* 3104 * Clear the SEP bit when it was set erroneously 3105 */ 3106 if (cpi->cpi_model < 3 && cpi->cpi_step < 3) 3107 cp->cp_edx &= ~CPUID_INTC_EDX_SEP; 3108 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) { 3109 x86_type = X86_TYPE_P4; 3110 /* 3111 * We don't currently depend on any of the %ecx 3112 * features until Prescott, so we'll only check 3113 * this from P4 onwards. We might want to revisit 3114 * that idea later. 3115 */ 3116 mask_ecx = 0xffffffff; 3117 } else if (cpi->cpi_family > 0xf) 3118 mask_ecx = 0xffffffff; 3119 /* 3120 * We don't support MONITOR/MWAIT if leaf 5 is not available 3121 * to obtain the monitor linesize. 3122 */ 3123 if (cpi->cpi_maxeax < 5) 3124 mask_ecx &= ~CPUID_INTC_ECX_MON; 3125 break; 3126 case X86_VENDOR_IntelClone: 3127 default: 3128 break; 3129 case X86_VENDOR_AMD: 3130 #if defined(OPTERON_ERRATUM_108) 3131 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) { 3132 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0; 3133 cpi->cpi_model = 0xc; 3134 } else 3135 #endif 3136 if (cpi->cpi_family == 5) { 3137 /* 3138 * AMD K5 and K6 3139 * 3140 * These CPUs have an incomplete implementation 3141 * of MCA/MCE which we mask away. 3142 */ 3143 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA); 3144 3145 /* 3146 * Model 0 uses the wrong (APIC) bit 3147 * to indicate PGE. Fix it here. 3148 */ 3149 if (cpi->cpi_model == 0) { 3150 if (cp->cp_edx & 0x200) { 3151 cp->cp_edx &= ~0x200; 3152 cp->cp_edx |= CPUID_INTC_EDX_PGE; 3153 } 3154 } 3155 3156 /* 3157 * Early models had problems w/ MMX; disable. 3158 */ 3159 if (cpi->cpi_model < 6) 3160 mask_edx &= ~CPUID_INTC_EDX_MMX; 3161 } 3162 3163 /* 3164 * For newer families, SSE3 and CX16, at least, are valid; 3165 * enable all 3166 */ 3167 if (cpi->cpi_family >= 0xf) 3168 mask_ecx = 0xffffffff; 3169 /* 3170 * We don't support MONITOR/MWAIT if leaf 5 is not available 3171 * to obtain the monitor linesize. 3172 */ 3173 if (cpi->cpi_maxeax < 5) 3174 mask_ecx &= ~CPUID_INTC_ECX_MON; 3175 3176 #if !defined(__xpv) 3177 /* 3178 * AMD has not historically used MWAIT in the CPU's idle loop. 3179 * Pre-family-10h Opterons do not have the MWAIT instruction. We 3180 * know for certain that in at least family 17h, per AMD, mwait 3181 * is preferred. Families in-between are less certain. 
3182 */ 3183 if (cpi->cpi_family < 0x17) { 3184 idle_cpu_prefer_mwait = 0; 3185 } 3186 #endif 3187 3188 break; 3189 case X86_VENDOR_TM: 3190 /* 3191 * workaround the NT workaround in CMS 4.1 3192 */ 3193 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 && 3194 (cpi->cpi_step == 2 || cpi->cpi_step == 3)) 3195 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3196 break; 3197 case X86_VENDOR_Centaur: 3198 /* 3199 * workaround the NT workarounds again 3200 */ 3201 if (cpi->cpi_family == 6) 3202 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3203 break; 3204 case X86_VENDOR_Cyrix: 3205 /* 3206 * We rely heavily on the probing in locore 3207 * to actually figure out what parts, if any, 3208 * of the Cyrix cpuid instruction to believe. 3209 */ 3210 switch (x86_type) { 3211 case X86_TYPE_CYRIX_486: 3212 mask_edx = 0; 3213 break; 3214 case X86_TYPE_CYRIX_6x86: 3215 mask_edx = 0; 3216 break; 3217 case X86_TYPE_CYRIX_6x86L: 3218 mask_edx = 3219 CPUID_INTC_EDX_DE | 3220 CPUID_INTC_EDX_CX8; 3221 break; 3222 case X86_TYPE_CYRIX_6x86MX: 3223 mask_edx = 3224 CPUID_INTC_EDX_DE | 3225 CPUID_INTC_EDX_MSR | 3226 CPUID_INTC_EDX_CX8 | 3227 CPUID_INTC_EDX_PGE | 3228 CPUID_INTC_EDX_CMOV | 3229 CPUID_INTC_EDX_MMX; 3230 break; 3231 case X86_TYPE_CYRIX_GXm: 3232 mask_edx = 3233 CPUID_INTC_EDX_MSR | 3234 CPUID_INTC_EDX_CX8 | 3235 CPUID_INTC_EDX_CMOV | 3236 CPUID_INTC_EDX_MMX; 3237 break; 3238 case X86_TYPE_CYRIX_MediaGX: 3239 break; 3240 case X86_TYPE_CYRIX_MII: 3241 case X86_TYPE_VIA_CYRIX_III: 3242 mask_edx = 3243 CPUID_INTC_EDX_DE | 3244 CPUID_INTC_EDX_TSC | 3245 CPUID_INTC_EDX_MSR | 3246 CPUID_INTC_EDX_CX8 | 3247 CPUID_INTC_EDX_PGE | 3248 CPUID_INTC_EDX_CMOV | 3249 CPUID_INTC_EDX_MMX; 3250 break; 3251 default: 3252 break; 3253 } 3254 break; 3255 } 3256 3257 #if defined(__xpv) 3258 /* 3259 * Do not support MONITOR/MWAIT under a hypervisor 3260 */ 3261 mask_ecx &= ~CPUID_INTC_ECX_MON; 3262 /* 3263 * Do not support XSAVE under a hypervisor for now 3264 */ 3265 xsave_force_disable = B_TRUE; 3266 3267 #endif /* __xpv */ 3268 3269 if (xsave_force_disable) { 3270 mask_ecx &= ~CPUID_INTC_ECX_XSAVE; 3271 mask_ecx &= ~CPUID_INTC_ECX_AVX; 3272 mask_ecx &= ~CPUID_INTC_ECX_F16C; 3273 mask_ecx &= ~CPUID_INTC_ECX_FMA; 3274 } 3275 3276 /* 3277 * Now we've figured out the masks that determine 3278 * which bits we choose to believe, apply the masks 3279 * to the feature words, then map the kernel's view 3280 * of these feature words into its feature word. 3281 */ 3282 cp->cp_edx &= mask_edx; 3283 cp->cp_ecx &= mask_ecx; 3284 3285 /* 3286 * apply any platform restrictions (we don't call this 3287 * immediately after __cpuid_insn here, because we need the 3288 * workarounds applied above first) 3289 */ 3290 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp); 3291 3292 /* 3293 * In addition to ecx and edx, Intel and AMD are storing a bunch of 3294 * instruction set extensions in leaf 7's ebx, ecx, and edx. 3295 */ 3296 if (cpi->cpi_maxeax >= 7) { 3297 struct cpuid_regs *ecp; 3298 ecp = &cpi->cpi_std[7]; 3299 ecp->cp_eax = 7; 3300 ecp->cp_ecx = 0; 3301 (void) __cpuid_insn(ecp); 3302 3303 /* 3304 * If XSAVE has been disabled, just ignore all of the 3305 * extended-save-area dependent flags here. 
3306 */ 3307 if (xsave_force_disable) { 3308 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 3309 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 3310 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 3311 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX; 3312 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512; 3313 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512; 3314 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512; 3315 } 3316 3317 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP) 3318 add_x86_feature(featureset, X86FSET_SMEP); 3319 3320 /* 3321 * We check disable_smap here in addition to in startup_smap() 3322 * to ensure CPUs that aren't the boot CPU don't accidentally 3323 * include it in the feature set and thus generate a mismatched 3324 * x86 feature set across CPUs. 3325 */ 3326 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP && 3327 disable_smap == 0) 3328 add_x86_feature(featureset, X86FSET_SMAP); 3329 3330 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED) 3331 add_x86_feature(featureset, X86FSET_RDSEED); 3332 3333 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX) 3334 add_x86_feature(featureset, X86FSET_ADX); 3335 3336 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 3337 add_x86_feature(featureset, X86FSET_FSGSBASE); 3338 3339 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 3340 add_x86_feature(featureset, X86FSET_CLFLUSHOPT); 3341 3342 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 3343 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID) 3344 add_x86_feature(featureset, X86FSET_INVPCID); 3345 3346 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX) 3347 add_x86_feature(featureset, X86FSET_MPX); 3348 3349 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB) 3350 add_x86_feature(featureset, X86FSET_CLWB); 3351 } 3352 } 3353 3354 /* 3355 * fold in overrides from the "eeprom" mechanism 3356 */ 3357 cp->cp_edx |= cpuid_feature_edx_include; 3358 cp->cp_edx &= ~cpuid_feature_edx_exclude; 3359 3360 cp->cp_ecx |= cpuid_feature_ecx_include; 3361 cp->cp_ecx &= ~cpuid_feature_ecx_exclude; 3362 3363 if (cp->cp_edx & CPUID_INTC_EDX_PSE) { 3364 add_x86_feature(featureset, X86FSET_LARGEPAGE); 3365 } 3366 if (cp->cp_edx & CPUID_INTC_EDX_TSC) { 3367 add_x86_feature(featureset, X86FSET_TSC); 3368 } 3369 if (cp->cp_edx & CPUID_INTC_EDX_MSR) { 3370 add_x86_feature(featureset, X86FSET_MSR); 3371 } 3372 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) { 3373 add_x86_feature(featureset, X86FSET_MTRR); 3374 } 3375 if (cp->cp_edx & CPUID_INTC_EDX_PGE) { 3376 add_x86_feature(featureset, X86FSET_PGE); 3377 } 3378 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) { 3379 add_x86_feature(featureset, X86FSET_CMOV); 3380 } 3381 if (cp->cp_edx & CPUID_INTC_EDX_MMX) { 3382 add_x86_feature(featureset, X86FSET_MMX); 3383 } 3384 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 && 3385 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) { 3386 add_x86_feature(featureset, X86FSET_MCA); 3387 } 3388 if (cp->cp_edx & CPUID_INTC_EDX_PAE) { 3389 add_x86_feature(featureset, X86FSET_PAE); 3390 } 3391 if (cp->cp_edx & CPUID_INTC_EDX_CX8) { 3392 add_x86_feature(featureset, X86FSET_CX8); 3393 } 3394 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) { 3395 add_x86_feature(featureset, X86FSET_CX16); 3396 } 3397 if (cp->cp_edx & CPUID_INTC_EDX_PAT) { 3398 add_x86_feature(featureset, X86FSET_PAT); 3399 } 3400 if (cp->cp_edx & CPUID_INTC_EDX_SEP) { 3401 add_x86_feature(featureset, X86FSET_SEP); 3402 } 3403 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) { 3404 /* 3405 * In our implementation, fxsave/fxrstor 3406 * are prerequisites before we'll even 3407 * try and do SSE things. 
3408 */ 3409 if (cp->cp_edx & CPUID_INTC_EDX_SSE) { 3410 add_x86_feature(featureset, X86FSET_SSE); 3411 } 3412 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) { 3413 add_x86_feature(featureset, X86FSET_SSE2); 3414 } 3415 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) { 3416 add_x86_feature(featureset, X86FSET_SSE3); 3417 } 3418 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) { 3419 add_x86_feature(featureset, X86FSET_SSSE3); 3420 } 3421 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) { 3422 add_x86_feature(featureset, X86FSET_SSE4_1); 3423 } 3424 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) { 3425 add_x86_feature(featureset, X86FSET_SSE4_2); 3426 } 3427 if (cp->cp_ecx & CPUID_INTC_ECX_AES) { 3428 add_x86_feature(featureset, X86FSET_AES); 3429 } 3430 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) { 3431 add_x86_feature(featureset, X86FSET_PCLMULQDQ); 3432 } 3433 3434 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA) 3435 add_x86_feature(featureset, X86FSET_SHA); 3436 3437 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP) 3438 add_x86_feature(featureset, X86FSET_UMIP); 3439 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU) 3440 add_x86_feature(featureset, X86FSET_PKU); 3441 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE) 3442 add_x86_feature(featureset, X86FSET_OSPKE); 3443 3444 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) { 3445 add_x86_feature(featureset, X86FSET_XSAVE); 3446 3447 /* We only test AVX & AVX512 when there is XSAVE */ 3448 3449 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) { 3450 add_x86_feature(featureset, 3451 X86FSET_AVX); 3452 3453 /* 3454 * Intel says we can't check these without also 3455 * checking AVX. 3456 */ 3457 if (cp->cp_ecx & CPUID_INTC_ECX_F16C) 3458 add_x86_feature(featureset, 3459 X86FSET_F16C); 3460 3461 if (cp->cp_ecx & CPUID_INTC_ECX_FMA) 3462 add_x86_feature(featureset, 3463 X86FSET_FMA); 3464 3465 if (cpi->cpi_std[7].cp_ebx & 3466 CPUID_INTC_EBX_7_0_BMI1) 3467 add_x86_feature(featureset, 3468 X86FSET_BMI1); 3469 3470 if (cpi->cpi_std[7].cp_ebx & 3471 CPUID_INTC_EBX_7_0_BMI2) 3472 add_x86_feature(featureset, 3473 X86FSET_BMI2); 3474 3475 if (cpi->cpi_std[7].cp_ebx & 3476 CPUID_INTC_EBX_7_0_AVX2) 3477 add_x86_feature(featureset, 3478 X86FSET_AVX2); 3479 } 3480 3481 if (cpi->cpi_vendor == X86_VENDOR_Intel && 3482 (cpi->cpi_std[7].cp_ebx & 3483 CPUID_INTC_EBX_7_0_AVX512F) != 0) { 3484 add_x86_feature(featureset, X86FSET_AVX512F); 3485 3486 if (cpi->cpi_std[7].cp_ebx & 3487 CPUID_INTC_EBX_7_0_AVX512DQ) 3488 add_x86_feature(featureset, 3489 X86FSET_AVX512DQ); 3490 if (cpi->cpi_std[7].cp_ebx & 3491 CPUID_INTC_EBX_7_0_AVX512IFMA) 3492 add_x86_feature(featureset, 3493 X86FSET_AVX512FMA); 3494 if (cpi->cpi_std[7].cp_ebx & 3495 CPUID_INTC_EBX_7_0_AVX512PF) 3496 add_x86_feature(featureset, 3497 X86FSET_AVX512PF); 3498 if (cpi->cpi_std[7].cp_ebx & 3499 CPUID_INTC_EBX_7_0_AVX512ER) 3500 add_x86_feature(featureset, 3501 X86FSET_AVX512ER); 3502 if (cpi->cpi_std[7].cp_ebx & 3503 CPUID_INTC_EBX_7_0_AVX512CD) 3504 add_x86_feature(featureset, 3505 X86FSET_AVX512CD); 3506 if (cpi->cpi_std[7].cp_ebx & 3507 CPUID_INTC_EBX_7_0_AVX512BW) 3508 add_x86_feature(featureset, 3509 X86FSET_AVX512BW); 3510 if (cpi->cpi_std[7].cp_ebx & 3511 CPUID_INTC_EBX_7_0_AVX512VL) 3512 add_x86_feature(featureset, 3513 X86FSET_AVX512VL); 3514 3515 if (cpi->cpi_std[7].cp_ecx & 3516 CPUID_INTC_ECX_7_0_AVX512VBMI) 3517 add_x86_feature(featureset, 3518 X86FSET_AVX512VBMI); 3519 if (cpi->cpi_std[7].cp_ecx & 3520 CPUID_INTC_ECX_7_0_AVX512VNNI) 3521 add_x86_feature(featureset, 3522 X86FSET_AVX512VNNI); 3523 if (cpi->cpi_std[7].cp_ecx & 3524 
CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 3525 add_x86_feature(featureset, 3526 X86FSET_AVX512VPOPCDQ); 3527 3528 if (cpi->cpi_std[7].cp_edx & 3529 CPUID_INTC_EDX_7_0_AVX5124NNIW) 3530 add_x86_feature(featureset, 3531 X86FSET_AVX512NNIW); 3532 if (cpi->cpi_std[7].cp_edx & 3533 CPUID_INTC_EDX_7_0_AVX5124FMAPS) 3534 add_x86_feature(featureset, 3535 X86FSET_AVX512FMAPS); 3536 } 3537 } 3538 } 3539 3540 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 3541 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) { 3542 add_x86_feature(featureset, X86FSET_PCID); 3543 } 3544 } 3545 3546 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) { 3547 add_x86_feature(featureset, X86FSET_X2APIC); 3548 } 3549 if (cp->cp_edx & CPUID_INTC_EDX_DE) { 3550 add_x86_feature(featureset, X86FSET_DE); 3551 } 3552 #if !defined(__xpv) 3553 if (cp->cp_ecx & CPUID_INTC_ECX_MON) { 3554 3555 /* 3556 * We require the CLFLUSH instruction for erratum workaround 3557 * to use MONITOR/MWAIT. 3558 */ 3559 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { 3560 cpi->cpi_mwait.support |= MWAIT_SUPPORT; 3561 add_x86_feature(featureset, X86FSET_MWAIT); 3562 } else { 3563 extern int idle_cpu_assert_cflush_monitor; 3564 3565 /* 3566 * All processors we are aware of which have 3567 * MONITOR/MWAIT also have CLFLUSH. 3568 */ 3569 if (idle_cpu_assert_cflush_monitor) { 3570 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) && 3571 (cp->cp_edx & CPUID_INTC_EDX_CLFSH)); 3572 } 3573 } 3574 } 3575 #endif /* __xpv */ 3576 3577 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) { 3578 add_x86_feature(featureset, X86FSET_VMX); 3579 } 3580 3581 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND) 3582 add_x86_feature(featureset, X86FSET_RDRAND); 3583 3584 /* 3585 * Only need it first time, rest of the cpus would follow suit. 3586 * we only capture this for the bootcpu. 3587 */ 3588 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { 3589 add_x86_feature(featureset, X86FSET_CLFSH); 3590 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8); 3591 } 3592 if (is_x86_feature(featureset, X86FSET_PAE)) 3593 cpi->cpi_pabits = 36; 3594 3595 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) { 3596 struct cpuid_regs r, *ecp; 3597 3598 ecp = &r; 3599 ecp->cp_eax = 0xD; 3600 ecp->cp_ecx = 1; 3601 ecp->cp_edx = ecp->cp_ebx = 0; 3602 (void) __cpuid_insn(ecp); 3603 3604 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT) 3605 add_x86_feature(featureset, X86FSET_XSAVEOPT); 3606 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC) 3607 add_x86_feature(featureset, X86FSET_XSAVEC); 3608 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES) 3609 add_x86_feature(featureset, X86FSET_XSAVES); 3610 } 3611 3612 /* 3613 * Work on the "extended" feature information, doing 3614 * some basic initialization for cpuid_pass2() 3615 */ 3616 xcpuid = 0; 3617 switch (cpi->cpi_vendor) { 3618 case X86_VENDOR_Intel: 3619 /* 3620 * On KVM we know we will have proper support for extended 3621 * cpuid. 3622 */ 3623 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf || 3624 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 && 3625 (cpi->cpi_model == 6 || cpi->cpi_model == 2))) 3626 xcpuid++; 3627 break; 3628 case X86_VENDOR_AMD: 3629 if (cpi->cpi_family > 5 || 3630 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 3631 xcpuid++; 3632 break; 3633 case X86_VENDOR_Cyrix: 3634 /* 3635 * Only these Cyrix CPUs are -known- to support 3636 * extended cpuid operations. 
3637 */ 3638 if (x86_type == X86_TYPE_VIA_CYRIX_III || 3639 x86_type == X86_TYPE_CYRIX_GXm) 3640 xcpuid++; 3641 break; 3642 case X86_VENDOR_Centaur: 3643 case X86_VENDOR_TM: 3644 default: 3645 xcpuid++; 3646 break; 3647 } 3648 3649 if (xcpuid) { 3650 cp = &cpi->cpi_extd[0]; 3651 cp->cp_eax = CPUID_LEAF_EXT_0; 3652 cpi->cpi_xmaxeax = __cpuid_insn(cp); 3653 } 3654 3655 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) { 3656 3657 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX) 3658 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX; 3659 3660 switch (cpi->cpi_vendor) { 3661 case X86_VENDOR_Intel: 3662 case X86_VENDOR_AMD: 3663 if (cpi->cpi_xmaxeax < 0x80000001) 3664 break; 3665 cp = &cpi->cpi_extd[1]; 3666 cp->cp_eax = 0x80000001; 3667 (void) __cpuid_insn(cp); 3668 3669 if (cpi->cpi_vendor == X86_VENDOR_AMD && 3670 cpi->cpi_family == 5 && 3671 cpi->cpi_model == 6 && 3672 cpi->cpi_step == 6) { 3673 /* 3674 * K6 model 6 uses bit 10 to indicate SYSC 3675 * Later models use bit 11. Fix it here. 3676 */ 3677 if (cp->cp_edx & 0x400) { 3678 cp->cp_edx &= ~0x400; 3679 cp->cp_edx |= CPUID_AMD_EDX_SYSC; 3680 } 3681 } 3682 3683 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp); 3684 3685 /* 3686 * Compute the additions to the kernel's feature word. 3687 */ 3688 if (cp->cp_edx & CPUID_AMD_EDX_NX) { 3689 add_x86_feature(featureset, X86FSET_NX); 3690 } 3691 3692 /* 3693 * Regardless whether or not we boot 64-bit, 3694 * we should have a way to identify whether 3695 * the CPU is capable of running 64-bit. 3696 */ 3697 if (cp->cp_edx & CPUID_AMD_EDX_LM) { 3698 add_x86_feature(featureset, X86FSET_64); 3699 } 3700 3701 /* 1 GB large page - enable only for 64 bit kernel */ 3702 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) { 3703 add_x86_feature(featureset, X86FSET_1GPG); 3704 } 3705 3706 if ((cpi->cpi_vendor == X86_VENDOR_AMD) && 3707 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) && 3708 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) { 3709 add_x86_feature(featureset, X86FSET_SSE4A); 3710 } 3711 3712 /* 3713 * It's really tricky to support syscall/sysret in 3714 * the i386 kernel; we rely on sysenter/sysexit 3715 * instead. In the amd64 kernel, things are -way- 3716 * better. 3717 */ 3718 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) { 3719 add_x86_feature(featureset, X86FSET_ASYSC); 3720 } 3721 3722 /* 3723 * While we're thinking about system calls, note 3724 * that AMD processors don't support sysenter 3725 * in long mode at all, so don't try to program them. 3726 */ 3727 if (x86_vendor == X86_VENDOR_AMD) { 3728 remove_x86_feature(featureset, X86FSET_SEP); 3729 } 3730 3731 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) { 3732 add_x86_feature(featureset, X86FSET_TSCP); 3733 } 3734 3735 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) { 3736 add_x86_feature(featureset, X86FSET_SVM); 3737 } 3738 3739 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) { 3740 add_x86_feature(featureset, X86FSET_TOPOEXT); 3741 } 3742 3743 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) { 3744 add_x86_feature(featureset, X86FSET_AMD_PCEC); 3745 } 3746 3747 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) { 3748 add_x86_feature(featureset, X86FSET_XOP); 3749 } 3750 3751 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) { 3752 add_x86_feature(featureset, X86FSET_FMA4); 3753 } 3754 3755 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) { 3756 add_x86_feature(featureset, X86FSET_TBM); 3757 } 3758 3759 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) { 3760 add_x86_feature(featureset, X86FSET_MONITORX); 3761 } 3762 break; 3763 default: 3764 break; 3765 } 3766 3767 /* 3768 * Get CPUID data about processor cores and hyperthreads. 
3769 */ 3770 switch (cpi->cpi_vendor) { 3771 case X86_VENDOR_Intel: 3772 if (cpi->cpi_maxeax >= 4) { 3773 cp = &cpi->cpi_std[4]; 3774 cp->cp_eax = 4; 3775 cp->cp_ecx = 0; 3776 (void) __cpuid_insn(cp); 3777 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp); 3778 } 3779 /*FALLTHROUGH*/ 3780 case X86_VENDOR_AMD: 3781 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) 3782 break; 3783 cp = &cpi->cpi_extd[8]; 3784 cp->cp_eax = CPUID_LEAF_EXT_8; 3785 (void) __cpuid_insn(cp); 3786 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, 3787 cp); 3788 3789 /* 3790 * AMD uses ebx for some extended functions. 3791 */ 3792 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 3793 /* 3794 * While we're here, check for the AMD "Error 3795 * Pointer Zero/Restore" feature. This can be 3796 * used to setup the FP save handlers 3797 * appropriately. 3798 */ 3799 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 3800 cpi->cpi_fp_amd_save = 0; 3801 } else { 3802 cpi->cpi_fp_amd_save = 1; 3803 } 3804 3805 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) { 3806 add_x86_feature(featureset, 3807 X86FSET_CLZERO); 3808 } 3809 } 3810 3811 /* 3812 * Virtual and physical address limits from 3813 * cpuid override previously guessed values. 3814 */ 3815 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0); 3816 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8); 3817 break; 3818 default: 3819 break; 3820 } 3821 3822 /* 3823 * Get CPUID data about TSC Invariance in Deep C-State. 3824 */ 3825 switch (cpi->cpi_vendor) { 3826 case X86_VENDOR_Intel: 3827 case X86_VENDOR_AMD: 3828 if (cpi->cpi_maxeax >= 7) { 3829 cp = &cpi->cpi_extd[7]; 3830 cp->cp_eax = 0x80000007; 3831 cp->cp_ecx = 0; 3832 (void) __cpuid_insn(cp); 3833 } 3834 break; 3835 default: 3836 break; 3837 } 3838 } 3839 3840 cpuid_pass1_topology(cpu, featureset); 3841 cpuid_pass1_thermal(cpu, featureset); 3842 3843 /* 3844 * Synthesize chip "revision" and socket type 3845 */ 3846 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family, 3847 cpi->cpi_model, cpi->cpi_step); 3848 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor, 3849 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step); 3850 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family, 3851 cpi->cpi_model, cpi->cpi_step); 3852 3853 if (cpi->cpi_vendor == X86_VENDOR_AMD) { 3854 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 && 3855 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 3856 /* Special handling for AMD FP not necessary. */ 3857 cpi->cpi_fp_amd_save = 0; 3858 } else { 3859 cpi->cpi_fp_amd_save = 1; 3860 } 3861 } 3862 3863 /* 3864 * Check the processor leaves that are used for security features. 3865 */ 3866 cpuid_scan_security(cpu, featureset); 3867 3868 pass1_done: 3869 cpi->cpi_pass = 1; 3870 } 3871 3872 /* 3873 * Make copies of the cpuid table entries we depend on, in 3874 * part for ease of parsing now, in part so that we have only 3875 * one place to correct any of it, in part for ease of 3876 * later export to userland, and in part so we can look at 3877 * this stuff in a crash dump. 
3878 */ 3879 3880 /*ARGSUSED*/ 3881 void 3882 cpuid_pass2(cpu_t *cpu) 3883 { 3884 uint_t n, nmax; 3885 int i; 3886 struct cpuid_regs *cp; 3887 uint8_t *dp; 3888 uint32_t *iptr; 3889 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 3890 3891 ASSERT(cpi->cpi_pass == 1); 3892 3893 if (cpi->cpi_maxeax < 1) 3894 goto pass2_done; 3895 3896 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD) 3897 nmax = NMAX_CPI_STD; 3898 /* 3899 * (We already handled n == 0 and n == 1 in pass 1) 3900 */ 3901 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) { 3902 /* 3903 * leaves 6 and 7 were handled in pass 1 3904 */ 3905 if (n == 6 || n == 7) 3906 continue; 3907 3908 cp->cp_eax = n; 3909 3910 /* 3911 * CPUID function 4 expects %ecx to be initialized 3912 * with an index which indicates which cache to return 3913 * information about. The OS is expected to call function 4 3914 * with %ecx set to 0, 1, 2, ... until it returns with 3915 * EAX[4:0] set to 0, which indicates there are no more 3916 * caches. 3917 * 3918 * Here, populate cpi_std[4] with the information returned by 3919 * function 4 when %ecx == 0, and do the rest in cpuid_pass3() 3920 * when dynamic memory allocation becomes available. 3921 * 3922 * Note: we need to explicitly initialize %ecx here, since 3923 * function 4 may have been previously invoked. 3924 */ 3925 if (n == 4) 3926 cp->cp_ecx = 0; 3927 3928 (void) __cpuid_insn(cp); 3929 platform_cpuid_mangle(cpi->cpi_vendor, n, cp); 3930 switch (n) { 3931 case 2: 3932 /* 3933 * "the lower 8 bits of the %eax register 3934 * contain a value that identifies the number 3935 * of times the cpuid [instruction] has to be 3936 * executed to obtain a complete image of the 3937 * processor's caching systems." 3938 * 3939 * How *do* they make this stuff up? 3940 */ 3941 cpi->cpi_ncache = sizeof (*cp) * 3942 BITX(cp->cp_eax, 7, 0); 3943 if (cpi->cpi_ncache == 0) 3944 break; 3945 cpi->cpi_ncache--; /* skip count byte */ 3946 3947 /* 3948 * Well, for now, rather than attempt to implement 3949 * this slightly dubious algorithm, we just look 3950 * at the first 15 .. 3951 */ 3952 if (cpi->cpi_ncache > (sizeof (*cp) - 1)) 3953 cpi->cpi_ncache = sizeof (*cp) - 1; 3954 3955 dp = cpi->cpi_cacheinfo; 3956 if (BITX(cp->cp_eax, 31, 31) == 0) { 3957 uint8_t *p = (void *)&cp->cp_eax; 3958 for (i = 1; i < 4; i++) 3959 if (p[i] != 0) 3960 *dp++ = p[i]; 3961 } 3962 if (BITX(cp->cp_ebx, 31, 31) == 0) { 3963 uint8_t *p = (void *)&cp->cp_ebx; 3964 for (i = 0; i < 4; i++) 3965 if (p[i] != 0) 3966 *dp++ = p[i]; 3967 } 3968 if (BITX(cp->cp_ecx, 31, 31) == 0) { 3969 uint8_t *p = (void *)&cp->cp_ecx; 3970 for (i = 0; i < 4; i++) 3971 if (p[i] != 0) 3972 *dp++ = p[i]; 3973 } 3974 if (BITX(cp->cp_edx, 31, 31) == 0) { 3975 uint8_t *p = (void *)&cp->cp_edx; 3976 for (i = 0; i < 4; i++) 3977 if (p[i] != 0) 3978 *dp++ = p[i]; 3979 } 3980 break; 3981 3982 case 3: /* Processor serial number, if PSN supported */ 3983 break; 3984 3985 case 4: /* Deterministic cache parameters */ 3986 break; 3987 3988 case 5: /* Monitor/Mwait parameters */ 3989 { 3990 size_t mwait_size; 3991 3992 /* 3993 * check cpi_mwait.support which was set in cpuid_pass1 3994 */ 3995 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT)) 3996 break; 3997 3998 /* 3999 * Protect ourself from insane mwait line size. 4000 * Workaround for incomplete hardware emulator(s). 
4001 */ 4002 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi); 4003 if (mwait_size < sizeof (uint32_t) || 4004 !ISP2(mwait_size)) { 4005 #if DEBUG 4006 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait " 4007 "size %ld", cpu->cpu_id, (long)mwait_size); 4008 #endif 4009 break; 4010 } 4011 4012 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi); 4013 cpi->cpi_mwait.mon_max = mwait_size; 4014 if (MWAIT_EXTENSION(cpi)) { 4015 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS; 4016 if (MWAIT_INT_ENABLE(cpi)) 4017 cpi->cpi_mwait.support |= 4018 MWAIT_ECX_INT_ENABLE; 4019 } 4020 break; 4021 } 4022 default: 4023 break; 4024 } 4025 } 4026 4027 /* 4028 * XSAVE enumeration 4029 */ 4030 if (cpi->cpi_maxeax >= 0xD) { 4031 struct cpuid_regs regs; 4032 boolean_t cpuid_d_valid = B_TRUE; 4033 4034 cp = &regs; 4035 cp->cp_eax = 0xD; 4036 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 4037 4038 (void) __cpuid_insn(cp); 4039 4040 /* 4041 * Sanity checks for debug 4042 */ 4043 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 || 4044 (cp->cp_eax & XFEATURE_SSE) == 0) { 4045 cpuid_d_valid = B_FALSE; 4046 } 4047 4048 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax; 4049 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx; 4050 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx; 4051 4052 /* 4053 * If the hw supports AVX, get the size and offset in the save 4054 * area for the ymm state. 4055 */ 4056 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) { 4057 cp->cp_eax = 0xD; 4058 cp->cp_ecx = 2; 4059 cp->cp_edx = cp->cp_ebx = 0; 4060 4061 (void) __cpuid_insn(cp); 4062 4063 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET || 4064 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) { 4065 cpuid_d_valid = B_FALSE; 4066 } 4067 4068 cpi->cpi_xsave.ymm_size = cp->cp_eax; 4069 cpi->cpi_xsave.ymm_offset = cp->cp_ebx; 4070 } 4071 4072 /* 4073 * If the hw supports MPX, get the size and offset in the 4074 * save area for BNDREGS and BNDCSR. 4075 */ 4076 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) { 4077 cp->cp_eax = 0xD; 4078 cp->cp_ecx = 3; 4079 cp->cp_edx = cp->cp_ebx = 0; 4080 4081 (void) __cpuid_insn(cp); 4082 4083 cpi->cpi_xsave.bndregs_size = cp->cp_eax; 4084 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx; 4085 4086 cp->cp_eax = 0xD; 4087 cp->cp_ecx = 4; 4088 cp->cp_edx = cp->cp_ebx = 0; 4089 4090 (void) __cpuid_insn(cp); 4091 4092 cpi->cpi_xsave.bndcsr_size = cp->cp_eax; 4093 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx; 4094 } 4095 4096 /* 4097 * If the hw supports AVX512, get the size and offset in the 4098 * save area for the opmask registers and zmm state.
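 *
 * (Clarifying note, not in the original: the leaf 0xD sub-leaves
 * queried below -- %ecx values 5, 6 and 7 -- correspond to the opmask
 * (%k0-%k7), ZMM_Hi256 and Hi16_ZMM state components, respectively.)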
4099 */ 4100 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) { 4101 cp->cp_eax = 0xD; 4102 cp->cp_ecx = 5; 4103 cp->cp_edx = cp->cp_ebx = 0; 4104 4105 (void) __cpuid_insn(cp); 4106 4107 cpi->cpi_xsave.opmask_size = cp->cp_eax; 4108 cpi->cpi_xsave.opmask_offset = cp->cp_ebx; 4109 4110 cp->cp_eax = 0xD; 4111 cp->cp_ecx = 6; 4112 cp->cp_edx = cp->cp_ebx = 0; 4113 4114 (void) __cpuid_insn(cp); 4115 4116 cpi->cpi_xsave.zmmlo_size = cp->cp_eax; 4117 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx; 4118 4119 cp->cp_eax = 0xD; 4120 cp->cp_ecx = 7; 4121 cp->cp_edx = cp->cp_ebx = 0; 4122 4123 (void) __cpuid_insn(cp); 4124 4125 cpi->cpi_xsave.zmmhi_size = cp->cp_eax; 4126 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx; 4127 } 4128 4129 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) { 4130 xsave_state_size = 0; 4131 } else if (cpuid_d_valid) { 4132 xsave_state_size = cpi->cpi_xsave.xsav_max_size; 4133 } else { 4134 /* Broken CPUID 0xD, probably in HVM */ 4135 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid " 4136 "value: hw_low = %d, hw_high = %d, xsave_size = %d" 4137 ", ymm_size = %d, ymm_offset = %d\n", 4138 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low, 4139 cpi->cpi_xsave.xsav_hw_features_high, 4140 (int)cpi->cpi_xsave.xsav_max_size, 4141 (int)cpi->cpi_xsave.ymm_size, 4142 (int)cpi->cpi_xsave.ymm_offset); 4143 4144 if (xsave_state_size != 0) { 4145 /* 4146 * This must be a non-boot CPU. We cannot 4147 * continue, because boot cpu has already 4148 * enabled XSAVE. 4149 */ 4150 ASSERT(cpu->cpu_id != 0); 4151 cmn_err(CE_PANIC, "cpu%d: we have already " 4152 "enabled XSAVE on boot cpu, cannot " 4153 "continue.", cpu->cpu_id); 4154 } else { 4155 /* 4156 * If we reached here on the boot CPU, it's also 4157 * almost certain that we'll reach here on the 4158 * non-boot CPUs. When we're here on a boot CPU 4159 * we should disable the feature, on a non-boot 4160 * CPU we need to confirm that we have. 
4161 */ 4162 if (cpu->cpu_id == 0) { 4163 remove_x86_feature(x86_featureset, 4164 X86FSET_XSAVE); 4165 remove_x86_feature(x86_featureset, 4166 X86FSET_AVX); 4167 remove_x86_feature(x86_featureset, 4168 X86FSET_F16C); 4169 remove_x86_feature(x86_featureset, 4170 X86FSET_BMI1); 4171 remove_x86_feature(x86_featureset, 4172 X86FSET_BMI2); 4173 remove_x86_feature(x86_featureset, 4174 X86FSET_FMA); 4175 remove_x86_feature(x86_featureset, 4176 X86FSET_AVX2); 4177 remove_x86_feature(x86_featureset, 4178 X86FSET_MPX); 4179 remove_x86_feature(x86_featureset, 4180 X86FSET_AVX512F); 4181 remove_x86_feature(x86_featureset, 4182 X86FSET_AVX512DQ); 4183 remove_x86_feature(x86_featureset, 4184 X86FSET_AVX512PF); 4185 remove_x86_feature(x86_featureset, 4186 X86FSET_AVX512ER); 4187 remove_x86_feature(x86_featureset, 4188 X86FSET_AVX512CD); 4189 remove_x86_feature(x86_featureset, 4190 X86FSET_AVX512BW); 4191 remove_x86_feature(x86_featureset, 4192 X86FSET_AVX512VL); 4193 remove_x86_feature(x86_featureset, 4194 X86FSET_AVX512FMA); 4195 remove_x86_feature(x86_featureset, 4196 X86FSET_AVX512VBMI); 4197 remove_x86_feature(x86_featureset, 4198 X86FSET_AVX512VNNI); 4199 remove_x86_feature(x86_featureset, 4200 X86FSET_AVX512VPOPCDQ); 4201 remove_x86_feature(x86_featureset, 4202 X86FSET_AVX512NNIW); 4203 remove_x86_feature(x86_featureset, 4204 X86FSET_AVX512FMAPS); 4205 4206 CPI_FEATURES_ECX(cpi) &= 4207 ~CPUID_INTC_ECX_XSAVE; 4208 CPI_FEATURES_ECX(cpi) &= 4209 ~CPUID_INTC_ECX_AVX; 4210 CPI_FEATURES_ECX(cpi) &= 4211 ~CPUID_INTC_ECX_F16C; 4212 CPI_FEATURES_ECX(cpi) &= 4213 ~CPUID_INTC_ECX_FMA; 4214 CPI_FEATURES_7_0_EBX(cpi) &= 4215 ~CPUID_INTC_EBX_7_0_BMI1; 4216 CPI_FEATURES_7_0_EBX(cpi) &= 4217 ~CPUID_INTC_EBX_7_0_BMI2; 4218 CPI_FEATURES_7_0_EBX(cpi) &= 4219 ~CPUID_INTC_EBX_7_0_AVX2; 4220 CPI_FEATURES_7_0_EBX(cpi) &= 4221 ~CPUID_INTC_EBX_7_0_MPX; 4222 CPI_FEATURES_7_0_EBX(cpi) &= 4223 ~CPUID_INTC_EBX_7_0_ALL_AVX512; 4224 4225 CPI_FEATURES_7_0_ECX(cpi) &= 4226 ~CPUID_INTC_ECX_7_0_ALL_AVX512; 4227 4228 CPI_FEATURES_7_0_EDX(cpi) &= 4229 ~CPUID_INTC_EDX_7_0_ALL_AVX512; 4230 4231 xsave_force_disable = B_TRUE; 4232 } else { 4233 VERIFY(is_x86_feature(x86_featureset, 4234 X86FSET_XSAVE) == B_FALSE); 4235 } 4236 } 4237 } 4238 } 4239 4240 4241 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) 4242 goto pass2_done; 4243 4244 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD) 4245 nmax = NMAX_CPI_EXTD; 4246 /* 4247 * Copy the extended properties, fixing them as we go. 4248 * (We already handled n == 0 and n == 1 in pass 1) 4249 */ 4250 iptr = (void *)cpi->cpi_brandstr; 4251 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) { 4252 cp->cp_eax = CPUID_LEAF_EXT_0 + n; 4253 (void) __cpuid_insn(cp); 4254 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n, 4255 cp); 4256 switch (n) { 4257 case 2: 4258 case 3: 4259 case 4: 4260 /* 4261 * Extract the brand string 4262 */ 4263 *iptr++ = cp->cp_eax; 4264 *iptr++ = cp->cp_ebx; 4265 *iptr++ = cp->cp_ecx; 4266 *iptr++ = cp->cp_edx; 4267 break; 4268 case 5: 4269 switch (cpi->cpi_vendor) { 4270 case X86_VENDOR_AMD: 4271 /* 4272 * The Athlon and Duron were the first 4273 * parts to report the sizes of the 4274 * TLB for large pages. Before then, 4275 * we don't trust the data. 
4276 */ 4277 if (cpi->cpi_family < 6 || 4278 (cpi->cpi_family == 6 && 4279 cpi->cpi_model < 1)) 4280 cp->cp_eax = 0; 4281 break; 4282 default: 4283 break; 4284 } 4285 break; 4286 case 6: 4287 switch (cpi->cpi_vendor) { 4288 case X86_VENDOR_AMD: 4289 /* 4290 * The Athlon and Duron were the first 4291 * AMD parts with L2 TLB's. 4292 * Before then, don't trust the data. 4293 */ 4294 if (cpi->cpi_family < 6 || 4295 cpi->cpi_family == 6 && 4296 cpi->cpi_model < 1) 4297 cp->cp_eax = cp->cp_ebx = 0; 4298 /* 4299 * AMD Duron rev A0 reports L2 4300 * cache size incorrectly as 1K 4301 * when it is really 64K 4302 */ 4303 if (cpi->cpi_family == 6 && 4304 cpi->cpi_model == 3 && 4305 cpi->cpi_step == 0) { 4306 cp->cp_ecx &= 0xffff; 4307 cp->cp_ecx |= 0x400000; 4308 } 4309 break; 4310 case X86_VENDOR_Cyrix: /* VIA C3 */ 4311 /* 4312 * VIA C3 processors are a bit messed 4313 * up w.r.t. encoding cache sizes in %ecx 4314 */ 4315 if (cpi->cpi_family != 6) 4316 break; 4317 /* 4318 * model 7 and 8 were incorrectly encoded 4319 * 4320 * xxx is model 8 really broken? 4321 */ 4322 if (cpi->cpi_model == 7 || 4323 cpi->cpi_model == 8) 4324 cp->cp_ecx = 4325 BITX(cp->cp_ecx, 31, 24) << 16 | 4326 BITX(cp->cp_ecx, 23, 16) << 12 | 4327 BITX(cp->cp_ecx, 15, 8) << 8 | 4328 BITX(cp->cp_ecx, 7, 0); 4329 /* 4330 * model 9 stepping 1 has wrong associativity 4331 */ 4332 if (cpi->cpi_model == 9 && cpi->cpi_step == 1) 4333 cp->cp_ecx |= 8 << 12; 4334 break; 4335 case X86_VENDOR_Intel: 4336 /* 4337 * Extended L2 Cache features function. 4338 * First appeared on Prescott. 4339 */ 4340 default: 4341 break; 4342 } 4343 break; 4344 default: 4345 break; 4346 } 4347 } 4348 4349 pass2_done: 4350 cpi->cpi_pass = 2; 4351 } 4352 4353 static const char * 4354 intel_cpubrand(const struct cpuid_info *cpi) 4355 { 4356 int i; 4357 4358 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4359 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) 4360 return ("i486"); 4361 4362 switch (cpi->cpi_family) { 4363 case 5: 4364 return ("Intel Pentium(r)"); 4365 case 6: 4366 switch (cpi->cpi_model) { 4367 uint_t celeron, xeon; 4368 const struct cpuid_regs *cp; 4369 case 0: 4370 case 1: 4371 case 2: 4372 return ("Intel Pentium(r) Pro"); 4373 case 3: 4374 case 4: 4375 return ("Intel Pentium(r) II"); 4376 case 6: 4377 return ("Intel Celeron(r)"); 4378 case 5: 4379 case 7: 4380 celeron = xeon = 0; 4381 cp = &cpi->cpi_std[2]; /* cache info */ 4382 4383 for (i = 1; i < 4; i++) { 4384 uint_t tmp; 4385 4386 tmp = (cp->cp_eax >> (8 * i)) & 0xff; 4387 if (tmp == 0x40) 4388 celeron++; 4389 if (tmp >= 0x44 && tmp <= 0x45) 4390 xeon++; 4391 } 4392 4393 for (i = 0; i < 2; i++) { 4394 uint_t tmp; 4395 4396 tmp = (cp->cp_ebx >> (8 * i)) & 0xff; 4397 if (tmp == 0x40) 4398 celeron++; 4399 else if (tmp >= 0x44 && tmp <= 0x45) 4400 xeon++; 4401 } 4402 4403 for (i = 0; i < 4; i++) { 4404 uint_t tmp; 4405 4406 tmp = (cp->cp_ecx >> (8 * i)) & 0xff; 4407 if (tmp == 0x40) 4408 celeron++; 4409 else if (tmp >= 0x44 && tmp <= 0x45) 4410 xeon++; 4411 } 4412 4413 for (i = 0; i < 4; i++) { 4414 uint_t tmp; 4415 4416 tmp = (cp->cp_edx >> (8 * i)) & 0xff; 4417 if (tmp == 0x40) 4418 celeron++; 4419 else if (tmp >= 0x44 && tmp <= 0x45) 4420 xeon++; 4421 } 4422 4423 if (celeron) 4424 return ("Intel Celeron(r)"); 4425 if (xeon) 4426 return (cpi->cpi_model == 5 ? 4427 "Intel Pentium(r) II Xeon(tm)" : 4428 "Intel Pentium(r) III Xeon(tm)"); 4429 return (cpi->cpi_model == 5 ? 
4430 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" : 4431 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)"); 4432 default: 4433 break; 4434 } 4435 default: 4436 break; 4437 } 4438 4439 /* BrandID is present if the field is nonzero */ 4440 if (cpi->cpi_brandid != 0) { 4441 static const struct { 4442 uint_t bt_bid; 4443 const char *bt_str; 4444 } brand_tbl[] = { 4445 { 0x1, "Intel(r) Celeron(r)" }, 4446 { 0x2, "Intel(r) Pentium(r) III" }, 4447 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" }, 4448 { 0x4, "Intel(r) Pentium(r) III" }, 4449 { 0x6, "Mobile Intel(r) Pentium(r) III" }, 4450 { 0x7, "Mobile Intel(r) Celeron(r)" }, 4451 { 0x8, "Intel(r) Pentium(r) 4" }, 4452 { 0x9, "Intel(r) Pentium(r) 4" }, 4453 { 0xa, "Intel(r) Celeron(r)" }, 4454 { 0xb, "Intel(r) Xeon(tm)" }, 4455 { 0xc, "Intel(r) Xeon(tm) MP" }, 4456 { 0xe, "Mobile Intel(r) Pentium(r) 4" }, 4457 { 0xf, "Mobile Intel(r) Celeron(r)" }, 4458 { 0x11, "Mobile Genuine Intel(r)" }, 4459 { 0x12, "Intel(r) Celeron(r) M" }, 4460 { 0x13, "Mobile Intel(r) Celeron(r)" }, 4461 { 0x14, "Intel(r) Celeron(r)" }, 4462 { 0x15, "Mobile Genuine Intel(r)" }, 4463 { 0x16, "Intel(r) Pentium(r) M" }, 4464 { 0x17, "Mobile Intel(r) Celeron(r)" } 4465 }; 4466 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]); 4467 uint_t sgn; 4468 4469 sgn = (cpi->cpi_family << 8) | 4470 (cpi->cpi_model << 4) | cpi->cpi_step; 4471 4472 for (i = 0; i < btblmax; i++) 4473 if (brand_tbl[i].bt_bid == cpi->cpi_brandid) 4474 break; 4475 if (i < btblmax) { 4476 if (sgn == 0x6b1 && cpi->cpi_brandid == 3) 4477 return ("Intel(r) Celeron(r)"); 4478 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb) 4479 return ("Intel(r) Xeon(tm) MP"); 4480 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe) 4481 return ("Intel(r) Xeon(tm)"); 4482 return (brand_tbl[i].bt_str); 4483 } 4484 } 4485 4486 return (NULL); 4487 } 4488 4489 static const char * 4490 amd_cpubrand(const struct cpuid_info *cpi) 4491 { 4492 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4493 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) 4494 return ("i486 compatible"); 4495 4496 switch (cpi->cpi_family) { 4497 case 5: 4498 switch (cpi->cpi_model) { 4499 case 0: 4500 case 1: 4501 case 2: 4502 case 3: 4503 case 4: 4504 case 5: 4505 return ("AMD-K5(r)"); 4506 case 6: 4507 case 7: 4508 return ("AMD-K6(r)"); 4509 case 8: 4510 return ("AMD-K6(r)-2"); 4511 case 9: 4512 return ("AMD-K6(r)-III"); 4513 default: 4514 return ("AMD (family 5)"); 4515 } 4516 case 6: 4517 switch (cpi->cpi_model) { 4518 case 1: 4519 return ("AMD-K7(tm)"); 4520 case 0: 4521 case 2: 4522 case 4: 4523 return ("AMD Athlon(tm)"); 4524 case 3: 4525 case 7: 4526 return ("AMD Duron(tm)"); 4527 case 6: 4528 case 8: 4529 case 10: 4530 /* 4531 * Use the L2 cache size to distinguish 4532 */ 4533 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ? 
4534 "AMD Athlon(tm)" : "AMD Duron(tm)"); 4535 default: 4536 return ("AMD (family 6)"); 4537 } 4538 default: 4539 break; 4540 } 4541 4542 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 && 4543 cpi->cpi_brandid != 0) { 4544 switch (BITX(cpi->cpi_brandid, 7, 5)) { 4545 case 3: 4546 return ("AMD Opteron(tm) UP 1xx"); 4547 case 4: 4548 return ("AMD Opteron(tm) DP 2xx"); 4549 case 5: 4550 return ("AMD Opteron(tm) MP 8xx"); 4551 default: 4552 return ("AMD Opteron(tm)"); 4553 } 4554 } 4555 4556 return (NULL); 4557 } 4558 4559 static const char * 4560 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type) 4561 { 4562 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || 4563 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 || 4564 type == X86_TYPE_CYRIX_486) 4565 return ("i486 compatible"); 4566 4567 switch (type) { 4568 case X86_TYPE_CYRIX_6x86: 4569 return ("Cyrix 6x86"); 4570 case X86_TYPE_CYRIX_6x86L: 4571 return ("Cyrix 6x86L"); 4572 case X86_TYPE_CYRIX_6x86MX: 4573 return ("Cyrix 6x86MX"); 4574 case X86_TYPE_CYRIX_GXm: 4575 return ("Cyrix GXm"); 4576 case X86_TYPE_CYRIX_MediaGX: 4577 return ("Cyrix MediaGX"); 4578 case X86_TYPE_CYRIX_MII: 4579 return ("Cyrix M2"); 4580 case X86_TYPE_VIA_CYRIX_III: 4581 return ("VIA Cyrix M3"); 4582 default: 4583 /* 4584 * Have another wild guess .. 4585 */ 4586 if (cpi->cpi_family == 4 && cpi->cpi_model == 9) 4587 return ("Cyrix 5x86"); 4588 else if (cpi->cpi_family == 5) { 4589 switch (cpi->cpi_model) { 4590 case 2: 4591 return ("Cyrix 6x86"); /* Cyrix M1 */ 4592 case 4: 4593 return ("Cyrix MediaGX"); 4594 default: 4595 break; 4596 } 4597 } else if (cpi->cpi_family == 6) { 4598 switch (cpi->cpi_model) { 4599 case 0: 4600 return ("Cyrix 6x86MX"); /* Cyrix M2? */ 4601 case 5: 4602 case 6: 4603 case 7: 4604 case 8: 4605 case 9: 4606 return ("VIA C3"); 4607 default: 4608 break; 4609 } 4610 } 4611 break; 4612 } 4613 return (NULL); 4614 } 4615 4616 /* 4617 * This only gets called in the case that the CPU extended 4618 * feature brand string (0x80000002, 0x80000003, 0x80000004) 4619 * aren't available, or contain null bytes for some reason. 4620 */ 4621 static void 4622 fabricate_brandstr(struct cpuid_info *cpi) 4623 { 4624 const char *brand = NULL; 4625 4626 switch (cpi->cpi_vendor) { 4627 case X86_VENDOR_Intel: 4628 brand = intel_cpubrand(cpi); 4629 break; 4630 case X86_VENDOR_AMD: 4631 brand = amd_cpubrand(cpi); 4632 break; 4633 case X86_VENDOR_Cyrix: 4634 brand = cyrix_cpubrand(cpi, x86_type); 4635 break; 4636 case X86_VENDOR_NexGen: 4637 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 4638 brand = "NexGen Nx586"; 4639 break; 4640 case X86_VENDOR_Centaur: 4641 if (cpi->cpi_family == 5) 4642 switch (cpi->cpi_model) { 4643 case 4: 4644 brand = "Centaur C6"; 4645 break; 4646 case 8: 4647 brand = "Centaur C2"; 4648 break; 4649 case 9: 4650 brand = "Centaur C3"; 4651 break; 4652 default: 4653 break; 4654 } 4655 break; 4656 case X86_VENDOR_Rise: 4657 if (cpi->cpi_family == 5 && 4658 (cpi->cpi_model == 0 || cpi->cpi_model == 2)) 4659 brand = "Rise mP6"; 4660 break; 4661 case X86_VENDOR_SiS: 4662 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 4663 brand = "SiS 55x"; 4664 break; 4665 case X86_VENDOR_TM: 4666 if (cpi->cpi_family == 5 && cpi->cpi_model == 4) 4667 brand = "Transmeta Crusoe TM3x00 or TM5x00"; 4668 break; 4669 case X86_VENDOR_NSC: 4670 case X86_VENDOR_UMC: 4671 default: 4672 break; 4673 } 4674 if (brand) { 4675 (void) strcpy((char *)cpi->cpi_brandstr, brand); 4676 return; 4677 } 4678 4679 /* 4680 * If all else fails ... 
4681 */ 4682 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr), 4683 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family, 4684 cpi->cpi_model, cpi->cpi_step); 4685 } 4686 4687 /* 4688 * This routine is called just after kernel memory allocation 4689 * becomes available on cpu0, and as part of mp_startup() on 4690 * the other cpus. 4691 * 4692 * Fixup the brand string, and collect any information from cpuid 4693 * that requires dynamically allocated storage to represent. 4694 */ 4695 /*ARGSUSED*/ 4696 void 4697 cpuid_pass3(cpu_t *cpu) 4698 { 4699 int i, max, shft, level, size; 4700 struct cpuid_regs regs; 4701 struct cpuid_regs *cp; 4702 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 4703 4704 ASSERT(cpi->cpi_pass == 2); 4705 4706 /* 4707 * Deterministic cache parameters 4708 * 4709 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The 4710 * values that are present are currently defined to be the same. This 4711 * means we can use the same logic to parse it as long as we use the 4712 * appropriate leaf to get the data. If you're updating this, make sure 4713 * you're careful about which vendor supports which aspect. 4714 * 4715 * Take this opportunity to detect the number of threads sharing the 4716 * last level cache, and construct a corresponding cache id. The 4717 * respective cpuid_info members are initialized to the default case of 4718 * "no last level cache sharing". 4719 */ 4720 cpi->cpi_ncpu_shr_last_cache = 1; 4721 cpi->cpi_last_lvl_cacheid = cpu->cpu_id; 4722 4723 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) || 4724 (cpi->cpi_vendor == X86_VENDOR_AMD && 4725 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d && 4726 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) { 4727 uint32_t leaf; 4728 4729 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 4730 leaf = 4; 4731 } else { 4732 leaf = CPUID_LEAF_EXT_1d; 4733 } 4734 4735 /* 4736 * Find the # of elements (size) returned by the leaf and along 4737 * the way detect last level cache sharing details. 4738 */ 4739 bzero(&regs, sizeof (regs)); 4740 cp = &regs; 4741 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) { 4742 cp->cp_eax = leaf; 4743 cp->cp_ecx = i; 4744 4745 (void) __cpuid_insn(cp); 4746 4747 if (CPI_CACHE_TYPE(cp) == 0) 4748 break; 4749 level = CPI_CACHE_LVL(cp); 4750 if (level > max) { 4751 max = level; 4752 cpi->cpi_ncpu_shr_last_cache = 4753 CPI_NTHR_SHR_CACHE(cp) + 1; 4754 } 4755 } 4756 cpi->cpi_cache_leaf_size = size = i; 4757 4758 /* 4759 * Allocate the cpi_cache_leaves array. The first element 4760 * references the regs for the corresponding leaf with %ecx set 4761 * to 0. This was gathered in cpuid_pass2(). 4762 */ 4763 if (size > 0) { 4764 cpi->cpi_cache_leaves = 4765 kmem_alloc(size * sizeof (cp), KM_SLEEP); 4766 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 4767 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4]; 4768 } else { 4769 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d]; 4770 } 4771 4772 /* 4773 * Allocate storage to hold the additional regs 4774 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size. 4775 * 4776 * The regs for the leaf, %ecx == 0 has already 4777 * been allocated as indicated above. 4778 */ 4779 for (i = 1; i < size; i++) { 4780 cp = cpi->cpi_cache_leaves[i] = 4781 kmem_zalloc(sizeof (regs), KM_SLEEP); 4782 cp->cp_eax = leaf; 4783 cp->cp_ecx = i; 4784 4785 (void) __cpuid_insn(cp); 4786 } 4787 } 4788 /* 4789 * Determine the number of bits needed to represent 4790 * the number of CPUs sharing the last level cache.
4791 * 4792 * Shift off that number of bits from the APIC id to 4793 * derive the cache id. 4794 */ 4795 shft = 0; 4796 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1) 4797 shft++; 4798 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft; 4799 } 4800 4801 /* 4802 * Now fixup the brand string 4803 */ 4804 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) { 4805 fabricate_brandstr(cpi); 4806 } else { 4807 4808 /* 4809 * If we successfully extracted a brand string from the cpuid 4810 * instruction, clean it up by removing leading spaces and 4811 * similar junk. 4812 */ 4813 if (cpi->cpi_brandstr[0]) { 4814 size_t maxlen = sizeof (cpi->cpi_brandstr); 4815 char *src, *dst; 4816 4817 dst = src = (char *)cpi->cpi_brandstr; 4818 src[maxlen - 1] = '\0'; 4819 /* 4820 * strip leading spaces 4821 */ 4822 while (*src == ' ') 4823 src++; 4824 /* 4825 * Remove any 'Genuine' or "Authentic" prefixes 4826 */ 4827 if (strncmp(src, "Genuine ", 8) == 0) 4828 src += 8; 4829 if (strncmp(src, "Authentic ", 10) == 0) 4830 src += 10; 4831 4832 /* 4833 * Now do an in-place copy. 4834 * Map (R) to (r) and (TM) to (tm). 4835 * The era of teletypes is long gone, and there's 4836 * -really- no need to shout. 4837 */ 4838 while (*src != '\0') { 4839 if (src[0] == '(') { 4840 if (strncmp(src + 1, "R)", 2) == 0) { 4841 (void) strncpy(dst, "(r)", 3); 4842 src += 3; 4843 dst += 3; 4844 continue; 4845 } 4846 if (strncmp(src + 1, "TM)", 3) == 0) { 4847 (void) strncpy(dst, "(tm)", 4); 4848 src += 4; 4849 dst += 4; 4850 continue; 4851 } 4852 } 4853 *dst++ = *src++; 4854 } 4855 *dst = '\0'; 4856 4857 /* 4858 * Finally, remove any trailing spaces 4859 */ 4860 while (--dst > cpi->cpi_brandstr) 4861 if (*dst == ' ') 4862 *dst = '\0'; 4863 else 4864 break; 4865 } else 4866 fabricate_brandstr(cpi); 4867 } 4868 cpi->cpi_pass = 3; 4869 } 4870 4871 /* 4872 * This routine is called out of bind_hwcap() much later in the life 4873 * of the kernel (post_startup()). The job of this routine is to resolve 4874 * the hardware feature support and kernel support for those features into 4875 * what we're actually going to tell applications via the aux vector. 
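 *
 * (Illustrative sketch, not part of the original comment: a userland
 * consumer would typically pick these bits up via getisax(3C), e.g.
 *
 *	uint32_t av[2] = { 0 };
 *	(void) getisax(av, 2);
 *	if (av[0] & AV_386_SSE4_2)
 *		use_sse42_path();
 *
 * where use_sse42_path() is a hypothetical application function and the
 * AV_386_* values come from <sys/auxv_386.h>.)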
4876 */ 4877 void 4878 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out) 4879 { 4880 struct cpuid_info *cpi; 4881 uint_t hwcap_flags = 0, hwcap_flags_2 = 0; 4882 4883 if (cpu == NULL) 4884 cpu = CPU; 4885 cpi = cpu->cpu_m.mcpu_cpi; 4886 4887 ASSERT(cpi->cpi_pass == 3); 4888 4889 if (cpi->cpi_maxeax >= 1) { 4890 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES]; 4891 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES]; 4892 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES]; 4893 4894 *edx = CPI_FEATURES_EDX(cpi); 4895 *ecx = CPI_FEATURES_ECX(cpi); 4896 *ebx = CPI_FEATURES_7_0_EBX(cpi); 4897 4898 /* 4899 * [these require explicit kernel support] 4900 */ 4901 if (!is_x86_feature(x86_featureset, X86FSET_SEP)) 4902 *edx &= ~CPUID_INTC_EDX_SEP; 4903 4904 if (!is_x86_feature(x86_featureset, X86FSET_SSE)) 4905 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE); 4906 if (!is_x86_feature(x86_featureset, X86FSET_SSE2)) 4907 *edx &= ~CPUID_INTC_EDX_SSE2; 4908 4909 if (!is_x86_feature(x86_featureset, X86FSET_HTT)) 4910 *edx &= ~CPUID_INTC_EDX_HTT; 4911 4912 if (!is_x86_feature(x86_featureset, X86FSET_SSE3)) 4913 *ecx &= ~CPUID_INTC_ECX_SSE3; 4914 4915 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3)) 4916 *ecx &= ~CPUID_INTC_ECX_SSSE3; 4917 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1)) 4918 *ecx &= ~CPUID_INTC_ECX_SSE4_1; 4919 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2)) 4920 *ecx &= ~CPUID_INTC_ECX_SSE4_2; 4921 if (!is_x86_feature(x86_featureset, X86FSET_AES)) 4922 *ecx &= ~CPUID_INTC_ECX_AES; 4923 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ)) 4924 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ; 4925 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) 4926 *ecx &= ~(CPUID_INTC_ECX_XSAVE | 4927 CPUID_INTC_ECX_OSXSAVE); 4928 if (!is_x86_feature(x86_featureset, X86FSET_AVX)) 4929 *ecx &= ~CPUID_INTC_ECX_AVX; 4930 if (!is_x86_feature(x86_featureset, X86FSET_F16C)) 4931 *ecx &= ~CPUID_INTC_ECX_F16C; 4932 if (!is_x86_feature(x86_featureset, X86FSET_FMA)) 4933 *ecx &= ~CPUID_INTC_ECX_FMA; 4934 if (!is_x86_feature(x86_featureset, X86FSET_BMI1)) 4935 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 4936 if (!is_x86_feature(x86_featureset, X86FSET_BMI2)) 4937 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 4938 if (!is_x86_feature(x86_featureset, X86FSET_AVX2)) 4939 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 4940 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED)) 4941 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED; 4942 if (!is_x86_feature(x86_featureset, X86FSET_ADX)) 4943 *ebx &= ~CPUID_INTC_EBX_7_0_ADX; 4944 4945 /* 4946 * [no explicit support required beyond x87 fp context] 4947 */ 4948 if (!fpu_exists) 4949 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX); 4950 4951 /* 4952 * Now map the supported feature vector to things that we 4953 * think userland will care about. 
4954 */ 4955 if (*edx & CPUID_INTC_EDX_SEP) 4956 hwcap_flags |= AV_386_SEP; 4957 if (*edx & CPUID_INTC_EDX_SSE) 4958 hwcap_flags |= AV_386_FXSR | AV_386_SSE; 4959 if (*edx & CPUID_INTC_EDX_SSE2) 4960 hwcap_flags |= AV_386_SSE2; 4961 if (*ecx & CPUID_INTC_ECX_SSE3) 4962 hwcap_flags |= AV_386_SSE3; 4963 if (*ecx & CPUID_INTC_ECX_SSSE3) 4964 hwcap_flags |= AV_386_SSSE3; 4965 if (*ecx & CPUID_INTC_ECX_SSE4_1) 4966 hwcap_flags |= AV_386_SSE4_1; 4967 if (*ecx & CPUID_INTC_ECX_SSE4_2) 4968 hwcap_flags |= AV_386_SSE4_2; 4969 if (*ecx & CPUID_INTC_ECX_MOVBE) 4970 hwcap_flags |= AV_386_MOVBE; 4971 if (*ecx & CPUID_INTC_ECX_AES) 4972 hwcap_flags |= AV_386_AES; 4973 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ) 4974 hwcap_flags |= AV_386_PCLMULQDQ; 4975 if ((*ecx & CPUID_INTC_ECX_XSAVE) && 4976 (*ecx & CPUID_INTC_ECX_OSXSAVE)) { 4977 hwcap_flags |= AV_386_XSAVE; 4978 4979 if (*ecx & CPUID_INTC_ECX_AVX) { 4980 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi); 4981 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi); 4982 4983 hwcap_flags |= AV_386_AVX; 4984 if (*ecx & CPUID_INTC_ECX_F16C) 4985 hwcap_flags_2 |= AV_386_2_F16C; 4986 if (*ecx & CPUID_INTC_ECX_FMA) 4987 hwcap_flags_2 |= AV_386_2_FMA; 4988 4989 if (*ebx & CPUID_INTC_EBX_7_0_BMI1) 4990 hwcap_flags_2 |= AV_386_2_BMI1; 4991 if (*ebx & CPUID_INTC_EBX_7_0_BMI2) 4992 hwcap_flags_2 |= AV_386_2_BMI2; 4993 if (*ebx & CPUID_INTC_EBX_7_0_AVX2) 4994 hwcap_flags_2 |= AV_386_2_AVX2; 4995 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F) 4996 hwcap_flags_2 |= AV_386_2_AVX512F; 4997 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ) 4998 hwcap_flags_2 |= AV_386_2_AVX512DQ; 4999 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA) 5000 hwcap_flags_2 |= AV_386_2_AVX512IFMA; 5001 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF) 5002 hwcap_flags_2 |= AV_386_2_AVX512PF; 5003 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER) 5004 hwcap_flags_2 |= AV_386_2_AVX512ER; 5005 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD) 5006 hwcap_flags_2 |= AV_386_2_AVX512CD; 5007 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW) 5008 hwcap_flags_2 |= AV_386_2_AVX512BW; 5009 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL) 5010 hwcap_flags_2 |= AV_386_2_AVX512VL; 5011 5012 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI) 5013 hwcap_flags_2 |= AV_386_2_AVX512VBMI; 5014 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI) 5015 hwcap_flags_2 |= AV_386_2_AVX512_VNNI; 5016 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 5017 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ; 5018 5019 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW) 5020 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW; 5021 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS) 5022 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS; 5023 } 5024 } 5025 if (*ecx & CPUID_INTC_ECX_VMX) 5026 hwcap_flags |= AV_386_VMX; 5027 if (*ecx & CPUID_INTC_ECX_POPCNT) 5028 hwcap_flags |= AV_386_POPCNT; 5029 if (*edx & CPUID_INTC_EDX_FPU) 5030 hwcap_flags |= AV_386_FPU; 5031 if (*edx & CPUID_INTC_EDX_MMX) 5032 hwcap_flags |= AV_386_MMX; 5033 5034 if (*edx & CPUID_INTC_EDX_TSC) 5035 hwcap_flags |= AV_386_TSC; 5036 if (*edx & CPUID_INTC_EDX_CX8) 5037 hwcap_flags |= AV_386_CX8; 5038 if (*edx & CPUID_INTC_EDX_CMOV) 5039 hwcap_flags |= AV_386_CMOV; 5040 if (*ecx & CPUID_INTC_ECX_CX16) 5041 hwcap_flags |= AV_386_CX16; 5042 5043 if (*ecx & CPUID_INTC_ECX_RDRAND) 5044 hwcap_flags_2 |= AV_386_2_RDRAND; 5045 if (*ebx & CPUID_INTC_EBX_7_0_ADX) 5046 hwcap_flags_2 |= AV_386_2_ADX; 5047 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED) 5048 hwcap_flags_2 |= AV_386_2_RDSEED; 5049 if (*ebx & CPUID_INTC_EBX_7_0_SHA) 5050 hwcap_flags_2 |= AV_386_2_SHA; 5051 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 5052 hwcap_flags_2 |= 
AV_386_2_FSGSBASE; 5053 if (*ebx & CPUID_INTC_EBX_7_0_CLWB) 5054 hwcap_flags_2 |= AV_386_2_CLWB; 5055 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 5056 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT; 5057 5058 } 5059 /* 5060 * Check a few miscellaneous features. 5061 */ 5062 if (is_x86_feature(x86_featureset, X86FSET_CLZERO)) 5063 hwcap_flags_2 |= AV_386_2_CLZERO; 5064 5065 if (cpi->cpi_xmaxeax < 0x80000001) 5066 goto pass4_done; 5067 5068 switch (cpi->cpi_vendor) { 5069 struct cpuid_regs cp; 5070 uint32_t *edx, *ecx; 5071 5072 case X86_VENDOR_Intel: 5073 /* 5074 * Seems like Intel duplicated what was necessary 5075 * here to make the initial crop of 64-bit OS's work. 5076 * Hopefully, those are the only "extended" bits 5077 * they'll add. 5078 */ 5079 /*FALLTHROUGH*/ 5080 5081 case X86_VENDOR_AMD: 5082 edx = &cpi->cpi_support[AMD_EDX_FEATURES]; 5083 ecx = &cpi->cpi_support[AMD_ECX_FEATURES]; 5084 5085 *edx = CPI_FEATURES_XTD_EDX(cpi); 5086 *ecx = CPI_FEATURES_XTD_ECX(cpi); 5087 5088 /* 5089 * [these features require explicit kernel support] 5090 */ 5091 switch (cpi->cpi_vendor) { 5092 case X86_VENDOR_Intel: 5093 if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) 5094 *edx &= ~CPUID_AMD_EDX_TSCP; 5095 break; 5096 5097 case X86_VENDOR_AMD: 5098 if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) 5099 *edx &= ~CPUID_AMD_EDX_TSCP; 5100 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A)) 5101 *ecx &= ~CPUID_AMD_ECX_SSE4A; 5102 break; 5103 5104 default: 5105 break; 5106 } 5107 5108 /* 5109 * [no explicit support required beyond 5110 * x87 fp context and exception handlers] 5111 */ 5112 if (!fpu_exists) 5113 *edx &= ~(CPUID_AMD_EDX_MMXamd | 5114 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx); 5115 5116 if (!is_x86_feature(x86_featureset, X86FSET_NX)) 5117 *edx &= ~CPUID_AMD_EDX_NX; 5118 #if !defined(__amd64) 5119 *edx &= ~CPUID_AMD_EDX_LM; 5120 #endif 5121 /* 5122 * Now map the supported feature vector to 5123 * things that we think userland will care about. 5124 */ 5125 #if defined(__amd64) 5126 if (*edx & CPUID_AMD_EDX_SYSC) 5127 hwcap_flags |= AV_386_AMD_SYSC; 5128 #endif 5129 if (*edx & CPUID_AMD_EDX_MMXamd) 5130 hwcap_flags |= AV_386_AMD_MMX; 5131 if (*edx & CPUID_AMD_EDX_3DNow) 5132 hwcap_flags |= AV_386_AMD_3DNow; 5133 if (*edx & CPUID_AMD_EDX_3DNowx) 5134 hwcap_flags |= AV_386_AMD_3DNowx; 5135 if (*ecx & CPUID_AMD_ECX_SVM) 5136 hwcap_flags |= AV_386_AMD_SVM; 5137 5138 switch (cpi->cpi_vendor) { 5139 case X86_VENDOR_AMD: 5140 if (*edx & CPUID_AMD_EDX_TSCP) 5141 hwcap_flags |= AV_386_TSCP; 5142 if (*ecx & CPUID_AMD_ECX_AHF64) 5143 hwcap_flags |= AV_386_AHF; 5144 if (*ecx & CPUID_AMD_ECX_SSE4A) 5145 hwcap_flags |= AV_386_AMD_SSE4A; 5146 if (*ecx & CPUID_AMD_ECX_LZCNT) 5147 hwcap_flags |= AV_386_AMD_LZCNT; 5148 if (*ecx & CPUID_AMD_ECX_MONITORX) 5149 hwcap_flags_2 |= AV_386_2_MONITORX; 5150 break; 5151 5152 case X86_VENDOR_Intel: 5153 if (*edx & CPUID_AMD_EDX_TSCP) 5154 hwcap_flags |= AV_386_TSCP; 5155 if (*ecx & CPUID_AMD_ECX_LZCNT) 5156 hwcap_flags |= AV_386_AMD_LZCNT; 5157 /* 5158 * Aarrgh. 5159 * Intel uses a different bit in the same word.
5160 */ 5161 if (*ecx & CPUID_INTC_ECX_AHF64) 5162 hwcap_flags |= AV_386_AHF; 5163 break; 5164 5165 default: 5166 break; 5167 } 5168 break; 5169 5170 case X86_VENDOR_TM: 5171 cp.cp_eax = 0x80860001; 5172 (void) __cpuid_insn(&cp); 5173 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx; 5174 break; 5175 5176 default: 5177 break; 5178 } 5179 5180 pass4_done: 5181 cpi->cpi_pass = 4; 5182 if (hwcap_out != NULL) { 5183 hwcap_out[0] = hwcap_flags; 5184 hwcap_out[1] = hwcap_flags_2; 5185 } 5186 } 5187 5188 5189 /* 5190 * Simulate the cpuid instruction using the data we previously 5191 * captured about this CPU. We try our best to return the truth 5192 * about the hardware, independently of kernel support. 5193 */ 5194 uint32_t 5195 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp) 5196 { 5197 struct cpuid_info *cpi; 5198 struct cpuid_regs *xcp; 5199 5200 if (cpu == NULL) 5201 cpu = CPU; 5202 cpi = cpu->cpu_m.mcpu_cpi; 5203 5204 ASSERT(cpuid_checkpass(cpu, 3)); 5205 5206 /* 5207 * CPUID data is cached in two separate places: cpi_std for standard 5208 * CPUID leaves , and cpi_extd for extended CPUID leaves. 5209 */ 5210 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) { 5211 xcp = &cpi->cpi_std[cp->cp_eax]; 5212 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 && 5213 cp->cp_eax <= cpi->cpi_xmaxeax && 5214 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) { 5215 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0]; 5216 } else { 5217 /* 5218 * The caller is asking for data from an input parameter which 5219 * the kernel has not cached. In this case we go fetch from 5220 * the hardware and return the data directly to the user. 5221 */ 5222 return (__cpuid_insn(cp)); 5223 } 5224 5225 cp->cp_eax = xcp->cp_eax; 5226 cp->cp_ebx = xcp->cp_ebx; 5227 cp->cp_ecx = xcp->cp_ecx; 5228 cp->cp_edx = xcp->cp_edx; 5229 return (cp->cp_eax); 5230 } 5231 5232 int 5233 cpuid_checkpass(cpu_t *cpu, int pass) 5234 { 5235 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL && 5236 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass); 5237 } 5238 5239 int 5240 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n) 5241 { 5242 ASSERT(cpuid_checkpass(cpu, 3)); 5243 5244 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr)); 5245 } 5246 5247 int 5248 cpuid_is_cmt(cpu_t *cpu) 5249 { 5250 if (cpu == NULL) 5251 cpu = CPU; 5252 5253 ASSERT(cpuid_checkpass(cpu, 1)); 5254 5255 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0); 5256 } 5257 5258 /* 5259 * AMD and Intel both implement the 64-bit variant of the syscall 5260 * instruction (syscallq), so if there's -any- support for syscall, 5261 * cpuid currently says "yes, we support this". 5262 * 5263 * However, Intel decided to -not- implement the 32-bit variant of the 5264 * syscall instruction, so we provide a predicate to allow our caller 5265 * to test that subtlety here. 5266 * 5267 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor, 5268 * even in the case where the hardware would in fact support it. 5269 */ 5270 /*ARGSUSED*/ 5271 int 5272 cpuid_syscall32_insn(cpu_t *cpu) 5273 { 5274 ASSERT(cpuid_checkpass((cpu == NULL ? 
CPU : cpu), 1)); 5275 5276 #if !defined(__xpv) 5277 if (cpu == NULL) 5278 cpu = CPU; 5279 5280 /*CSTYLED*/ 5281 { 5282 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5283 5284 if (cpi->cpi_vendor == X86_VENDOR_AMD && 5285 cpi->cpi_xmaxeax >= 0x80000001 && 5286 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC)) 5287 return (1); 5288 } 5289 #endif 5290 return (0); 5291 } 5292 5293 int 5294 cpuid_getidstr(cpu_t *cpu, char *s, size_t n) 5295 { 5296 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5297 5298 static const char fmt[] = 5299 "x86 (%s %X family %d model %d step %d clock %d MHz)"; 5300 static const char fmt_ht[] = 5301 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)"; 5302 5303 ASSERT(cpuid_checkpass(cpu, 1)); 5304 5305 if (cpuid_is_cmt(cpu)) 5306 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid, 5307 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5308 cpi->cpi_family, cpi->cpi_model, 5309 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5310 return (snprintf(s, n, fmt, 5311 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5312 cpi->cpi_family, cpi->cpi_model, 5313 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5314 } 5315 5316 const char * 5317 cpuid_getvendorstr(cpu_t *cpu) 5318 { 5319 ASSERT(cpuid_checkpass(cpu, 1)); 5320 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr); 5321 } 5322 5323 uint_t 5324 cpuid_getvendor(cpu_t *cpu) 5325 { 5326 ASSERT(cpuid_checkpass(cpu, 1)); 5327 return (cpu->cpu_m.mcpu_cpi->cpi_vendor); 5328 } 5329 5330 uint_t 5331 cpuid_getfamily(cpu_t *cpu) 5332 { 5333 ASSERT(cpuid_checkpass(cpu, 1)); 5334 return (cpu->cpu_m.mcpu_cpi->cpi_family); 5335 } 5336 5337 uint_t 5338 cpuid_getmodel(cpu_t *cpu) 5339 { 5340 ASSERT(cpuid_checkpass(cpu, 1)); 5341 return (cpu->cpu_m.mcpu_cpi->cpi_model); 5342 } 5343 5344 uint_t 5345 cpuid_get_ncpu_per_chip(cpu_t *cpu) 5346 { 5347 ASSERT(cpuid_checkpass(cpu, 1)); 5348 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip); 5349 } 5350 5351 uint_t 5352 cpuid_get_ncore_per_chip(cpu_t *cpu) 5353 { 5354 ASSERT(cpuid_checkpass(cpu, 1)); 5355 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip); 5356 } 5357 5358 uint_t 5359 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu) 5360 { 5361 ASSERT(cpuid_checkpass(cpu, 2)); 5362 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache); 5363 } 5364 5365 id_t 5366 cpuid_get_last_lvl_cacheid(cpu_t *cpu) 5367 { 5368 ASSERT(cpuid_checkpass(cpu, 2)); 5369 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5370 } 5371 5372 uint_t 5373 cpuid_getstep(cpu_t *cpu) 5374 { 5375 ASSERT(cpuid_checkpass(cpu, 1)); 5376 return (cpu->cpu_m.mcpu_cpi->cpi_step); 5377 } 5378 5379 uint_t 5380 cpuid_getsig(struct cpu *cpu) 5381 { 5382 ASSERT(cpuid_checkpass(cpu, 1)); 5383 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax); 5384 } 5385 5386 uint32_t 5387 cpuid_getchiprev(struct cpu *cpu) 5388 { 5389 ASSERT(cpuid_checkpass(cpu, 1)); 5390 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev); 5391 } 5392 5393 const char * 5394 cpuid_getchiprevstr(struct cpu *cpu) 5395 { 5396 ASSERT(cpuid_checkpass(cpu, 1)); 5397 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr); 5398 } 5399 5400 uint32_t 5401 cpuid_getsockettype(struct cpu *cpu) 5402 { 5403 ASSERT(cpuid_checkpass(cpu, 1)); 5404 return (cpu->cpu_m.mcpu_cpi->cpi_socket); 5405 } 5406 5407 const char * 5408 cpuid_getsocketstr(cpu_t *cpu) 5409 { 5410 static const char *socketstr = NULL; 5411 struct cpuid_info *cpi; 5412 5413 ASSERT(cpuid_checkpass(cpu, 1)); 5414 cpi = cpu->cpu_m.mcpu_cpi; 5415 5416 /* Assume that socket types are the same across the system */ 5417 if (socketstr == NULL) 5418 socketstr 
= _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family, 5419 cpi->cpi_model, cpi->cpi_step); 5420 5421 5422 return (socketstr); 5423 } 5424 5425 int 5426 cpuid_get_chipid(cpu_t *cpu) 5427 { 5428 ASSERT(cpuid_checkpass(cpu, 1)); 5429 5430 if (cpuid_is_cmt(cpu)) 5431 return (cpu->cpu_m.mcpu_cpi->cpi_chipid); 5432 return (cpu->cpu_id); 5433 } 5434 5435 id_t 5436 cpuid_get_coreid(cpu_t *cpu) 5437 { 5438 ASSERT(cpuid_checkpass(cpu, 1)); 5439 return (cpu->cpu_m.mcpu_cpi->cpi_coreid); 5440 } 5441 5442 int 5443 cpuid_get_pkgcoreid(cpu_t *cpu) 5444 { 5445 ASSERT(cpuid_checkpass(cpu, 1)); 5446 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid); 5447 } 5448 5449 int 5450 cpuid_get_clogid(cpu_t *cpu) 5451 { 5452 ASSERT(cpuid_checkpass(cpu, 1)); 5453 return (cpu->cpu_m.mcpu_cpi->cpi_clogid); 5454 } 5455 5456 int 5457 cpuid_get_cacheid(cpu_t *cpu) 5458 { 5459 ASSERT(cpuid_checkpass(cpu, 1)); 5460 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5461 } 5462 5463 uint_t 5464 cpuid_get_procnodeid(cpu_t *cpu) 5465 { 5466 ASSERT(cpuid_checkpass(cpu, 1)); 5467 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid); 5468 } 5469 5470 uint_t 5471 cpuid_get_procnodes_per_pkg(cpu_t *cpu) 5472 { 5473 ASSERT(cpuid_checkpass(cpu, 1)); 5474 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg); 5475 } 5476 5477 uint_t 5478 cpuid_get_compunitid(cpu_t *cpu) 5479 { 5480 ASSERT(cpuid_checkpass(cpu, 1)); 5481 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid); 5482 } 5483 5484 uint_t 5485 cpuid_get_cores_per_compunit(cpu_t *cpu) 5486 { 5487 ASSERT(cpuid_checkpass(cpu, 1)); 5488 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit); 5489 } 5490 5491 /*ARGSUSED*/ 5492 int 5493 cpuid_have_cr8access(cpu_t *cpu) 5494 { 5495 #if defined(__amd64) 5496 return (1); 5497 #else 5498 struct cpuid_info *cpi; 5499 5500 ASSERT(cpu != NULL); 5501 cpi = cpu->cpu_m.mcpu_cpi; 5502 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 && 5503 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0) 5504 return (1); 5505 return (0); 5506 #endif 5507 } 5508 5509 uint32_t 5510 cpuid_get_apicid(cpu_t *cpu) 5511 { 5512 ASSERT(cpuid_checkpass(cpu, 1)); 5513 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) { 5514 return (UINT32_MAX); 5515 } else { 5516 return (cpu->cpu_m.mcpu_cpi->cpi_apicid); 5517 } 5518 } 5519 5520 void 5521 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits) 5522 { 5523 struct cpuid_info *cpi; 5524 5525 if (cpu == NULL) 5526 cpu = CPU; 5527 cpi = cpu->cpu_m.mcpu_cpi; 5528 5529 ASSERT(cpuid_checkpass(cpu, 1)); 5530 5531 if (pabits) 5532 *pabits = cpi->cpi_pabits; 5533 if (vabits) 5534 *vabits = cpi->cpi_vabits; 5535 } 5536 5537 size_t 5538 cpuid_get_xsave_size() 5539 { 5540 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size, 5541 sizeof (struct xsave_state))); 5542 } 5543 5544 /* 5545 * Return true if the CPUs on this system require 'pointer clearing' for the 5546 * floating point error pointer exception handling. In the past, this has been 5547 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to 5548 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO 5549 * feature bit and is reflected in the cpi_fp_amd_save member. 5550 */ 5551 boolean_t 5552 cpuid_need_fp_excp_handling() 5553 { 5554 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD && 5555 cpuid_info0.cpi_fp_amd_save != 0); 5556 } 5557 5558 /* 5559 * Returns the number of data TLB entries for a corresponding 5560 * pagesize. If it can't be computed, or isn't known, the 5561 * routine returns zero. 
If you ask about an architecturally 5562 * impossible pagesize, the routine will panic (so that the 5563 * hat implementor knows that things are inconsistent.) 5564 */ 5565 uint_t 5566 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize) 5567 { 5568 struct cpuid_info *cpi; 5569 uint_t dtlb_nent = 0; 5570 5571 if (cpu == NULL) 5572 cpu = CPU; 5573 cpi = cpu->cpu_m.mcpu_cpi; 5574 5575 ASSERT(cpuid_checkpass(cpu, 1)); 5576 5577 /* 5578 * Check the L2 TLB info 5579 */ 5580 if (cpi->cpi_xmaxeax >= 0x80000006) { 5581 struct cpuid_regs *cp = &cpi->cpi_extd[6]; 5582 5583 switch (pagesize) { 5584 5585 case 4 * 1024: 5586 /* 5587 * All zero in the top 16 bits of the register 5588 * indicates a unified TLB. Size is in low 16 bits. 5589 */ 5590 if ((cp->cp_ebx & 0xffff0000) == 0) 5591 dtlb_nent = cp->cp_ebx & 0x0000ffff; 5592 else 5593 dtlb_nent = BITX(cp->cp_ebx, 27, 16); 5594 break; 5595 5596 case 2 * 1024 * 1024: 5597 if ((cp->cp_eax & 0xffff0000) == 0) 5598 dtlb_nent = cp->cp_eax & 0x0000ffff; 5599 else 5600 dtlb_nent = BITX(cp->cp_eax, 27, 16); 5601 break; 5602 5603 default: 5604 panic("unknown L2 pagesize"); 5605 /*NOTREACHED*/ 5606 } 5607 } 5608 5609 if (dtlb_nent != 0) 5610 return (dtlb_nent); 5611 5612 /* 5613 * No L2 TLB support for this size, try L1. 5614 */ 5615 if (cpi->cpi_xmaxeax >= 0x80000005) { 5616 struct cpuid_regs *cp = &cpi->cpi_extd[5]; 5617 5618 switch (pagesize) { 5619 case 4 * 1024: 5620 dtlb_nent = BITX(cp->cp_ebx, 23, 16); 5621 break; 5622 case 2 * 1024 * 1024: 5623 dtlb_nent = BITX(cp->cp_eax, 23, 16); 5624 break; 5625 default: 5626 panic("unknown L1 d-TLB pagesize"); 5627 /*NOTREACHED*/ 5628 } 5629 } 5630 5631 return (dtlb_nent); 5632 } 5633 5634 /* 5635 * Return 0 if the erratum is not present or not applicable, positive 5636 * if it is, and negative if the status of the erratum is unknown. 5637 * 5638 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm) 5639 * Processors" #25759, Rev 3.57, August 2005 5640 */ 5641 int 5642 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum) 5643 { 5644 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5645 uint_t eax; 5646 5647 /* 5648 * Bail out if this CPU isn't an AMD CPU, or if it's 5649 * a legacy (32-bit) AMD CPU. 
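 *
 * The SH_B0()/JH_E6()-style macros below match raw leaf 1 %eax
 * signatures of specific silicon revisions. As a worked example of the
 * standard family/model/stepping encoding, the value 0x20f12 accepted
 * by JH_E6() breaks down as:
 *
 *	stepping	= eax[3:0]	= 0x2
 *	base model	= eax[7:4]	= 0x1
 *	base family	= eax[11:8]	= 0xf
 *	extended model	= eax[19:16]	= 0x2
 *	extended family	= eax[27:20]	= 0x0
 *
 * i.e. family 0xf, model 0x21, stepping 2: a revision E6 K8 part.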
5650 */ 5651 if (cpi->cpi_vendor != X86_VENDOR_AMD || 5652 cpi->cpi_family == 4 || cpi->cpi_family == 5 || 5653 cpi->cpi_family == 6) { 5654 return (0); 5655 } 5656 5657 eax = cpi->cpi_std[1].cp_eax; 5658 5659 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50) 5660 #define SH_B3(eax) (eax == 0xf51) 5661 #define B(eax) (SH_B0(eax) || SH_B3(eax)) 5662 5663 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58) 5664 5665 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a) 5666 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0) 5667 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2) 5668 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax)) 5669 5670 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70) 5671 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0) 5672 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0) 5673 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax)) 5674 5675 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70) 5676 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */ 5677 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0) 5678 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71) 5679 #define BH_E4(eax) (eax == 0x20fb1) 5680 #define SH_E5(eax) (eax == 0x20f42) 5681 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2) 5682 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32) 5683 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \ 5684 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \ 5685 DH_E6(eax) || JH_E6(eax)) 5686 5687 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02) 5688 #define DR_B0(eax) (eax == 0x100f20) 5689 #define DR_B1(eax) (eax == 0x100f21) 5690 #define DR_BA(eax) (eax == 0x100f2a) 5691 #define DR_B2(eax) (eax == 0x100f22) 5692 #define DR_B3(eax) (eax == 0x100f23) 5693 #define RB_C0(eax) (eax == 0x100f40) 5694 5695 switch (erratum) { 5696 case 1: 5697 return (cpi->cpi_family < 0x10); 5698 case 51: /* what does the asterisk mean? 
*/ 5699 return (B(eax) || SH_C0(eax) || CG(eax)); 5700 case 52: 5701 return (B(eax)); 5702 case 57: 5703 return (cpi->cpi_family <= 0x11); 5704 case 58: 5705 return (B(eax)); 5706 case 60: 5707 return (cpi->cpi_family <= 0x11); 5708 case 61: 5709 case 62: 5710 case 63: 5711 case 64: 5712 case 65: 5713 case 66: 5714 case 68: 5715 case 69: 5716 case 70: 5717 case 71: 5718 return (B(eax)); 5719 case 72: 5720 return (SH_B0(eax)); 5721 case 74: 5722 return (B(eax)); 5723 case 75: 5724 return (cpi->cpi_family < 0x10); 5725 case 76: 5726 return (B(eax)); 5727 case 77: 5728 return (cpi->cpi_family <= 0x11); 5729 case 78: 5730 return (B(eax) || SH_C0(eax)); 5731 case 79: 5732 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 5733 case 80: 5734 case 81: 5735 case 82: 5736 return (B(eax)); 5737 case 83: 5738 return (B(eax) || SH_C0(eax) || CG(eax)); 5739 case 85: 5740 return (cpi->cpi_family < 0x10); 5741 case 86: 5742 return (SH_C0(eax) || CG(eax)); 5743 case 88: 5744 #if !defined(__amd64) 5745 return (0); 5746 #else 5747 return (B(eax) || SH_C0(eax)); 5748 #endif 5749 case 89: 5750 return (cpi->cpi_family < 0x10); 5751 case 90: 5752 return (B(eax) || SH_C0(eax) || CG(eax)); 5753 case 91: 5754 case 92: 5755 return (B(eax) || SH_C0(eax)); 5756 case 93: 5757 return (SH_C0(eax)); 5758 case 94: 5759 return (B(eax) || SH_C0(eax) || CG(eax)); 5760 case 95: 5761 #if !defined(__amd64) 5762 return (0); 5763 #else 5764 return (B(eax) || SH_C0(eax)); 5765 #endif 5766 case 96: 5767 return (B(eax) || SH_C0(eax) || CG(eax)); 5768 case 97: 5769 case 98: 5770 return (SH_C0(eax) || CG(eax)); 5771 case 99: 5772 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5773 case 100: 5774 return (B(eax) || SH_C0(eax)); 5775 case 101: 5776 case 103: 5777 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5778 case 104: 5779 return (SH_C0(eax) || CG(eax) || D0(eax)); 5780 case 105: 5781 case 106: 5782 case 107: 5783 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5784 case 108: 5785 return (DH_CG(eax)); 5786 case 109: 5787 return (SH_C0(eax) || CG(eax) || D0(eax)); 5788 case 110: 5789 return (D0(eax) || EX(eax)); 5790 case 111: 5791 return (CG(eax)); 5792 case 112: 5793 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 5794 case 113: 5795 return (eax == 0x20fc0); 5796 case 114: 5797 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 5798 case 115: 5799 return (SH_E0(eax) || JH_E1(eax)); 5800 case 116: 5801 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 5802 case 117: 5803 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 5804 case 118: 5805 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) || 5806 JH_E6(eax)); 5807 case 121: 5808 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 5809 case 122: 5810 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11); 5811 case 123: 5812 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax)); 5813 case 131: 5814 return (cpi->cpi_family < 0x10); 5815 case 6336786: 5816 5817 /* 5818 * Test for AdvPowerMgmtInfo.TscPStateInvariant 5819 * if this is a K8 family or newer processor. We're testing for 5820 * this 'erratum' to determine whether or not we have a constant 5821 * TSC. 5822 * 5823 * Our current fix for this is to disable the C1-Clock ramping. 5824 * However, this doesn't work on newer processor families nor 5825 * does it work when virtualized as those devices don't exist. 
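 *
 * The probe below simply reads bit 8 (TscInvariant) of leaf 0x80000007
 * %edx; the 'erratum' is reported when that bit is clear, i.e. the TSC
 * is not constant. A minimal standalone sketch of the same check,
 * assuming a compiler that provides GCC-style <cpuid.h> helpers:
 *
 *	#include <cpuid.h>	// __get_cpuid()
 *
 *	static int
 *	tsc_is_invariant(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx;
 *
 *		// Leaf 0x80000007: Advanced Power Management Information.
 *		if (__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx) == 0)
 *			return (0);
 *		return ((edx & (1U << 8)) != 0);	// TscInvariant
 *	}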
5826 */ 5827 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) { 5828 return (0); 5829 } 5830 5831 if (CPI_FAMILY(cpi) == 0xf) { 5832 struct cpuid_regs regs; 5833 regs.cp_eax = 0x80000007; 5834 (void) __cpuid_insn(®s); 5835 return (!(regs.cp_edx & 0x100)); 5836 } 5837 return (0); 5838 case 6323525: 5839 /* 5840 * This erratum (K8 #147) is not present on family 10 and newer. 5841 */ 5842 if (cpi->cpi_family >= 0x10) { 5843 return (0); 5844 } 5845 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) | 5846 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40); 5847 5848 case 6671130: 5849 /* 5850 * check for processors (pre-Shanghai) that do not provide 5851 * optimal management of 1gb ptes in its tlb. 5852 */ 5853 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4); 5854 5855 case 298: 5856 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) || 5857 DR_B2(eax) || RB_C0(eax)); 5858 5859 case 721: 5860 #if defined(__amd64) 5861 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12); 5862 #else 5863 return (0); 5864 #endif 5865 5866 default: 5867 return (-1); 5868 5869 } 5870 } 5871 5872 /* 5873 * Determine if specified erratum is present via OSVW (OS Visible Workaround). 5874 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate. 5875 */ 5876 int 5877 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum) 5878 { 5879 struct cpuid_info *cpi; 5880 uint_t osvwid; 5881 static int osvwfeature = -1; 5882 uint64_t osvwlength; 5883 5884 5885 cpi = cpu->cpu_m.mcpu_cpi; 5886 5887 /* confirm OSVW supported */ 5888 if (osvwfeature == -1) { 5889 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW; 5890 } else { 5891 /* assert that osvw feature setting is consistent on all cpus */ 5892 ASSERT(osvwfeature == 5893 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW)); 5894 } 5895 if (!osvwfeature) 5896 return (-1); 5897 5898 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK; 5899 5900 switch (erratum) { 5901 case 298: /* osvwid is 0 */ 5902 osvwid = 0; 5903 if (osvwlength <= (uint64_t)osvwid) { 5904 /* osvwid 0 is unknown */ 5905 return (-1); 5906 } 5907 5908 /* 5909 * Check the OSVW STATUS MSR to determine the state 5910 * of the erratum where: 5911 * 0 - fixed by HW 5912 * 1 - BIOS has applied the workaround when BIOS 5913 * workaround is available. (Or for other errata, 5914 * OS workaround is required.) 5915 * For a value of 1, caller will confirm that the 5916 * erratum 298 workaround has indeed been applied by BIOS. 5917 * 5918 * A 1 may be set in cpus that have a HW fix 5919 * in a mixed cpu system. Regarding erratum 298: 5920 * In a multiprocessor platform, the workaround above 5921 * should be applied to all processors regardless of 5922 * silicon revision when an affected processor is 5923 * present. 5924 */ 5925 5926 return (rdmsr(MSR_AMD_OSVW_STATUS + 5927 (osvwid / OSVW_ID_CNT_PER_MSR)) & 5928 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR))); 5929 5930 default: 5931 return (-1); 5932 } 5933 } 5934 5935 static const char assoc_str[] = "associativity"; 5936 static const char line_str[] = "line-size"; 5937 static const char size_str[] = "size"; 5938 5939 static void 5940 add_cache_prop(dev_info_t *devi, const char *label, const char *type, 5941 uint32_t val) 5942 { 5943 char buf[128]; 5944 5945 /* 5946 * ndi_prop_update_int() is used because it is desirable for 5947 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set. 
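 *
 * The property name is just the label and type joined by a dash, so a
 * call such as
 *
 *	add_cache_prop(devi, l2_cache_str, line_str, 64);
 *
 * creates an integer property named "l2-cache-line-size" with the
 * (illustrative) value 64 on the cpu's devinfo node.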
5948 */ 5949 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf)) 5950 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val); 5951 } 5952 5953 /* 5954 * Intel-style cache/tlb description 5955 * 5956 * Standard cpuid level 2 gives a randomly ordered 5957 * selection of tags that index into a table that describes 5958 * cache and tlb properties. 5959 */ 5960 5961 static const char l1_icache_str[] = "l1-icache"; 5962 static const char l1_dcache_str[] = "l1-dcache"; 5963 static const char l2_cache_str[] = "l2-cache"; 5964 static const char l3_cache_str[] = "l3-cache"; 5965 static const char itlb4k_str[] = "itlb-4K"; 5966 static const char dtlb4k_str[] = "dtlb-4K"; 5967 static const char itlb2M_str[] = "itlb-2M"; 5968 static const char itlb4M_str[] = "itlb-4M"; 5969 static const char dtlb4M_str[] = "dtlb-4M"; 5970 static const char dtlb24_str[] = "dtlb0-2M-4M"; 5971 static const char itlb424_str[] = "itlb-4K-2M-4M"; 5972 static const char itlb24_str[] = "itlb-2M-4M"; 5973 static const char dtlb44_str[] = "dtlb-4K-4M"; 5974 static const char sl1_dcache_str[] = "sectored-l1-dcache"; 5975 static const char sl2_cache_str[] = "sectored-l2-cache"; 5976 static const char itrace_str[] = "itrace-cache"; 5977 static const char sl3_cache_str[] = "sectored-l3-cache"; 5978 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k"; 5979 5980 static const struct cachetab { 5981 uint8_t ct_code; 5982 uint8_t ct_assoc; 5983 uint16_t ct_line_size; 5984 size_t ct_size; 5985 const char *ct_label; 5986 } intel_ctab[] = { 5987 /* 5988 * maintain descending order! 5989 * 5990 * Codes ignored - Reason 5991 * ---------------------- 5992 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache 5993 * f0H/f1H - Currently we do not interpret prefetch size by design 5994 */ 5995 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str}, 5996 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str}, 5997 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str}, 5998 { 0xde, 12, 64, 6*1024*1024, l3_cache_str}, 5999 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str}, 6000 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str}, 6001 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str}, 6002 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str}, 6003 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str}, 6004 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str}, 6005 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str}, 6006 { 0xd0, 4, 64, 512*1024, l3_cache_str}, 6007 { 0xca, 4, 0, 512, sh_l2_tlb4k_str}, 6008 { 0xc0, 4, 0, 8, dtlb44_str }, 6009 { 0xba, 4, 0, 64, dtlb4k_str }, 6010 { 0xb4, 4, 0, 256, dtlb4k_str }, 6011 { 0xb3, 4, 0, 128, dtlb4k_str }, 6012 { 0xb2, 4, 0, 64, itlb4k_str }, 6013 { 0xb0, 4, 0, 128, itlb4k_str }, 6014 { 0x87, 8, 64, 1024*1024, l2_cache_str}, 6015 { 0x86, 4, 64, 512*1024, l2_cache_str}, 6016 { 0x85, 8, 32, 2*1024*1024, l2_cache_str}, 6017 { 0x84, 8, 32, 1024*1024, l2_cache_str}, 6018 { 0x83, 8, 32, 512*1024, l2_cache_str}, 6019 { 0x82, 8, 32, 256*1024, l2_cache_str}, 6020 { 0x80, 8, 64, 512*1024, l2_cache_str}, 6021 { 0x7f, 2, 64, 512*1024, l2_cache_str}, 6022 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str}, 6023 { 0x7c, 8, 64, 1024*1024, sl2_cache_str}, 6024 { 0x7b, 8, 64, 512*1024, sl2_cache_str}, 6025 { 0x7a, 8, 64, 256*1024, sl2_cache_str}, 6026 { 0x79, 8, 64, 128*1024, sl2_cache_str}, 6027 { 0x78, 8, 64, 1024*1024, l2_cache_str}, 6028 { 0x73, 8, 0, 64*1024, itrace_str}, 6029 { 0x72, 8, 0, 32*1024, itrace_str}, 6030 { 0x71, 8, 0, 16*1024, itrace_str}, 6031 { 0x70, 8, 0, 12*1024, itrace_str}, 6032 { 0x68, 4, 64, 32*1024, sl1_dcache_str}, 6033 { 0x67, 4, 64, 16*1024, 
sl1_dcache_str}, 6034 { 0x66, 4, 64, 8*1024, sl1_dcache_str}, 6035 { 0x60, 8, 64, 16*1024, sl1_dcache_str}, 6036 { 0x5d, 0, 0, 256, dtlb44_str}, 6037 { 0x5c, 0, 0, 128, dtlb44_str}, 6038 { 0x5b, 0, 0, 64, dtlb44_str}, 6039 { 0x5a, 4, 0, 32, dtlb24_str}, 6040 { 0x59, 0, 0, 16, dtlb4k_str}, 6041 { 0x57, 4, 0, 16, dtlb4k_str}, 6042 { 0x56, 4, 0, 16, dtlb4M_str}, 6043 { 0x55, 0, 0, 7, itlb24_str}, 6044 { 0x52, 0, 0, 256, itlb424_str}, 6045 { 0x51, 0, 0, 128, itlb424_str}, 6046 { 0x50, 0, 0, 64, itlb424_str}, 6047 { 0x4f, 0, 0, 32, itlb4k_str}, 6048 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str}, 6049 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str}, 6050 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str}, 6051 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str}, 6052 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str}, 6053 { 0x49, 16, 64, 4*1024*1024, l3_cache_str}, 6054 { 0x48, 12, 64, 3*1024*1024, l2_cache_str}, 6055 { 0x47, 8, 64, 8*1024*1024, l3_cache_str}, 6056 { 0x46, 4, 64, 4*1024*1024, l3_cache_str}, 6057 { 0x45, 4, 32, 2*1024*1024, l2_cache_str}, 6058 { 0x44, 4, 32, 1024*1024, l2_cache_str}, 6059 { 0x43, 4, 32, 512*1024, l2_cache_str}, 6060 { 0x42, 4, 32, 256*1024, l2_cache_str}, 6061 { 0x41, 4, 32, 128*1024, l2_cache_str}, 6062 { 0x3e, 4, 64, 512*1024, sl2_cache_str}, 6063 { 0x3d, 6, 64, 384*1024, sl2_cache_str}, 6064 { 0x3c, 4, 64, 256*1024, sl2_cache_str}, 6065 { 0x3b, 2, 64, 128*1024, sl2_cache_str}, 6066 { 0x3a, 6, 64, 192*1024, sl2_cache_str}, 6067 { 0x39, 4, 64, 128*1024, sl2_cache_str}, 6068 { 0x30, 8, 64, 32*1024, l1_icache_str}, 6069 { 0x2c, 8, 64, 32*1024, l1_dcache_str}, 6070 { 0x29, 8, 64, 4096*1024, sl3_cache_str}, 6071 { 0x25, 8, 64, 2048*1024, sl3_cache_str}, 6072 { 0x23, 8, 64, 1024*1024, sl3_cache_str}, 6073 { 0x22, 4, 64, 512*1024, sl3_cache_str}, 6074 { 0x0e, 6, 64, 24*1024, l1_dcache_str}, 6075 { 0x0d, 4, 32, 16*1024, l1_dcache_str}, 6076 { 0x0c, 4, 32, 16*1024, l1_dcache_str}, 6077 { 0x0b, 4, 0, 4, itlb4M_str}, 6078 { 0x0a, 2, 32, 8*1024, l1_dcache_str}, 6079 { 0x08, 4, 32, 16*1024, l1_icache_str}, 6080 { 0x06, 4, 32, 8*1024, l1_icache_str}, 6081 { 0x05, 4, 0, 32, dtlb4M_str}, 6082 { 0x04, 4, 0, 8, dtlb4M_str}, 6083 { 0x03, 4, 0, 64, dtlb4k_str}, 6084 { 0x02, 4, 0, 2, itlb4M_str}, 6085 { 0x01, 4, 0, 32, itlb4k_str}, 6086 { 0 } 6087 }; 6088 6089 static const struct cachetab cyrix_ctab[] = { 6090 { 0x70, 4, 0, 32, "tlb-4K" }, 6091 { 0x80, 4, 16, 16*1024, "l1-cache" }, 6092 { 0 } 6093 }; 6094 6095 /* 6096 * Search a cache table for a matching entry 6097 */ 6098 static const struct cachetab * 6099 find_cacheent(const struct cachetab *ct, uint_t code) 6100 { 6101 if (code != 0) { 6102 for (; ct->ct_code != 0; ct++) 6103 if (ct->ct_code <= code) 6104 break; 6105 if (ct->ct_code == code) 6106 return (ct); 6107 } 6108 return (NULL); 6109 } 6110 6111 /* 6112 * Populate cachetab entry with L2 or L3 cache-information using 6113 * cpuid function 4. This function is called from intel_walk_cacheinfo() 6114 * when descriptor 0x49 is encountered. It returns 0 if no such cache 6115 * information is found. 
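 *
 * The size computation below follows the usual leaf 4 arithmetic:
 *
 *	size = (ways + 1) * (partitions + 1) * (line size + 1) * (sets + 1)
 *
 * For example, a leaf reporting ways = 7, partitions = 0, line size = 63
 * and %ecx (sets) = 4095 describes an 8-way cache with 64-byte lines and
 * 4096 sets, i.e. 8 * 1 * 64 * 4096 = 2 MB.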
6116 */ 6117 static int 6118 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi) 6119 { 6120 uint32_t level, i; 6121 int ret = 0; 6122 6123 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) { 6124 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]); 6125 6126 if (level == 2 || level == 3) { 6127 ct->ct_assoc = 6128 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1; 6129 ct->ct_line_size = 6130 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1; 6131 ct->ct_size = ct->ct_assoc * 6132 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) * 6133 ct->ct_line_size * 6134 (cpi->cpi_cache_leaves[i]->cp_ecx + 1); 6135 6136 if (level == 2) { 6137 ct->ct_label = l2_cache_str; 6138 } else if (level == 3) { 6139 ct->ct_label = l3_cache_str; 6140 } 6141 ret = 1; 6142 } 6143 } 6144 6145 return (ret); 6146 } 6147 6148 /* 6149 * Walk the cacheinfo descriptor, applying 'func' to every valid element 6150 * The walk is terminated if the walker returns non-zero. 6151 */ 6152 static void 6153 intel_walk_cacheinfo(struct cpuid_info *cpi, 6154 void *arg, int (*func)(void *, const struct cachetab *)) 6155 { 6156 const struct cachetab *ct; 6157 struct cachetab des_49_ct, des_b1_ct; 6158 uint8_t *dp; 6159 int i; 6160 6161 if ((dp = cpi->cpi_cacheinfo) == NULL) 6162 return; 6163 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6164 /* 6165 * For overloaded descriptor 0x49 we use cpuid function 4 6166 * if supported by the current processor, to create 6167 * cache information. 6168 * For overloaded descriptor 0xb1 we use X86_PAE flag 6169 * to disambiguate the cache information. 6170 */ 6171 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 && 6172 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) { 6173 ct = &des_49_ct; 6174 } else if (*dp == 0xb1) { 6175 des_b1_ct.ct_code = 0xb1; 6176 des_b1_ct.ct_assoc = 4; 6177 des_b1_ct.ct_line_size = 0; 6178 if (is_x86_feature(x86_featureset, X86FSET_PAE)) { 6179 des_b1_ct.ct_size = 8; 6180 des_b1_ct.ct_label = itlb2M_str; 6181 } else { 6182 des_b1_ct.ct_size = 4; 6183 des_b1_ct.ct_label = itlb4M_str; 6184 } 6185 ct = &des_b1_ct; 6186 } else { 6187 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) { 6188 continue; 6189 } 6190 } 6191 6192 if (func(arg, ct) != 0) { 6193 break; 6194 } 6195 } 6196 } 6197 6198 /* 6199 * (Like the Intel one, except for Cyrix CPUs) 6200 */ 6201 static void 6202 cyrix_walk_cacheinfo(struct cpuid_info *cpi, 6203 void *arg, int (*func)(void *, const struct cachetab *)) 6204 { 6205 const struct cachetab *ct; 6206 uint8_t *dp; 6207 int i; 6208 6209 if ((dp = cpi->cpi_cacheinfo) == NULL) 6210 return; 6211 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6212 /* 6213 * Search Cyrix-specific descriptor table first .. 6214 */ 6215 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) { 6216 if (func(arg, ct) != 0) 6217 break; 6218 continue; 6219 } 6220 /* 6221 * .. else fall back to the Intel one 6222 */ 6223 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) { 6224 if (func(arg, ct) != 0) 6225 break; 6226 continue; 6227 } 6228 } 6229 } 6230 6231 /* 6232 * A cacheinfo walker that adds associativity, line-size, and size properties 6233 * to the devinfo node it is passed as an argument. 
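 *
 * As with any walker handed to intel_walk_cacheinfo() or
 * cyrix_walk_cacheinfo(), returning 0 keeps the walk going over every
 * descriptor; a walker that only wants the first match (see
 * intel_l2cinfo() later in this file) returns non-zero to stop early.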
6234 */ 6235 static int 6236 add_cacheent_props(void *arg, const struct cachetab *ct) 6237 { 6238 dev_info_t *devi = arg; 6239 6240 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc); 6241 if (ct->ct_line_size != 0) 6242 add_cache_prop(devi, ct->ct_label, line_str, 6243 ct->ct_line_size); 6244 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size); 6245 return (0); 6246 } 6247 6248 6249 static const char fully_assoc[] = "fully-associative?"; 6250 6251 /* 6252 * AMD style cache/tlb description 6253 * 6254 * Extended functions 5 and 6 directly describe properties of 6255 * tlbs and various cache levels. 6256 */ 6257 static void 6258 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6259 { 6260 switch (assoc) { 6261 case 0: /* reserved; ignore */ 6262 break; 6263 default: 6264 add_cache_prop(devi, label, assoc_str, assoc); 6265 break; 6266 case 0xff: 6267 add_cache_prop(devi, label, fully_assoc, 1); 6268 break; 6269 } 6270 } 6271 6272 static void 6273 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6274 { 6275 if (size == 0) 6276 return; 6277 add_cache_prop(devi, label, size_str, size); 6278 add_amd_assoc(devi, label, assoc); 6279 } 6280 6281 static void 6282 add_amd_cache(dev_info_t *devi, const char *label, 6283 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6284 { 6285 if (size == 0 || line_size == 0) 6286 return; 6287 add_amd_assoc(devi, label, assoc); 6288 /* 6289 * Most AMD parts have a sectored cache. Multiple cache lines are 6290 * associated with each tag. A sector consists of all cache lines 6291 * associated with a tag. For example, the AMD K6-III has a sector 6292 * size of 2 cache lines per tag. 6293 */ 6294 if (lines_per_tag != 0) 6295 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6296 add_cache_prop(devi, label, line_str, line_size); 6297 add_cache_prop(devi, label, size_str, size * 1024); 6298 } 6299 6300 static void 6301 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6302 { 6303 switch (assoc) { 6304 case 0: /* off */ 6305 break; 6306 case 1: 6307 case 2: 6308 case 4: 6309 add_cache_prop(devi, label, assoc_str, assoc); 6310 break; 6311 case 6: 6312 add_cache_prop(devi, label, assoc_str, 8); 6313 break; 6314 case 8: 6315 add_cache_prop(devi, label, assoc_str, 16); 6316 break; 6317 case 0xf: 6318 add_cache_prop(devi, label, fully_assoc, 1); 6319 break; 6320 default: /* reserved; ignore */ 6321 break; 6322 } 6323 } 6324 6325 static void 6326 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6327 { 6328 if (size == 0 || assoc == 0) 6329 return; 6330 add_amd_l2_assoc(devi, label, assoc); 6331 add_cache_prop(devi, label, size_str, size); 6332 } 6333 6334 static void 6335 add_amd_l2_cache(dev_info_t *devi, const char *label, 6336 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6337 { 6338 if (size == 0 || assoc == 0 || line_size == 0) 6339 return; 6340 add_amd_l2_assoc(devi, label, assoc); 6341 if (lines_per_tag != 0) 6342 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6343 add_cache_prop(devi, label, line_str, line_size); 6344 add_cache_prop(devi, label, size_str, size * 1024); 6345 } 6346 6347 static void 6348 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi) 6349 { 6350 struct cpuid_regs *cp; 6351 6352 if (cpi->cpi_xmaxeax < 0x80000005) 6353 return; 6354 cp = &cpi->cpi_extd[5]; 6355 6356 /* 6357 * 4M/2M L1 TLB configuration 6358 * 6359 * We report the size for 2M pages because AMD uses two 6360 * TLB 
entries for one 4M page. 6361 */ 6362 add_amd_tlb(devi, "dtlb-2M", 6363 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16)); 6364 add_amd_tlb(devi, "itlb-2M", 6365 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0)); 6366 6367 /* 6368 * 4K L1 TLB configuration 6369 */ 6370 6371 switch (cpi->cpi_vendor) { 6372 uint_t nentries; 6373 case X86_VENDOR_TM: 6374 if (cpi->cpi_family >= 5) { 6375 /* 6376 * Crusoe processors have 256 TLB entries, but 6377 * cpuid data format constrains them to only 6378 * reporting 255 of them. 6379 */ 6380 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255) 6381 nentries = 256; 6382 /* 6383 * Crusoe processors also have a unified TLB 6384 */ 6385 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24), 6386 nentries); 6387 break; 6388 } 6389 /*FALLTHROUGH*/ 6390 default: 6391 add_amd_tlb(devi, itlb4k_str, 6392 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16)); 6393 add_amd_tlb(devi, dtlb4k_str, 6394 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0)); 6395 break; 6396 } 6397 6398 /* 6399 * data L1 cache configuration 6400 */ 6401 6402 add_amd_cache(devi, l1_dcache_str, 6403 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16), 6404 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0)); 6405 6406 /* 6407 * code L1 cache configuration 6408 */ 6409 6410 add_amd_cache(devi, l1_icache_str, 6411 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16), 6412 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0)); 6413 6414 if (cpi->cpi_xmaxeax < 0x80000006) 6415 return; 6416 cp = &cpi->cpi_extd[6]; 6417 6418 /* Check for a unified L2 TLB for large pages */ 6419 6420 if (BITX(cp->cp_eax, 31, 16) == 0) 6421 add_amd_l2_tlb(devi, "l2-tlb-2M", 6422 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6423 else { 6424 add_amd_l2_tlb(devi, "l2-dtlb-2M", 6425 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6426 add_amd_l2_tlb(devi, "l2-itlb-2M", 6427 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6428 } 6429 6430 /* Check for a unified L2 TLB for 4K pages */ 6431 6432 if (BITX(cp->cp_ebx, 31, 16) == 0) { 6433 add_amd_l2_tlb(devi, "l2-tlb-4K", 6434 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6435 } else { 6436 add_amd_l2_tlb(devi, "l2-dtlb-4K", 6437 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6438 add_amd_l2_tlb(devi, "l2-itlb-4K", 6439 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6440 } 6441 6442 add_amd_l2_cache(devi, l2_cache_str, 6443 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12), 6444 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0)); 6445 } 6446 6447 /* 6448 * There are two basic ways that the x86 world describes it cache 6449 * and tlb architecture - Intel's way and AMD's way. 6450 * 6451 * Return which flavor of cache architecture we should use 6452 */ 6453 static int 6454 x86_which_cacheinfo(struct cpuid_info *cpi) 6455 { 6456 switch (cpi->cpi_vendor) { 6457 case X86_VENDOR_Intel: 6458 if (cpi->cpi_maxeax >= 2) 6459 return (X86_VENDOR_Intel); 6460 break; 6461 case X86_VENDOR_AMD: 6462 /* 6463 * The K5 model 1 was the first part from AMD that reported 6464 * cache sizes via extended cpuid functions. 6465 */ 6466 if (cpi->cpi_family > 5 || 6467 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 6468 return (X86_VENDOR_AMD); 6469 break; 6470 case X86_VENDOR_TM: 6471 if (cpi->cpi_family >= 5) 6472 return (X86_VENDOR_AMD); 6473 /*FALLTHROUGH*/ 6474 default: 6475 /* 6476 * If they have extended CPU data for 0x80000005 6477 * then we assume they have AMD-format cache 6478 * information. 
6479 * 6480 * If not, and the vendor happens to be Cyrix, 6481 * then try our-Cyrix specific handler. 6482 * 6483 * If we're not Cyrix, then assume we're using Intel's 6484 * table-driven format instead. 6485 */ 6486 if (cpi->cpi_xmaxeax >= 0x80000005) 6487 return (X86_VENDOR_AMD); 6488 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix) 6489 return (X86_VENDOR_Cyrix); 6490 else if (cpi->cpi_maxeax >= 2) 6491 return (X86_VENDOR_Intel); 6492 break; 6493 } 6494 return (-1); 6495 } 6496 6497 void 6498 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id, 6499 struct cpuid_info *cpi) 6500 { 6501 dev_info_t *cpu_devi; 6502 int create; 6503 6504 cpu_devi = (dev_info_t *)dip; 6505 6506 /* device_type */ 6507 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6508 "device_type", "cpu"); 6509 6510 /* reg */ 6511 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6512 "reg", cpu_id); 6513 6514 /* cpu-mhz, and clock-frequency */ 6515 if (cpu_freq > 0) { 6516 long long mul; 6517 6518 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6519 "cpu-mhz", cpu_freq); 6520 if ((mul = cpu_freq * 1000000LL) <= INT_MAX) 6521 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6522 "clock-frequency", (int)mul); 6523 } 6524 6525 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) { 6526 return; 6527 } 6528 6529 /* vendor-id */ 6530 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6531 "vendor-id", cpi->cpi_vendorstr); 6532 6533 if (cpi->cpi_maxeax == 0) { 6534 return; 6535 } 6536 6537 /* 6538 * family, model, and step 6539 */ 6540 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6541 "family", CPI_FAMILY(cpi)); 6542 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6543 "cpu-model", CPI_MODEL(cpi)); 6544 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6545 "stepping-id", CPI_STEP(cpi)); 6546 6547 /* type */ 6548 switch (cpi->cpi_vendor) { 6549 case X86_VENDOR_Intel: 6550 create = 1; 6551 break; 6552 default: 6553 create = 0; 6554 break; 6555 } 6556 if (create) 6557 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6558 "type", CPI_TYPE(cpi)); 6559 6560 /* ext-family */ 6561 switch (cpi->cpi_vendor) { 6562 case X86_VENDOR_Intel: 6563 case X86_VENDOR_AMD: 6564 create = cpi->cpi_family >= 0xf; 6565 break; 6566 default: 6567 create = 0; 6568 break; 6569 } 6570 if (create) 6571 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6572 "ext-family", CPI_FAMILY_XTD(cpi)); 6573 6574 /* ext-model */ 6575 switch (cpi->cpi_vendor) { 6576 case X86_VENDOR_Intel: 6577 create = IS_EXTENDED_MODEL_INTEL(cpi); 6578 break; 6579 case X86_VENDOR_AMD: 6580 create = CPI_FAMILY(cpi) == 0xf; 6581 break; 6582 default: 6583 create = 0; 6584 break; 6585 } 6586 if (create) 6587 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6588 "ext-model", CPI_MODEL_XTD(cpi)); 6589 6590 /* generation */ 6591 switch (cpi->cpi_vendor) { 6592 case X86_VENDOR_AMD: 6593 /* 6594 * AMD K5 model 1 was the first part to support this 6595 */ 6596 create = cpi->cpi_xmaxeax >= 0x80000001; 6597 break; 6598 default: 6599 create = 0; 6600 break; 6601 } 6602 if (create) 6603 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6604 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8)); 6605 6606 /* brand-id */ 6607 switch (cpi->cpi_vendor) { 6608 case X86_VENDOR_Intel: 6609 /* 6610 * brand id first appeared on Pentium III Xeon model 8, 6611 * and Celeron model 8 processors and Opteron 6612 */ 6613 create = cpi->cpi_family > 6 || 6614 (cpi->cpi_family == 6 && cpi->cpi_model >= 8); 6615 break; 6616 case X86_VENDOR_AMD: 6617 create = 
cpi->cpi_family >= 0xf; 6618 break; 6619 default: 6620 create = 0; 6621 break; 6622 } 6623 if (create && cpi->cpi_brandid != 0) { 6624 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6625 "brand-id", cpi->cpi_brandid); 6626 } 6627 6628 /* chunks, and apic-id */ 6629 switch (cpi->cpi_vendor) { 6630 /* 6631 * first available on Pentium IV and Opteron (K8) 6632 */ 6633 case X86_VENDOR_Intel: 6634 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf; 6635 break; 6636 case X86_VENDOR_AMD: 6637 create = cpi->cpi_family >= 0xf; 6638 break; 6639 default: 6640 create = 0; 6641 break; 6642 } 6643 if (create) { 6644 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6645 "chunks", CPI_CHUNKS(cpi)); 6646 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6647 "apic-id", cpi->cpi_apicid); 6648 if (cpi->cpi_chipid >= 0) { 6649 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6650 "chip#", cpi->cpi_chipid); 6651 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6652 "clog#", cpi->cpi_clogid); 6653 } 6654 } 6655 6656 /* cpuid-features */ 6657 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6658 "cpuid-features", CPI_FEATURES_EDX(cpi)); 6659 6660 6661 /* cpuid-features-ecx */ 6662 switch (cpi->cpi_vendor) { 6663 case X86_VENDOR_Intel: 6664 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf; 6665 break; 6666 case X86_VENDOR_AMD: 6667 create = cpi->cpi_family >= 0xf; 6668 break; 6669 default: 6670 create = 0; 6671 break; 6672 } 6673 if (create) 6674 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6675 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi)); 6676 6677 /* ext-cpuid-features */ 6678 switch (cpi->cpi_vendor) { 6679 case X86_VENDOR_Intel: 6680 case X86_VENDOR_AMD: 6681 case X86_VENDOR_Cyrix: 6682 case X86_VENDOR_TM: 6683 case X86_VENDOR_Centaur: 6684 create = cpi->cpi_xmaxeax >= 0x80000001; 6685 break; 6686 default: 6687 create = 0; 6688 break; 6689 } 6690 if (create) { 6691 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6692 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi)); 6693 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6694 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi)); 6695 } 6696 6697 /* 6698 * Brand String first appeared in Intel Pentium IV, AMD K5 6699 * model 1, and Cyrix GXm. On earlier models we try and 6700 * simulate something similar .. so this string should always 6701 * same -something- about the processor, however lame. 
6702 */ 6703 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6704 "brand-string", cpi->cpi_brandstr); 6705 6706 /* 6707 * Finally, cache and tlb information 6708 */ 6709 switch (x86_which_cacheinfo(cpi)) { 6710 case X86_VENDOR_Intel: 6711 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props); 6712 break; 6713 case X86_VENDOR_Cyrix: 6714 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props); 6715 break; 6716 case X86_VENDOR_AMD: 6717 amd_cache_info(cpi, cpu_devi); 6718 break; 6719 default: 6720 break; 6721 } 6722 } 6723 6724 struct l2info { 6725 int *l2i_csz; 6726 int *l2i_lsz; 6727 int *l2i_assoc; 6728 int l2i_ret; 6729 }; 6730 6731 /* 6732 * A cacheinfo walker that fetches the size, line-size and associativity 6733 * of the L2 cache 6734 */ 6735 static int 6736 intel_l2cinfo(void *arg, const struct cachetab *ct) 6737 { 6738 struct l2info *l2i = arg; 6739 int *ip; 6740 6741 if (ct->ct_label != l2_cache_str && 6742 ct->ct_label != sl2_cache_str) 6743 return (0); /* not an L2 -- keep walking */ 6744 6745 if ((ip = l2i->l2i_csz) != NULL) 6746 *ip = ct->ct_size; 6747 if ((ip = l2i->l2i_lsz) != NULL) 6748 *ip = ct->ct_line_size; 6749 if ((ip = l2i->l2i_assoc) != NULL) 6750 *ip = ct->ct_assoc; 6751 l2i->l2i_ret = ct->ct_size; 6752 return (1); /* was an L2 -- terminate walk */ 6753 } 6754 6755 /* 6756 * AMD L2/L3 Cache and TLB Associativity Field Definition: 6757 * 6758 * Unlike the associativity for the L1 cache and tlb where the 8 bit 6759 * value is the associativity, the associativity for the L2 cache and 6760 * tlb is encoded in the following table. The 4 bit L2 value serves as 6761 * an index into the amd_afd[] array to determine the associativity. 6762 * -1 is undefined. 0 is fully associative. 6763 */ 6764 6765 static int amd_afd[] = 6766 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0}; 6767 6768 static void 6769 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i) 6770 { 6771 struct cpuid_regs *cp; 6772 uint_t size, assoc; 6773 int i; 6774 int *ip; 6775 6776 if (cpi->cpi_xmaxeax < 0x80000006) 6777 return; 6778 cp = &cpi->cpi_extd[6]; 6779 6780 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 && 6781 (size = BITX(cp->cp_ecx, 31, 16)) != 0) { 6782 uint_t cachesz = size * 1024; 6783 assoc = amd_afd[i]; 6784 6785 ASSERT(assoc != -1); 6786 6787 if ((ip = l2i->l2i_csz) != NULL) 6788 *ip = cachesz; 6789 if ((ip = l2i->l2i_lsz) != NULL) 6790 *ip = BITX(cp->cp_ecx, 7, 0); 6791 if ((ip = l2i->l2i_assoc) != NULL) 6792 *ip = assoc; 6793 l2i->l2i_ret = cachesz; 6794 } 6795 } 6796 6797 int 6798 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc) 6799 { 6800 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 6801 struct l2info __l2info, *l2i = &__l2info; 6802 6803 l2i->l2i_csz = csz; 6804 l2i->l2i_lsz = lsz; 6805 l2i->l2i_assoc = assoc; 6806 l2i->l2i_ret = -1; 6807 6808 switch (x86_which_cacheinfo(cpi)) { 6809 case X86_VENDOR_Intel: 6810 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo); 6811 break; 6812 case X86_VENDOR_Cyrix: 6813 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo); 6814 break; 6815 case X86_VENDOR_AMD: 6816 amd_l2cacheinfo(cpi, l2i); 6817 break; 6818 default: 6819 break; 6820 } 6821 return (l2i->l2i_ret); 6822 } 6823 6824 #if !defined(__xpv) 6825 6826 uint32_t * 6827 cpuid_mwait_alloc(cpu_t *cpu) 6828 { 6829 uint32_t *ret; 6830 size_t mwait_size; 6831 6832 ASSERT(cpuid_checkpass(CPU, 2)); 6833 6834 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max; 6835 if (mwait_size == 0) 6836 return (NULL); 6837 6838 /* 6839 * kmem_alloc() returns cache line size aligned data for 
mwait_size 6840 * allocations. mwait_size is currently cache line sized. Neither 6841 * of these implementation details is guaranteed to be true in the 6842 * future. 6843 * 6844 * First try allocating mwait_size as kmem_alloc() currently returns 6845 * correctly aligned memory. If kmem_alloc() does not return 6846 * mwait_size aligned memory, then use mwait_size ROUNDUP. 6847 * 6848 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we 6849 * decide to free this memory. 6850 */ 6851 ret = kmem_zalloc(mwait_size, KM_SLEEP); 6852 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) { 6853 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret; 6854 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size; 6855 *ret = MWAIT_RUNNING; 6856 return (ret); 6857 } else { 6858 kmem_free(ret, mwait_size); 6859 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP); 6860 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret; 6861 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2; 6862 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size); 6863 *ret = MWAIT_RUNNING; 6864 return (ret); 6865 } 6866 } 6867 6868 void 6869 cpuid_mwait_free(cpu_t *cpu) 6870 { 6871 if (cpu->cpu_m.mcpu_cpi == NULL) { 6872 return; 6873 } 6874 6875 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL && 6876 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) { 6877 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual, 6878 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual); 6879 } 6880 6881 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL; 6882 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0; 6883 } 6884 6885 void 6886 patch_tsc_read(int flag) 6887 { 6888 size_t cnt; 6889 6890 switch (flag) { 6891 case TSC_NONE: 6892 cnt = &_no_rdtsc_end - &_no_rdtsc_start; 6893 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt); 6894 break; 6895 case TSC_RDTSC_MFENCE: 6896 cnt = &_tsc_mfence_end - &_tsc_mfence_start; 6897 (void) memcpy((void *)tsc_read, 6898 (void *)&_tsc_mfence_start, cnt); 6899 break; 6900 case TSC_RDTSC_LFENCE: 6901 cnt = &_tsc_lfence_end - &_tsc_lfence_start; 6902 (void) memcpy((void *)tsc_read, 6903 (void *)&_tsc_lfence_start, cnt); 6904 break; 6905 case TSC_TSCP: 6906 cnt = &_tscp_end - &_tscp_start; 6907 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt); 6908 break; 6909 default: 6910 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */ 6911 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag); 6912 break; 6913 } 6914 tsc_type = flag; 6915 } 6916 6917 int 6918 cpuid_deep_cstates_supported(void) 6919 { 6920 struct cpuid_info *cpi; 6921 struct cpuid_regs regs; 6922 6923 ASSERT(cpuid_checkpass(CPU, 1)); 6924 6925 cpi = CPU->cpu_m.mcpu_cpi; 6926 6927 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) 6928 return (0); 6929 6930 switch (cpi->cpi_vendor) { 6931 case X86_VENDOR_Intel: 6932 if (cpi->cpi_xmaxeax < 0x80000007) 6933 return (0); 6934 6935 /* 6936 * Does the TSC run at a constant rate in all ACPI C-states? 6937 */ 6938 regs.cp_eax = 0x80000007; 6939 (void) __cpuid_insn(&regs); 6940 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE); 6941 6942 default: 6943 return (0); 6944 } 6945 } 6946 6947 #endif /* !__xpv */ 6948 6949 void 6950 post_startup_cpu_fixups(void) 6951 { 6952 #ifndef __xpv 6953 /* 6954 * Some AMD processors support C1E state. Entering this state will 6955 * cause the local APIC timer to stop, which we can't deal with at 6956 * this time.
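 *
 * The MSR read below looks at the "action on CMP halt" field; any
 * non-zero value in that field means the BIOS left C1E enabled. A
 * small illustrative predicate using the same masks would be:
 *
 *	static boolean_t
 *	c1e_enabled(uint64_t reg)
 *	{
 *		return (((reg >> AMD_ACTONCMPHALT_SHIFT) &
 *		    AMD_ACTONCMPHALT_MASK) != 0);
 *	}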

void
cpuid_mwait_free(cpu_t *cpu)
{
	if (cpu->cpu_m.mcpu_cpi == NULL) {
		return;
	}

	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
	}

	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
}

void
patch_tsc_read(int flag)
{
	size_t cnt;

	switch (flag) {
	case TSC_NONE:
		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
		break;
	case TSC_RDTSC_MFENCE:
		cnt = &_tsc_mfence_end - &_tsc_mfence_start;
		(void) memcpy((void *)tsc_read,
		    (void *)&_tsc_mfence_start, cnt);
		break;
	case TSC_RDTSC_LFENCE:
		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
		(void) memcpy((void *)tsc_read,
		    (void *)&_tsc_lfence_start, cnt);
		break;
	case TSC_TSCP:
		cnt = &_tscp_end - &_tscp_start;
		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
		break;
	default:
		/* Bail for unexpected TSC types.  (TSC_NONE covers 0.) */
		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
		break;
	}
	tsc_type = flag;
}

int
cpuid_deep_cstates_supported(void)
{
	struct cpuid_info *cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));

	cpi = CPU->cpu_m.mcpu_cpi;

	if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
		return (0);

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		if (cpi->cpi_xmaxeax < 0x80000007)
			return (0);

		/*
		 * Does the TSC run at a constant rate in all ACPI C-states?
		 */
		regs.cp_eax = 0x80000007;
		(void) __cpuid_insn(&regs);
		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);

	default:
		return (0);
	}
}

#endif	/* !__xpv */

void
post_startup_cpu_fixups(void)
{
#ifndef __xpv
	/*
	 * Some AMD processors support C1E state.  Entering this state will
	 * cause the local APIC timer to stop, which we can't deal with at
	 * this time.
	 */
	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
		on_trap_data_t otd;
		uint64_t reg;

		if (!on_trap(&otd, OT_DATA_ACCESS)) {
			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
			/* Disable C1E state if it is enabled by BIOS. */
			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
			    AMD_ACTONCMPHALT_MASK) {
				reg &= ~(AMD_ACTONCMPHALT_MASK <<
				    AMD_ACTONCMPHALT_SHIFT);
				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
			}
		}
		no_trap();
	}
#endif	/* !__xpv */
}

void
enable_pcid(void)
{
	if (x86_use_pcid == -1)
		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);

	if (x86_use_invpcid == -1) {
		x86_use_invpcid = is_x86_feature(x86_featureset,
		    X86FSET_INVPCID);
	}

	if (!x86_use_pcid)
		return;

	/*
	 * Intel says that on setting PCIDE, it immediately starts using the
	 * PCID bits; better make sure there's nothing there.
	 */
	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);

	setcr4(getcr4() | CR4_PCIDE);
}

/*
 * Set up the necessary registers to enable the XSAVE feature on this
 * processor.  This function needs to be called early enough that no
 * xsave/xrstor ops will execute on the processor before the MSRs are
 * properly set up.
 *
 * The current implementation makes the following assumptions:
 * - cpuid_pass1() is done, so that X86 features are known.
 * - fpu_probe() is done, so that fp_save_mech is chosen.
 */
void
xsave_setup_msr(cpu_t *cpu)
{
	ASSERT(fp_save_mech == FP_XSAVE);
	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));

	/* Enable OSXSAVE in CR4. */
	setcr4(getcr4() | CR4_OSXSAVE);
	/*
	 * Update the SW copy of ECX, so that /dev/cpu/self/cpuid will report
	 * the correct value.
	 */
	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
	setup_xfem();
}
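
/*
 * For reference, a hedged userland sketch that is not part of this file:
 * once CR4.OSXSAVE and XCR0 are set up, an application can confirm OS
 * support via CPUID.1:ECX[27] (the same CPUID_INTC_ECX_OSXSAVE bit patched
 * into the SW copy above) and then read XCR0 with xgetbv to see which state
 * components the OS enabled.  With a suitably recent compiler this might
 * look like:
 *
 *	#include <immintrin.h>
 *
 *	if (ecx_leaf1 & (1u << 27)) {		(ecx_leaf1 is hypothetical)
 *		unsigned long long xcr0 = _xgetbv(0);
 *		avx_ok = (xcr0 & 0x6) == 0x6;	(XMM and YMM state bits)
 *	}
 */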

/*
 * Starting with the Westmere processor the local APIC timer will continue
 * running in all C-states, including the deepest C-states.
 */
int
cpuid_arat_supported(void)
{
	struct cpuid_info *cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));
	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));

	cpi = CPU->cpu_m.mcpu_cpi;

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		/*
		 * An always-running local APIC timer is indicated by
		 * CPUID.6.EAX[2].
		 */
		if (cpi->cpi_maxeax >= 6) {
			regs.cp_eax = 6;
			(void) cpuid_insn(NULL, &regs);
			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
		} else {
			return (0);
		}
	default:
		return (0);
	}
}

/*
 * Check support for the Intel ENERGY_PERF_BIAS feature.
 */
int
cpuid_iepb_supported(struct cpu *cp)
{
	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(cp, 1));

	if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
	    !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
		return (0);
	}

	/*
	 * The Intel ENERGY_PERF_BIAS MSR is indicated by capability bit
	 * CPUID.6.ECX[3].
	 */
	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
		return (0);

	regs.cp_eax = 0x6;
	(void) cpuid_insn(NULL, &regs);
	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
}

/*
 * Check support for the TSC deadline timer.
 *
 * The TSC deadline timer provides a superior software programming model
 * over the local APIC timer, one that eliminates "time drift": instead of
 * specifying a relative time, software specifies an absolute time as the
 * target at which the processor should generate a timer event.
 */
int
cpuid_deadline_tsc_supported(void)
{
	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
	struct cpuid_regs regs;

	ASSERT(cpuid_checkpass(CPU, 1));
	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));

	switch (cpi->cpi_vendor) {
	case X86_VENDOR_Intel:
		if (cpi->cpi_maxeax >= 1) {
			regs.cp_eax = 1;
			(void) cpuid_insn(NULL, &regs);
			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
		} else {
			return (0);
		}
	default:
		return (0);
	}
}

#if defined(__amd64) && !defined(__xpv)
/*
 * Patch in versions of bcopy for high-performance Intel Nehalem (Nhm)
 * processors and later.
 */
void
patch_memops(uint_t vendor)
{
	size_t cnt, i;
	caddr_t to, from;

	if ((vendor == X86_VENDOR_Intel) &&
	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
		cnt = &bcopy_patch_end - &bcopy_patch_start;
		to = &bcopy_ck_size;
		from = &bcopy_patch_start;
		for (i = 0; i < cnt; i++) {
			*to++ = *from++;
		}
	}
}
#endif	/* __amd64 && !__xpv */

/*
 * We're being asked to tell the system how many bits are required to
 * represent the various core and strand IDs.  While it's tempting to derive
 * this from the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that
 * isn't quite correct.  Instead, this needs to be based on the number of
 * bits that the APIC allows for these different configurations.  We only
 * update these to a larger value if we find one.
 */
void
cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
{
	struct cpuid_info *cpi;

	VERIFY(cpuid_checkpass(CPU, 1));
	cpi = cpu->cpu_m.mcpu_cpi;

	if (cpi->cpi_ncore_bits > *core_nbits) {
		*core_nbits = cpi->cpi_ncore_bits;
	}

	if (cpi->cpi_nthread_bits > *strand_nbits) {
		*strand_nbits = cpi->cpi_nthread_bits;
	}
}
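
/*
 * Example of why the APIC bit widths matter: if a package reports
 * cpi_nthread_bits == 1 and cpi_ncore_bits == 3, the low bit of the APIC ID
 * selects the strand within a core and the next three bits select the core
 * within the package, so the ID space covers 8 cores x 2 strands even if
 * fewer are actually populated.  Deriving the widths from the populated
 * counts instead would misdecode APIC IDs on such parts.
 */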

void
cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	struct cpuid_regs cp;

	/*
	 * Reread the CPUID portions that we need for various security
	 * information.
	 */
	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
		/*
		 * Check if we now have leaf 7 available to us.
		 */
		if (cpi->cpi_maxeax < 7) {
			bzero(&cp, sizeof (cp));
			cp.cp_eax = 0;
			cpi->cpi_maxeax = __cpuid_insn(&cp);
			if (cpi->cpi_maxeax < 7)
				return;
		}

		bzero(&cp, sizeof (cp));
		cp.cp_eax = 7;
		cp.cp_ecx = 0;
		(void) __cpuid_insn(&cp);
		cpi->cpi_std[7] = cp;
	} else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
		/* No xcpuid support */
		if (cpi->cpi_family < 5 ||
		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
			return;

		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
			bzero(&cp, sizeof (cp));
			cp.cp_eax = CPUID_LEAF_EXT_0;
			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
				return;
			}
		}

		bzero(&cp, sizeof (cp));
		cp.cp_eax = CPUID_LEAF_EXT_8;
		(void) __cpuid_insn(&cp);
		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
		cpi->cpi_extd[8] = cp;
	} else {
		/*
		 * Nothing to do here.  Return an empty set which has already
		 * been zeroed for us.
		 */
		return;
	}
	cpuid_scan_security(cpu, fset);
}

/* ARGSUSED */
static int
cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
{
	uchar_t *fset;
	boolean_t first_pass = (boolean_t)arg1;

	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
	if (first_pass && CPU->cpu_id != 0)
		return (0);
	if (!first_pass && CPU->cpu_id == 0)
		return (0);
	cpuid_pass_ucode(CPU, fset);

	return (0);
}

/*
 * After a microcode update where the version has changed, we need to rescan
 * CPUID.  To do this we check every CPU to make sure that it is running the
 * same microcode.  Then we perform a cross call to all such CPUs.  It's the
 * caller's job to make sure that no one else can end up doing an update
 * while this is going on.
 *
 * We assume that the system is microcode capable if we're called.
 */
void
cpuid_post_ucodeadm(void)
{
	uint32_t rev;
	int i;
	struct cpu *cpu;
	cpuset_t cpuset;
	void *argdata;
	uchar_t *f0;

	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);

	mutex_enter(&cpu_lock);
	cpu = cpu_get(0);
	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
	CPUSET_ONLY(cpuset, 0);
	for (i = 1; i < max_ncpus; i++) {
		if ((cpu = cpu_get(i)) == NULL)
			continue;

		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
			panic("post microcode update CPU %d has differing "
			    "microcode revision (%u) from CPU 0 (%u)",
			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
		}
		CPUSET_ADD(cpuset, i);
	}

	/*
	 * We do the cross calls in two passes.  The first pass is only for
	 * the boot CPU.  The second pass is for all of the other CPUs.  This
	 * allows the boot CPU to go through and change behavior related to
	 * patching or whether or not Enhanced IBRS needs to be enabled, and
	 * then allows all of the other CPUs to follow suit.
	 */
	kpreempt_disable();
	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	kpreempt_enable();
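
	/*
	 * At this point argdata holds one x86_featureset-sized record per
	 * CPU, indexed by cpu_id: CPU 0's record was filled in during the
	 * first pass and the remaining CPUs' records during the second (see
	 * cpuid_post_ucodeadm_xc() above).
	 */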

	/*
	 * OK, now look at each CPU and see if their feature sets are equal.
	 */
	f0 = argdata;
	for (i = 1; i < max_ncpus; i++) {
		uchar_t *fset;
		if (!CPU_IN_SET(cpuset, i))
			continue;

		fset = (uchar_t *)((uintptr_t)argdata +
		    sizeof (x86_featureset) * i);

		if (!compare_x86_featureset(f0, fset)) {
			panic("Post microcode update CPU %d has "
			    "differing security feature (%p) set from CPU 0 "
			    "(%p), not appending to feature set", i,
			    (void *)fset, (void *)f0);
		}
	}

	mutex_exit(&cpu_lock);

	for (i = 0; i < NUM_X86_FEATURES; i++) {
		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
		    x86_feature_names[i]);
		if (is_x86_feature(f0, i)) {
			add_x86_feature(x86_featureset, i);
		}
	}
	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
}