Print this page
update
11967 need TAA mitigation
Portions contributed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/i86pc/os/cpuid.c
+++ new/usr/src/uts/i86pc/os/cpuid.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 + * Copyright 2020 Joyent, Inc.
26 27 */
27 28 /*
28 29 * Copyright (c) 2010, Intel Corporation.
29 30 * All rights reserved.
30 31 */
31 32 /*
32 33 * Portions Copyright 2009 Advanced Micro Devices, Inc.
33 34 */
34 -/*
35 - * Copyright 2019 Joyent, Inc.
36 - */
37 35
38 36 /*
39 37 * CPU Identification logic
40 38 *
41 39 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 40 * with the identification of CPUs, their features, and their topologies. More
43 41 * specifically, this file helps drive the following:
44 42 *
45 43 * 1. Enumeration of features of the processor which are used by the kernel to
46 44 * determine what features to enable or disable. These may be instruction set
47 45 * enhancements or features that we use.
48 46 *
49 47 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 48 * will be told about through the auxiliary vector.
51 49 *
52 50 * 3. Understanding the physical topology of the CPU such as the number of
53 51 * caches, how many cores it has, whether or not it supports
54 52 * simultaneous multi-threading (SMT), etc.
55 53 *
56 54 * ------------------------
57 55 * CPUID History and Basics
58 56 * ------------------------
59 57 *
60 58 * The cpuid instruction was added by Intel roughly around the time that the
61 59 * original Pentium was introduced. The purpose of cpuid was to tell in a
62 60 * programmatic fashion information about the CPU that previously was guessed
63 61 * at. For example, an important part of cpuid is that we can know what
64 62 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 63 * #UD, so this method allows a program (whether a user program or the kernel)
66 64 * to determine what exists without crashing or getting a SIGILL. Of course,
67 65 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 66 * name shows up first in cpuid for a reason.
69 67 *
70 68 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71 69 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 70 * its own meaning. The different leaves are broken down into different regions:
73 71 *
74 72 * [ 0, 7fffffff ] This region is called the 'basic'
75 73 * region. This region is generally defined
76 74 * by Intel, though some of the original
77 75 * portions have different meanings based
78 76 * on the manufacturer. These days, Intel
79 77 * adds most new features to this region.
80 78 * AMD adds non-Intel compatible
81 79 * information in the third, extended
82 80 * region. Intel uses this for everything
83 81 * including ISA extensions, CPU
84 82 * features, cache information, topology,
85 83 * and more.
86 84 *
87 85 * There is a hole carved out of this
88 86 * region which is reserved for
89 87 * hypervisors.
90 88 *
91 89 * [ 40000000, 4fffffff ] This region, which is found in the
92 90 * middle of the previous region, is
93 91 * explicitly promised to never be used by
94 92 * CPUs. Instead, it is used by hypervisors
95 93 * to communicate information about
96 94 * themselves to the operating system. The
97 95 * values and details are unique for each
98 96 * hypervisor.
99 97 *
100 98 * [ 80000000, ffffffff ] This region is called the 'extended'
101 99 * region. Some of the low leaves mirror
102 100 * parts of the basic leaves. This region
103 101 * has generally been used by AMD for
104 102 * various extensions. For example, AMD-
105 103 * specific information about caches,
106 104 * features, and topology are found in this
107 105 * region.
108 106 *
109 107 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110 108 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111 109 * the ranges, one of the primary things returned is the maximum valid leaf in
112 110 * that range. This allows for discovery of what range of CPUID is valid.
113 111 *
114 112 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 113 * unimplemented leaf. If the requested leaf is within the valid basic or
116 114 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 115 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 116 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 117 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 118 * an invalid extended leaf will return the information for leaf 3.
121 119 *
122 120 * Some leaves are broken down into sub-leaves. This means that the value
123 121 * depends on both the leaf asked for in %eax and a secondary register. For
124 122 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 123 * additional information. Or when getting topology information in leaf 0xb, the
126 124 * initial value in %ecx changes which level of the topology that you are
127 125 * getting information about.
128 126 *
129 127 * cpuid values are always kept to 32 bits regardless of whether or not the
130 128 * program is in 64-bit mode. When executing in 64-bit mode, the upper
131 129 * 32 bits of the register are always set to zero so that way the values are the
132 130 * same regardless of execution mode.
133 131 *
134 132 * ----------------------
135 133 * Identifying Processors
136 134 * ----------------------
137 135 *
138 136 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 137 * 0. Leaf 0 contains the processor's vendor information. This is done by
140 138 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141 139 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
142 140 *
143 141 * From there, a processor is identified by a combination of three different
144 142 * values:
145 143 *
146 144 * 1. Family
147 145 * 2. Model
148 146 * 3. Stepping
149 147 *
150 148 * Each vendor uses the family and model to uniquely identify a processor. The
151 149 * way that family and model are changed depends on the vendor. For example,
152 150 * Intel has been using family 0x6 for almost all of their processors since the
153 151 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 152 * identify the exact processor. Different models are often used for the client
155 153 * (consumer) and server parts. Even though each processor often has major
156 154 * architectural differences, they still are considered the same family by
157 155 * Intel.
158 156 *
159 157 * On the other hand, each major AMD architecture generally has its own family.
160 158 * For example, the K10 is family 0x10, Bulldozer 0x15, and Zen 0x17. Within it
161 159 * the model number is used to help identify specific processors.
162 160 *
163 161 * The stepping is used to refer to a revision of a specific microprocessor. The
164 162 * term comes from equipment used to produce masks that are used to create
165 163 * integrated circuits.
166 164 *
167 165 * The information is present in leaf 1, %eax. In technical documentation you
168 166 * will see the terms extended model and extended family. The original family,
169 167 * model, and stepping fields were each 4 bits wide. If the values in either
170 168 * are 0xf, then one is to consult the extended model and extended family, which
171 169 * take previously reserved bits and allow for a larger number of models and add
172 170 * 0xf to them.
173 171 *
174 172 * When we process this information, we store the full family, model, and
175 173 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176 174 * cpi_step, respectively. Whenever you are performing comparisons with the
177 175 * family, model, and stepping, you should use these members and not the raw
178 176 * values from cpuid. If you must use the raw values from cpuid directly, you
179 177 * must make sure that you add the extended model and family to the base model
180 178 * and family.
181 179 *
182 180 * In general, we do not use information about the family, model, and stepping
183 181 * to determine whether or not a feature is present; that is generally driven by
184 182 * specific leaves. However, when something we care about on the processor is
185 183 * not considered 'architectural' meaning that it is specific to a set of
186 184 * processors and not promised in the architecture model to be consistent from
187 185 * generation to generation, then we will fall back on this information. The
188 186 * most common cases where this comes up are when we have to work around errata
189 187 * in the processor, are dealing with processor-specific features such as CPU
190 188 * performance counters, or we want to provide additional information for things
191 189 * such as fault management.
192 190 *
193 191 * While processors also do have a brand string, which is the name that people
194 192 * are familiar with when buying the processor, they are not meant for
195 193 * programmatic consumption. That is what the family, model, and stepping are
196 194 * for.
197 195 *
198 196 * ------------
199 197 * CPUID Passes
200 198 * ------------
201 199 *
202 200 * As part of performing feature detection, we break this into several different
203 201 * passes. The passes are as follows:
204 202 *
205 203 * Pass 0 This is a primordial pass done in locore.s to deal with
206 204 * Cyrix CPUs that don't support cpuid. The reality is that
207 205 * we likely don't run on them any more, but there is still
208 206 * logic for handling them.
209 207 *
210 208 * Pass 1 This is the primary pass and is responsible for doing a
211 209 * large number of different things:
212 210 *
213 211 * 1. Determine which vendor manufactured the CPU and
214 212 * determining the family, model, and stepping information.
215 213 *
216 214 * 2. Gathering a large number of feature flags to
217 215 * determine which features the CPU supports and which
218 216 * indicate things that we need to do other work in the OS
219 217 * to enable. Features detected this way are added to the
220 218 * x86_featureset which can be queried to
221 219 * determine what we should do. This includes processing
222 220 * all of the basic and extended CPU features that we care
223 221 * about.
224 222 *
225 223 * 3. Determining the CPU's topology. This includes
226 224 * information about how many cores and threads are present
227 225 * in the package. It also is responsible for figuring out
228 226 * which logical CPUs are potentially part of the same core
229 227 * and what other resources they might share. For more
230 228 * information see the 'Topology' section.
231 229 *
232 230 * 4. Determining the set of CPU security-specific features
233 231 * that we need to worry about and determine the
234 232 * appropriate set of workarounds.
235 233 *
236 234 * Pass 1 on the boot CPU occurs before KMDB is started.
237 235 *
238 236 * Pass 2 The second pass is done after startup(). Here, we check
239 237 * other miscellaneous features. Most of this is gathering
240 238 * additional basic and extended features that we'll use in
241 239 * later passes or for debugging support.
242 240 *
243 241 * Pass 3 The third pass occurs after the kernel memory allocator
244 242 * has been fully initialized. This gathers information
245 243 * where we might need dynamic memory available for our
246 244 * uses. This includes several varying width leaves that
247 245 * have cache information and the processor's brand string.
248 246 *
249 247 * Pass 4 The fourth and final normal pass is performed after the
250 248 * kernel has brought most everything online. This is
251 249 * invoked from post_startup(). In this pass, we go through
252 250 * the set of features that we have enabled and turn that
253 251 * into the hardware auxiliary vector features that
254 252 * userland receives. This is used by userland, primarily
255 253 * by the run-time link-editor (RTLD), though userland
256 254 * software could also refer to it directly.
257 255 *
258 256 * Microcode After a microcode update, we do a selective rescan of
259 257 * the cpuid leaves to determine what features have
260 258 * changed. Microcode updates can provide more details
261 259 * about security related features to deal with issues like
262 260 * Spectre and L1TF. On occasion, vendors have violated
263 261 * their contract and removed bits. However, we don't try
264 262 * to detect that because that puts us in a situation that
265 263 * we really can't deal with. As such, the only thing we
266 264 * rescan are security related features today. See
267 265 * cpuid_pass_ucode().
268 266 *
269 267 * All of the passes (except pass 0) are run on all CPUs. However, for the most
270 268 * part we only care about what the boot CPU says about this information and use
271 269 * the other CPUs as a rough guide to sanity check that we have the same feature
272 270 * set.
273 271 *
274 272 * We do not support running multiple logical CPUs with different, let alone
275 273 * disjoint, feature sets.
276 274 *
277 275 * ------------------
278 276 * Processor Topology
279 277 * ------------------
280 278 *
281 279 * One of the important things that we need to do is to understand the topology
282 280 * of the underlying processor. When we say topology in this case, we're trying
283 281 * to understand the relationship between the logical CPUs that the operating
284 282 * system sees and the underlying physical layout. Different logical CPUs may
285 283 * share different resources which can have important consequences for the
286 284 * performance of the system. For example, they may share caches, execution
287 285 * units, and more.
288 286 *
289 287 * The topology of the processor changes from generation to generation and
290 288 * vendor to vendor. Along with that, different vendors use different
291 289 * terminology, and the operating system itself uses occasionally overlapping
292 290 * terminology. It's important to understand what this topology looks like so
293 291 * one can understand the different things that we try to calculate and
294 292 * determine.
295 293 *
296 294 * To get started, let's talk about a little bit of terminology that we've used
297 295 * so far, is used throughout this file, and is fairly generic across multiple
298 296 * vendors:
299 297 *
300 298 * CPU
301 299 * A central processing unit (CPU) refers to a logical and/or virtual
302 300 * entity that the operating system can execute instructions on. The
303 301 * underlying resources for this CPU may be shared between multiple
304 302 * entities; however, to the operating system it is a discrete unit.
305 303 *
306 304 * PROCESSOR and PACKAGE
307 305 *
308 306 * Generally, when we use the term 'processor' on its own, we are referring
309 307 * to the physical entity that one buys and plugs into a board. However,
310 308 * because processor has been overloaded and one might see it used to mean
311 309 * multiple different levels, we will instead use the term 'package' for
312 310 * the rest of this file. The term package comes from the electrical
313 311 * engineering side and refers to the physical entity that encloses the
314 312 * electronics inside. Strictly speaking the package can contain more than
315 313 * just the CPU, for example, on many processors it may also have what's
316 314 * called an 'integrated graphical processing unit (GPU)'. Because the
317 315 * package can encapsulate multiple units, it is the largest physical unit
318 316 * that we refer to.
319 317 *
320 318 * SOCKET
321 319 *
322 320 * A socket refers to a unit on a system board (generally the motherboard)
323 321 * that can receive a package. A single package, or processor, is plugged
324 322 * into a single socket. A system may have multiple sockets. Often times,
325 323 * the term socket is used interchangeably with package and refers to the
326 324 * electrical component that has plugged in, and not the receptacle itself.
327 325 *
328 326 * CORE
329 327 *
330 328 * A core refers to the physical instantiation of a CPU, generally, with a
331 329 * full set of hardware resources available to it. A package may contain
332 330 * multiple cores inside of it or it may just have a single one. A
333 331 * processor with more than one core is often referred to as 'multi-core'.
334 332 * In illumos, we will use the feature X86FSET_CMP to refer to a system
335 333 * that has 'multi-core' processors.
336 334 *
337 335 * A core may expose a single logical CPU to the operating system, or it
338 336 * may expose multiple CPUs, which we call threads, defined below.
339 337 *
340 338 * Some resources may still be shared by cores in the same package. For
341 339 * example, many processors will share the level 3 cache between cores.
342 340 * Some AMD generations share hardware resources between cores. For more
343 341 * information on that see the section 'AMD Topology'.
344 342 *
345 343 * THREAD and STRAND
346 344 *
347 345 * In this file, generally a thread refers to a hardware resource and not
348 346 * the operating system's logical abstraction. A thread is always exposed
349 347 * as an independent logical CPU to the operating system. A thread belongs
350 348 * to a specific core. A core may have more than one thread. When that is
351 349 * the case, the threads that are part of the same core are often referred
352 350 * to as 'siblings'.
353 351 *
354 352 * When multiple threads exist, this is generally referred to as
355 353 * simultaneous multi-threading (SMT). When Intel introduced this in their
356 354 * processors they called it hyper-threading (HT). When multiple threads
357 355 * are active in a core, they split the resources of the core. For example,
358 356 * two threads may share the same set of hardware execution units.
359 357 *
360 358 * The operating system often uses the term 'strand' to refer to a thread.
361 359 * This helps disambiguate it from the software concept.
362 360 *
363 361 * CHIP
364 362 *
365 363 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
366 364 * base meaning, it is used to refer to a single integrated circuit, which
367 365 * may or may not be the only thing in the package. In illumos, when you
368 366 * see the term 'chip' it is almost always referring to the same thing as
369 367 * the 'package'. However, many vendors may use chip to refer to one of
370 368 * many integrated circuits that have been placed in the package. As an
371 369 * example, see the subsequent definition.
372 370 *
373 371 * To try and keep things consistent, we will only use chip when referring
374 372 * to the entire integrated circuit package, with the exception of the
375 373 * definition of multi-chip module (because it is in the name) and use the
376 374 * term 'die' when we want the more general, potential sub-component
377 375 * definition.
378 376 *
379 377 * DIE
380 378 *
381 379 * A die refers to an integrated circuit. Inside of the package there may
382 380 * be a single die or multiple dies. This is sometimes called a 'chip' in
383 381 * vendor's parlance, but in this file, we use the term die to refer to a
384 382 * subcomponent.
385 383 *
386 384 * MULTI-CHIP MODULE
387 385 *
388 386 * A multi-chip module (MCM) refers to putting multiple distinct chips that
389 387 * are connected together in the same package. When a multi-chip design is
390 388 * used, generally each chip is manufactured independently and then joined
391 389 * together in the package. For example, on AMD's Zen microarchitecture
392 390 * (family 0x17), the package contains several dies (the second meaning of
393 391 * chip from above) that are connected together.
394 392 *
395 393 * CACHE
396 394 *
397 395 * A cache is a part of the processor that maintains copies of recently
398 396 * accessed memory. Caches are split into levels and then into types.
399 397 * Commonly there are one to three levels, called level one, two, and
400 398 * three. The lower the level, the smaller it is, the closer it is to the
401 399 * execution units of the CPU, and the faster it is to access. The layout
402 400 * and design of the cache come in many different flavors, consult other
403 401 * resources for a discussion of those.
404 402 *
405 403 * Caches are generally split into two types, the instruction and data
406 404 * cache. The caches contain what their names suggest, the instruction
407 405 * cache has executable program text, while the data cache has all other
408 406 * memory that the processor accesses. As of this writing, data is kept
409 407 * coherent between all of the caches on x86, so if one modifies program
410 408 * text before it is executed, that will be in the data cache, and the
411 409 * instruction cache will be synchronized with that change when the
412 410 * processor actually executes those instructions. This coherency also
413 411 * covers the fact that data could show up in multiple caches.
414 412 *
415 413 * Generally, the lowest level caches are specific to a core. However, the
416 414 * last layer cache is shared between some number of cores. The number of
417 415 * CPUs sharing this last level cache is important. This has implications
418 416 * for the choices that the scheduler makes, as accessing memory that might
419 417 * be in a remote cache after thread migration can be quite expensive.
420 418 *
421 419 * Sometimes, the word cache is abbreviated with a '$', because in US
422 420 * English the word cache is pronounced the same as cash. So L1D$ refers to
423 421 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
424 422 * in the rest of this theory statement for clarity.
425 423 *
426 424 * MEMORY CONTROLLER
427 425 *
428 426 * The memory controller is a component that provides access to DRAM. Each
429 427 * memory controller can access a set number of DRAM channels. Each channel
430 428 * can have a number of DIMMs (sticks of memory) associated with it. A
431 429 * given package may have more than one memory controller. The association
432 430 * of the memory controller to a group of cores is important as it is
433 431 * cheaper to access memory on the controller that you are associated with.
434 432 *
435 433 * NUMA
436 434 *
437 435 * NUMA or non-uniform memory access, describes a way that systems are
438 436 * built. On x86, any processor core can address all of the memory in the
439 437 * system. However, when using multiple sockets or possibly within a
440 438 * multi-chip module, some of that memory is physically closer and some of
441 439 * it is further. Memory that is further away is more expensive to access.
442 440 * Consider the following image of multiple sockets with memory:
443 441 *
444 442 * +--------+ +--------+
445 443 * | DIMM A | +----------+ +----------+ | DIMM D |
446 444 * +--------+-+ | | | | +-+------+-+
447 445 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
448 446 * +--------+-+ | | | | +-+------+-+
449 447 * | DIMM C | +----------+ +----------+ | DIMM F |
450 448 * +--------+ +--------+
451 449 *
452 450 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
453 451 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
454 452 * access DIMMs A-C and more expensive to access D-F as it has to go
455 453 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
456 454 * D-F are cheaper than A-C. While the socket form is the most common, when
457 455 * using multi-chip modules, this can also sometimes occur. For another
458 456 * example of this that's more involved, see the AMD topology section.
459 457 *
460 458 *
461 459 * Intel Topology
462 460 * --------------
463 461 *
464 462 * Most Intel processors since Nehalem (as of this writing the current gen
465 463 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
466 464 * the package is a single monolithic die. MCMs currently aren't used. Most
467 465 * parts have three levels of caches, with the L3 cache being shared between
468 466 * all of the cores on the package. The L1/L2 cache is generally specific to
469 467 * an individual core. The following image shows at a simplified level what
470 468 * this looks like. The memory controller is commonly part of something called
471 469 * the 'Uncore', that used to be separate physical chips that were not a part of
472 470 * the package, but are now part of the same chip.
473 471 *
474 472 * +-----------------------------------------------------------------------+
475 473 * | Package |
476 474 * | +-------------------+ +-------------------+ +-------------------+ |
477 475 * | | Core | | Core | | Core | |
478 476 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
479 477 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
480 478 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
481 479 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
482 480 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
483 481 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
484 482 * | | +--------------+ | | +--------------+ | | +--------------+ | |
485 483 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
486 484 * | | +--------------+ | | +--------------+ | | +--------------+ | |
487 485 * | +-------------------+ +-------------------+ +-------------------+ |
488 486 * | +-------------------------------------------------------------------+ |
489 487 * | | Shared L3 Cache | |
490 488 * | +-------------------------------------------------------------------+ |
491 489 * | +-------------------------------------------------------------------+ |
492 490 * | | Memory Controller | |
493 491 * | +-------------------------------------------------------------------+ |
494 492 * +-----------------------------------------------------------------------+
495 493 *
496 494 * A side effect of this current architecture is that what we care about from a
497 495 * scheduling and topology perspective, is simplified. In general we care about
498 496 * understanding which logical CPUs are part of the same core and socket.
499 497 *
500 498 * To determine the relationship between threads and cores, Intel initially used
501 499 * the identifier in the advanced programmable interrupt controller (APIC). They
502 500 * also added cpuid leaf 4 to give additional information about the number of
503 501 * threads and CPUs in the processor. With the addition of x2apic (which
504 502 * increased the size of the logical CPU identifier from 8 bits to 32 bits), an
505 503 * additional cpuid topology leaf 0xB was added.
506 504 *
507 505 * AMD Topology
508 506 * ------------
509 507 *
510 508 * When discussing AMD topology, we want to break this into three distinct
511 509 * generations of topology. There's the basic topology that has been used in
512 510 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
513 511 * with family 0x15 (Bulldozer), and there's the topology that was introduced
514 512 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
515 513 * talking about.
516 514 *
517 515 * Until the introduction of family 0x17 (Zen), AMD did not implement something
518 516 * that they considered SMT. Whether or not the AMD processors have SMT
519 517 * influences many things including scheduling and reliability, availability,
520 518 * and serviceability (RAS) features.
521 519 *
522 520 * NODE
523 521 *
524 522 * AMD uses the term node to refer to a die that contains a number of cores
525 523 * and I/O resources. Depending on the processor family and model, more
526 524 * than one node can be present in the package. When there is more than one
527 525 * node this indicates a multi-chip module. Usually each node has its own
528 526 * access to memory and I/O devices. This is important and generally
529 527 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
530 528 * result, we track this relationship in the operating system.
531 529 *
532 530 * In processors with an L3 cache, the L3 cache is generally shared across
533 531 * the entire node, though the way this is carved up varies from generation
534 532 * to generation.
535 533 *
536 534 * BULLDOZER
537 535 *
538 536 * Starting with the Bulldozer family (0x15) and continuing until the
539 537 * introduction of the Zen microarchitecture, AMD introduced the idea of a
540 538 * compute unit. In a compute unit, two traditional cores share a number of
541 539 * hardware resources. Critically, they share the FPU, L1 instruction
542 540 * cache, and the L2 cache. Several compute units were then combined inside
543 541 * of a single node. Because the integer execution units, L1 data cache,
544 542 * and some other resources were not shared between the cores, AMD never
545 543 * considered this to be SMT.
546 544 *
547 545 * ZEN
548 546 *
549 547 * The Zen family (0x17) uses a multi-chip module (MCM) design; the die
550 548 * is called Zeppelin. These dies are similar to the idea of nodes used
551 549 * previously. Each of these nodes has two DRAM channels which all of the
552 550 * cores in the node can access uniformly. These nodes are linked together
553 551 * in the package, creating a NUMA environment.
554 552 *
555 553 * The Zeppelin die itself contains two different 'core complexes'. Each
556 554 * core complex consists of four cores which each have two threads, for a
557 555 * total of 8 logical CPUs per complex. Unlike other generations,
558 556 * where all the logical CPUs in a given node share the L3 cache, here each
559 557 * core complex has its own shared L3 cache.
560 558 *
561 559 * A further thing that we need to consider is that in some configurations,
562 560 * particularly with the Threadripper line of processors, not every die
563 561 * actually has its memory controllers wired up to actual memory channels.
564 562 * This means that some cores have memory attached to them and others
565 563 * don't.
566 564 *
567 565 * To put Zen in perspective, consider the following images:
568 566 *
569 567 * +--------------------------------------------------------+
570 568 * | Core Complex |
571 569 * | +-------------------+ +-------------------+ +---+ |
572 570 * | | Core +----+ | | Core +----+ | | | |
573 571 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
574 572 * | | | Thread | +----+ | | | Thread | +----+ | | | |
575 573 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
576 574 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
577 575 * | | +--------+ +--+ | | +--------+ +--+ | | | |
578 576 * | +-------------------+ +-------------------+ | C | |
579 577 * | +-------------------+ +-------------------+ | a | |
580 578 * | | Core +----+ | | Core +----+ | | c | |
581 579 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
582 580 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
583 581 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
584 582 * | | | Thread | |L1| | | | Thread | |L1| | | | |
585 583 * | | +--------+ +--+ | | +--------+ +--+ | | | |
586 584 * | +-------------------+ +-------------------+ +---+ |
587 585 * | |
588 586 * +--------------------------------------------------------+
589 587 *
590 588 * This first image represents a single Zen core complex that consists of four
591 589 * cores.
592 590 *
593 591 *
594 592 * +--------------------------------------------------------+
595 593 * | Zeppelin Die |
596 594 * | +--------------------------------------------------+ |
597 595 * | | I/O Units (PCIe, SATA, USB, etc.) | |
598 596 * | +--------------------------------------------------+ |
599 597 * | HH |
600 598 * | +-----------+ HH +-----------+ |
601 599 * | | | HH | | |
602 600 * | | Core |==========| Core | |
603 601 * | | Complex |==========| Complex | |
604 602 * | | | HH | | |
605 603 * | +-----------+ HH +-----------+ |
606 604 * | HH |
607 605 * | +--------------------------------------------------+ |
608 606 * | | Memory Controller | |
609 607 * | +--------------------------------------------------+ |
610 608 * | |
611 609 * +--------------------------------------------------------+
612 610 *
613 611 * This image represents a single Zeppelin Die. Note how both core complexes
614 612 * are connected to the same memory controller and I/O units. While each core
615 613 * complex has its own L3 cache as seen in the first image, they both have
616 614 * uniform access to memory.
617 615 *
618 616 *
619 617 * PP PP
620 618 * PP PP
621 619 * +----------PP---------------------PP---------+
622 620 * | PP PP |
623 621 * | +-----------+ +-----------+ |
624 622 * | | | | | |
625 623 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
626 624 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
627 625 * | | | | | |
628 626 * | +-----------+ooo ...+-----------+ |
629 627 * | HH ooo ... HH |
630 628 * | HH oo.. HH |
631 629 * | HH ..oo HH |
632 630 * | HH ... ooo HH |
633 631 * | +-----------+... ooo+-----------+ |
634 632 * | | | | | |
635 633 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
636 634 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
637 635 * | | | | | |
638 636 * | +-----------+ +-----------+ |
639 637 * | PP PP |
640 638 * +----------PP---------------------PP---------+
641 639 * PP PP
642 640 * PP PP
643 641 *
644 642 * This image represents a single Zen package. In this example, it has four
645 643 * Zeppelin dies, though some configurations only have a single one. In this
646 644 * example, each die is directly connected to the next. Also, each die is
647 645 * represented as being connected to memory by the 'M' character and connected
648 646 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
649 647 * die is made up of two core complexes, we have multiple different NUMA
650 648 * domains that we care about for these systems.
651 649 *
652 650 * CPUID LEAVES
653 651 *
654 652 * There are a few different CPUID leaves that we can use to try and understand
655 653 * the actual state of the world. As part of the introduction of family 0xf, AMD
656 654 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
657 655 * processors that are in the system. Because families before Zen didn't have
658 656 * SMT, this was always the number of cores that were in the system. However, it
659 657 * should always be thought of as the number of logical threads to be consistent
660 658 * between generations. In addition we also get the size of the APIC ID that is
661 659 * used to represent the number of logical processors. This is important for
662 660 * deriving topology information.
663 661 *
664 662 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
665 663 * bit between Bulldozer and later families, but it is quite useful in
666 664 * determining the topology information. Because this information has changed
667 665 * across family generations, it's worth calling out what these mean
668 666 * explicitly. The registers have the following meanings:
669 667 *
670 668 * %eax The APIC ID. The entire register is defined to have a 32-bit
671 669 * APIC ID, even though on systems without x2apic support, it will
672 670 * be limited to 8 bits.
673 671 *
674 672 * %ebx On Bulldozer-era systems this contains information about the
675 673 * number of cores that are in a compute unit (cores that share
676 674 * resources). It also contains a per-package compute unit ID that
677 675 * identifies which compute unit the logical CPU is a part of.
678 676 *
679 677 * On Zen-era systems this instead contains the number of threads
680 678 * per core and the ID of the core that the logical CPU is a part
681 679 * of. Note, this ID is unique only to the package, it is not
682 680 * globally unique across the entire system.
683 681 *
684 682 * %ecx This contains the number of nodes that exist in the package. It
685 683 * also contains an ID that identifies which node the logical CPU
686 684 * is a part of.
687 685 *
688 686 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
689 687 * cache layout to determine which logical CPUs are sharing which caches.
690 688 *
691 689 * illumos Topology
692 690 * ----------------
693 691 *
694 692 * Based on the above we synthesize the information into several different
695 693 * variables that we store in the 'struct cpuid_info'. We'll go into the details
696 694 * of what each member is supposed to represent and their uniqueness. In
697 695 * general, there are two levels of uniqueness that we care about. We care about
698 696 * an ID that is globally unique. That means that it will be unique across all
699 697 * entities in the system. For example, the default logical CPU ID is globally
700 698 * unique. On the other hand, there is some information that we only care about
701 699 * being unique within the context of a single package / socket. Here are the
702 700 * variables that we keep track of and their meaning.
703 701 *
704 702 * Several of the values that are asking for an identifier, with the exception
705 703 * of cpi_apicid, are allowed to be synthetic.
706 704 *
707 705 *
708 706 * cpi_apicid
709 707 *
710 708 * This is the value of the CPU's APIC id. This should be the full 32-bit
711 709 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
712 710 * APIC ID. This value is globally unique between all logical CPUs across
713 711 * all packages. This is usually required by the APIC.
714 712 *
715 713 * cpi_chipid
716 714 *
717 715 * This value indicates the ID of the package that the logical CPU is a
718 716 * part of. This value is allowed to be synthetic. It is usually derived by
719 717 * taking the CPU's APIC ID and determining how many bits are used to
720 718 * represent CPU cores in the package. All logical CPUs that are part of
721 719 * the same package must have the same value.
722 720 *
723 721 * cpi_coreid
724 722 *
725 723 * This represents the ID of a CPU core. Two logical CPUs should only have
726 724 * the same cpi_coreid value if they are part of the same core. These
727 725 * values may be synthetic. On systems that support SMT, this value is
728 726 * usually derived from the APIC ID, otherwise it is often synthetic and
729 727 * just set to the value of the cpu_id in the cpu_t.
730 728 *
731 729 * cpi_pkgcoreid
732 730 *
733 731 * This is similar to the cpi_coreid in that logical CPUs that are part of
734 732 * the same core should have the same ID. The main difference is that these
735 733 * values are only required to be unique to a given socket.
736 734 *
737 735 * cpi_clogid
738 736 *
739 737 * This represents the logical ID of a logical CPU. This value should be
740 738 * unique within a given socket for each logical CPU. This is allowed to be
741 739 * synthetic, though it is usually based off of the CPU's apic ID. The
742 740 * broader system expects that logical CPUs that are part of the same
743 741 * core have contiguous numbers. For example, if there were two threads per
744 742 * core, then their IDs divided by two should be equal, with the first ID
745 743 * modulo two being zero and the second being one. For example, IDs 4 and 5
746 744 * indicate two logical CPUs that are part of the same core. But IDs 5 and
747 745 * 6 represent two logical CPUs that are part of different cores.
748 746 *
749 747 * While it is common for the cpi_coreid and the cpi_clogid to be derived
750 748 * from the same source, strictly speaking, they don't have to be and the
751 749 * two values should be considered logically independent. One should not
752 750 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
753 751 * some kind of relationship. While this is tempting, we've seen cases on
754 752 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
755 753 *
756 754 * cpi_ncpu_per_chip
757 755 *
758 756 * This value indicates the total number of logical CPUs that exist in the
759 757 * physical package. Critically, this is not the number of logical CPUs
760 758 * that exist for just the single core.
761 759 *
762 760 * This value should be the same for all logical CPUs in the same package.
763 761 *
764 762 * cpi_ncore_per_chip
765 763 *
766 764 * This value indicates the total number of physical CPU cores that exist
767 765 * in the package. The system compares this value with cpi_ncpu_per_chip to
768 766 * determine if simultaneous multi-threading (SMT) is enabled. When
769 767 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
770 768 * the X86FSET_HTT feature is not set. If this value is greater than one,
771 769 * then we consider the processor to have the feature X86FSET_CMP, to
772 770 * indicate that there is support for more than one core.
773 771 *
774 772 * This value should be the same for all logical CPUs in the same package.
775 773 *
776 774 * cpi_procnodes_per_pkg
777 775 *
778 776 * This value indicates the number of 'nodes' that exist in the package.
779 777 * When processors are actually a multi-chip module, this represents the
780 778 * number of such modules that exist in the package. Currently, on Intel
781 779 * based systems this member is always set to 1.
782 780 *
783 781 * This value should be the same for all logical CPUs in the same package.
784 782 *
785 783 * cpi_procnodeid
786 784 *
787 785 * This value indicates the ID of the node that the logical CPU is a part
788 786 * of. All logical CPUs that are in the same node must have the same value
789 787 * here. This value must be unique across all of the packages in the
790 788 * system. On Intel based systems, this is currently set to the value in
791 789 * cpi_chipid because there is only one node.
792 790 *
793 791 * cpi_cores_per_compunit
794 792 *
795 793 * This value indicates the number of cores that are part of a compute
796 794 * unit. See the AMD topology section for this. This member only has real
797 795 * meaning currently for AMD Bulldozer family processors. For all other
798 796 * processors, this should currently be set to 1.
799 797 *
800 798 * cpi_compunitid
801 799 *
802 800 * This indicates the compute unit that the logical CPU belongs to. For
803 801 * processors without AMD Bulldozer-style compute units this should be set
804 802 * to the value of cpi_coreid.
805 803 *
806 804 * cpi_ncpu_shr_last_cache
807 805 *
808 806 * This indicates the number of logical CPUs that are sharing the same last
809 807 * level cache. This value should be the same for all CPUs that are sharing
810 808 * that cache. The last cache refers to the cache that is closest to memory
811 809 * and furthest away from the CPU.
812 810 *
813 811 * cpi_last_lvl_cacheid
814 812 *
815 813 * This indicates the ID of the last cache that the logical CPU uses. This
816 814 * cache is often shared between multiple logical CPUs and is the cache
817 815 * that is closest to memory and furthest away from the CPU. This value
818 816 * should be the same for a group of logical CPUs only if they actually
819 817 * share the same last level cache. IDs should not overlap between
820 818 * packages.
821 819 *
822 820 * cpi_ncore_bits
823 821 *
824 822 * This indicates the number of bits that are required to represent all of
825 823 * the cores in the system. As cores are derived based on their APIC IDs,
826 824 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
827 825 * this value to be larger than the actual number of IDs that are present
828 826 * in the system. This is used to size tables by the CMI framework. It is
829 827 * only filled in for Intel and AMD CPUs.
830 828 *
831 829 * cpi_nthread_bits
832 830 *
833 831 * This indicates the number of bits required to represent all of the IDs
834 832 * that cover the logical CPUs that exist on a given core. It's OK for this
835 833 * value to be larger than the actual number of IDs that are present in the
836 834 * system. This is used to size tables by the CMI framework. It is
837 835 * only filled in for Intel and AMD CPUs.
838 836 *
839 837 * -----------
840 838 * Hypervisors
841 839 * -----------
842 840 *
843 841 * If trying to manage the differences between vendors wasn't bad enough, it can
844 842 * get worse thanks to our friend hardware virtualization. Hypervisors are given
845 843 * the ability to interpose on all cpuid instructions and change them to suit
846 844 * their purposes. In general, this is necessary as the hypervisor wants to be
847 845 * able to present a more uniform set of features or not necessarily give the
848 846 * guest operating system kernel knowledge of all features so it can be
849 847 * more easily migrated between systems.
850 848 *
851 849 * When it comes to trying to determine topology information, this can be a
852 850 * double edged sword. When a hypervisor doesn't actually implement a cpuid
853 851 * leaf, it'll often return all zeros. Because of that, you'll often see various
854 852 * checks scattered about fields being non-zero before we assume we can use
855 853 * them.
856 854 *
857 855 * When it comes to topology information, the hypervisor is often incentivized
858 856 * to lie to you about topology. This is because it doesn't always actually
859 857 * guarantee that topology at all. The topology path we take in the system
860 858 * depends on how the CPU advertises itself. If it advertises itself as an Intel
861 859 * or AMD CPU, then we basically do our normal path. However, when they don't
862 860 * use an actual vendor, then that usually turns into multiple one-core CPUs
863 861 * that we enumerate that are often on different sockets. The actual behavior
864 862 * depends greatly on what the hypervisor actually exposes to us.
865 863 *
866 864 * --------------------
867 865 * Exposing Information
868 866 * --------------------
869 867 *
870 868 * We expose CPUID information in three different forms in the system.
871 869 *
872 870 * The first is through the x86_featureset variable. This is used in conjunction
873 871 * with the is_x86_feature() function. This is queried by x86-specific functions
874 872 * to determine which features are or aren't present in the system and to make
875 873 * decisions based upon them. For example, users of this include everything from
876 874 * parts of the system dedicated to reliability, availability, and
877 875 * serviceability (RAS), to making decisions about how to handle security
878 876 * mitigations, to various x86-specific drivers. General purpose or
879 877 * architecture independent drivers should never be calling this function.
880 878 *
881 879 * The second means is through the auxiliary vector. The auxiliary vector is a
882 880 * series of tagged data that the kernel passes down to a user program when it
883 881 * begins executing. This information is used to indicate to programs what
884 882 * instruction set extensions are present. For example, information about the
885 883 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 884 * since user programs cannot make use of it. However, things like the AVX
887 885 * instruction sets are. Programs use this information to make run-time
888 886 * decisions about what features they should use. As an example, the run-time
889 887 * link-editor (rtld) can relocate different functions depending on the hardware
890 888 * support available.
891 889 *
892 890 * The final form is through a series of accessor functions that all have the
893 891 * form cpuid_get*. This is used by a number of different subsystems in the
894 892 * kernel to determine more detailed information about what we're running on,
895 893 * topology information, etc. Some of these subsystems include processor groups
896 894 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 895 * microcode, and performance monitoring. These functions all ASSERT that the
898 896 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 897 * are rearranged, then this needs to be adjusted.
900 898 *
901 899 * -----------------------------------------------
902 900 * Speculative Execution CPU Side Channel Security
903 901 * -----------------------------------------------
904 902 *
905 903 * With the advent of the Spectre and Meltdown attacks which exploit speculative
906 904 * execution in the CPU to create side channels there have been a number of
907 905 * different attacks and corresponding issues that the operating system needs to
908 906 * mitigate against. The following list is some of the common, but not
909 907 * exhaustive, set of issues that we know about and have done some or need to do
910 908 * more work in the system to mitigate against:
911 909 *
912 910 * - Spectre v1
913 911 * - swapgs (Spectre v1 variant)
914 912 * - Spectre v2
915 913 * - Meltdown (Spectre v3)
916 914 * - Rogue Register Read (Spectre v3a)
917 915 * - Speculative Store Bypass (Spectre v4)
918 916 * - ret2spec, SpectreRSB
919 917 * - L1 Terminal Fault (L1TF)
920 918 * - Microarchitectural Data Sampling (MDS)
921 919 *
922 920 * Each of these requires different sets of mitigations and has different attack
923 921 * surfaces. For the most part, this discussion is about protecting the kernel
924 922 * from non-kernel executing environments such as user processes and hardware
925 923 * virtual machines. Unfortunately, there are a number of user vs. user
926 924 * scenarios that exist with these. The rest of this section will describe the
927 925 * overall approach that the system has taken to address these as well as their
928 926 * shortcomings. Unfortunately, not all of the above have been handled today.
929 927 *
930 928 * SPECTRE v2, ret2spec, SpectreRSB
931 929 *
932 930 * The second variant of the spectre attack focuses on performing branch target
933 931 * injection. This generally impacts indirect call instructions in the system.
934 932 * There are three different ways to mitigate this issue that are commonly
935 933 * described today:
936 934 *
937 935 * 1. Using Indirect Branch Restricted Speculation (IBRS).
938 936 * 2. Using Retpolines and RSB Stuffing
939 937 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
940 938 *
941 939 * IBRS uses a feature added to microcode to restrict speculation, among other
942 940 * things. This form of mitigation has not been used as it has been generally
943 941 * seen as too expensive and requires reactivation upon various transitions in
944 942 * the system.
945 943 *
946 944 * As a less impactful alternative to IBRS, retpolines were developed by
947 945 * Google. These basically require one to replace indirect calls with a specific
948 946 * trampoline that will cause speculation to fail and break the attack.
949 947 * Retpolines require compiler support. We always build with retpolines in the
950 948 * external thunk mode. This means that a traditional indirect call is replaced
951 949 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
952 950 * of this is that all indirect function calls are performed through a register.
953 951 *
954 952 * We have to use a common external location of the thunk and not inline it into
955 953 * the callsite so that way we can have a single place to patch these functions.
956 954 * As it turns out, we actually have three different forms of retpolines that
957 955 * exist in the system:
958 956 *
959 957 * 1. A full retpoline
960 958 * 2. An AMD-specific optimized retpoline
961 959 * 3. A no-op version
962 960 *
963 961 * The first one is used in the general case. The second one is used if we can
964 962 * determine that we're on an AMD system and we can successfully toggle the
965 963 * lfence serializing MSR that exists on the platform. Basically with this
966 964 * present, an lfence is sufficient and we don't need to do anywhere near as
967 965 * complicated a dance to successfully use retpolines.
968 966 *
969 967 * The third form described above is the most curious. It turns out that the way
970 968 * that retpolines are implemented is that they rely on how speculation is
971 969 * performed on a 'ret' instruction. Intel has continued to optimize this
972 970 * process (which is partly why we need to have return stack buffer stuffing,
973 971 * but more on that in a bit) and in processors starting with Cascade Lake
974 972 * on the server side, it's dangerous to rely on retpolines. Instead, a new
975 973 * mechanism has been introduced called Enhanced IBRS (EIBRS).
976 974 *
977 975 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
978 976 * physical core. However, if this is the case, we don't want to use retpolines
979 977 * any more. Therefore if EIBRS is present, we end up turning each retpoline
980 978 * function (called a thunk) into a jmp instruction. This means that we're still
981 979 * paying the cost of an extra jump to the external thunk, but it gives us
982 980 * flexibility and the ability to have a single kernel image that works across a
983 981 * wide variety of systems and hardware features.
984 982 *
985 983 * Unfortunately, this alone is insufficient. First, Skylake systems have
986 984 * additional speculation for the Return Stack Buffer (RSB) which is used to
987 985 * return from call instructions which retpolines take advantage of. However,
988 986 * this problem is not just limited to Skylake and is actually more pernicious.
989 987 * The SpectreRSB paper introduces several more problems that can arise with
990 988 * dealing with this. The RSB can be poisoned just like the indirect branch
991 989 * predictor. This means that one needs to clear the RSB when transitioning
992 990 * between two different privilege domains. Some examples include:
993 991 *
994 992 * - Switching between two different user processes
995 993 * - Going between user land and the kernel
996 994 * - Returning to the kernel from a hardware virtual machine
997 995 *
998 996 * Mitigating this involves combining a couple of different things. The first is
999 997 * SMEP (supervisor mode execution protection) which was introduced in Ivy
1000 998 * Bridge. When an RSB entry refers to a user address and we're executing in the
1001 999 * kernel, speculation through it will be stopped when SMEP is enabled. This
1002 1000 * protects against a number of the different cases that we would normally be
1003 1001 * worried about such as when we enter the kernel from user land.
1004 1002 *
1005 1003 * To protect against additional manipulation of the RSB from other contexts
1006 1004 * such as a non-root VMX context attacking the kernel we first look to enhanced
1007 1005 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1008 1006 * need to do to protect the kernel at this time.
1009 1007 *
1010 1008 * On CPUs without EIBRS we need to manually overwrite the contents of the
1011 1009 * return stack buffer. We do this through the x86_rsb_stuff() function.
1012 1010 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1013 1011 * disabled when enhanced IBRS is present because Intel claims on such systems
1014 1012 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1015 1013 * to user attacks via the RSB.
1016 1014 *
1017 1015 * If SMEP is not present, then we would have to stuff the RSB every time we
1018 1016 * transitioned from user mode to the kernel, which isn't very practical right
1019 1017 * now.
1020 1018 *
1021 1019 * To fully protect user to user and vmx to vmx attacks from these classes of
1022 1020 * issues, we would also need to allow them to opt into performing an Indirect
1023 1021 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1024 1022 *
1025 1023 * By default, the system will enable RSB stuffing and the required variant of
1026 1024 * retpolines and store that information in the x86_spectrev2_mitigation value.
1027 1025 * This will be evaluated after a microcode update as well, though it is
1028 1026 * expected that microcode updates will not take away features. This may mean
1029 1027 * that a late loaded microcode may not end up in the optimal configuration
1030 1028 * (though this should be rare).
1031 1029 *
1032 1030 * Currently we do not build kmdb with retpolines or perform any additional side
1033 1031 * channel security mitigations for it. One complication with kmdb is that it
1034 1032 * requires its own retpoline thunks and it would need to adjust itself based on
1035 1033 * what the kernel does. The threat model of kmdb is more limited and therefore
1036 1034 * it may make more sense to investigate using prediction barriers as the whole
1037 1035 * system is only executing a single instruction at a time while in kmdb.
1038 1036 *
1039 1037 * SPECTRE v1, v4
1040 1038 *
1041 1039 * The v1 and v4 variants of spectre are not currently mitigated in the
1042 1040 * system and require other classes of changes to occur in the code.
1043 1041 *
1044 1042 * SPECTRE v1 (SWAPGS VARIANT)
1045 1043 *
1046 1044 * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1047 1045 * can generally affect any branch-dependent code. The swapgs issue is one
1048 1046 * variant of this. If we are coming in from userspace, we can have code like
1049 1047 * this:
1050 1048 *
1051 1049 * cmpw $KCS_SEL, REGOFF_CS(%rsp)
1052 1050 * je 1f
1053 1051 * movq $0, REGOFF_SAVFP(%rsp)
1054 1052 * swapgs
1055 1053 * 1:
1056 1054 * movq %gs:CPU_THREAD, %rax
1057 1055 *
1058 1056 * If an attacker can cause a mis-speculation of the branch here, we could skip
1059 1057 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1060 1058 * load. If subsequent code can act as the usual Spectre cache gadget, this
1061 1059 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1062 1060 * any use of the %gs override.
1063 1061 *
1064 1062 * The other case is also an issue: if we're coming into a trap from kernel
1065 1063 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1066 1064 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1067 1065 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1068 1066 * case, and the fix is the same in both cases (an lfence at the branch target
1069 1067 * 1: in this example), we'll just do it unconditionally.
1070 1068 *
1071 1069 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1072 1070 * harder for user-space to actually set a useful %gsbase value: although it's
1073 1071 * not clear, it might still be feasible via lwp_setprivate(), though, so we
1074 1072 * mitigate anyway.
1075 1073 *
1076 1074 * MELTDOWN
1077 1075 *
1078 1076 * Meltdown, or spectre v3, allowed a user process to read any data in their
1079 1077 * address space regardless of whether or not the page tables in question
1080 1078 * allowed the user to have the ability to read them. The solution to meltdown
1081 1079 * is kernel page table isolation. In this world, there are two page tables that
1082 1080 * are used for a process, one in user land and one in the kernel. To implement
1083 1081 * this we use per-CPU page tables and switch between the user and kernel
1084 1082 * variants when entering and exiting the kernel. For more information about
1085 1083 * this process and how the trampolines work, please see the big theory
1086 1084 * statements and additional comments in:
1087 1085 *
1088 1086 * - uts/i86pc/ml/kpti_trampolines.s
1089 1087 * - uts/i86pc/vm/hat_i86.c
1090 1088 *
1091 1089 * While Meltdown (Rogue Data Cache Load) only impacted Intel systems and there
1092 1090 * are also Intel systems that have Meltdown fixed, we always have
1093 1091 * kernel page table isolation enabled. While this may at first seem weird, an
1094 1092 * important thing to remember is that you can't speculatively read an address
1095 1093 * if it's never in your page table at all. Having user processes without kernel
1096 1094 * pages present provides us with an important layer of defense in the kernel
1097 1095 * against any other side channel attacks that exist and have yet to be
1098 1096 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1099 1097 * default, no matter the x86 system.
1100 1098 *
1101 1099 * L1 TERMINAL FAULT
1102 1100 *
1103 1101 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1104 1102 * execution uses page table entries. Effectively, it is two different problems.
1105 1103 * The first is that it ignores the not present bit in the page table entries
1106 1104 * when performing speculative execution. This means that something can
1107 1105 * speculatively read the listed physical address if it's present in the L1
1108 1106 * cache under certain conditions (see Intel's documentation for the full set of
1109 1107 * conditions). Secondly, this can be used to bypass hardware virtualization
1110 1108 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1111 1109 * instructions.
1112 1110 *
1113 1111 * For the non-hardware virtualized case, this is relatively easy to deal with.
1114 1112 * We must make sure that all unmapped pages have an address of zero. This means
1115 1113 * that they could read the first 4k of physical memory; however, we never use
1116 1114 * that first page in the operating system and always skip putting it in our
1117 1115 * memory map, even if firmware tells us we can use it in our memory map. While
1118 1116 * other systems try to put extra metadata in the address and reserved bits,
1119 1117 * which led to this being problematic in those cases, we do not.
1120 1118 *
1121 1119 * For hardware virtual machines things are more complicated. Because they can
1122 1120 * construct their own page tables, it isn't hard for them to perform this
1123 1121 * attack against any physical address. The one wrinkle is that this physical
1124 1122 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1125 1123 * to flush the L1 data cache. We wrap this up in the function
1126 1124 * spec_uarch_flush(). This function is also used in the mitigation of
1127 1125 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1128 1126 * hypervisors such as KVM or bhyve are responsible for performing this before
1129 1127 * entering the guest.
1130 1128 *
1131 1129 * Because this attack takes place in the L1 cache, there's another wrinkle
1132 1130 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1133 1131 * designs. This means that when a thread enters a hardware virtualized context
1134 1132 * and flushes the L1 data cache, the other thread on the processor may then go
1135 1133 * ahead and put new data in it that can be potentially attacked. While one
1136 1134 * solution is to disable SMT on the system, another option that is available is
1137 1135 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1138 1136 * goes through and makes sure that if an HVM is being scheduled on one thread,
1139 1137 * then the thing on the other thread is from the same hardware virtual machine.
1140 1138 * If an interrupt comes in or the guest exits to the broader system, then the
1141 1139 * other SMT thread will be kicked out.
1142 1140 *
1143 1141 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1144 1142 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1145 1143 * perform L1TF related mitigations.
1146 1144 *
1147 1145 * MICROARCHITECTURAL DATA SAMPLING
1148 1146 *
1149 1147 * Microarchitectural data sampling (MDS) is a combination of four discrete
1150 1148 * vulnerabilities that are similar issues affecting various parts of the CPU's
1151 1149 * microarchitectural implementation around load, store, and fill buffers.
1152 1150 * Specifically it is made up of the following subcomponents:
1153 1151 *
1154 1152 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1155 1153 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1156 1154 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1157 1155 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1158 1156 *
1159 1157 * To begin addressing these, Intel has introduced another feature in microcode
1160 1158 * called MD_CLEAR. This changes the verw instruction to operate in a different
1161 1159 * way. This allows us to execute the verw instruction in a particular way to
1162 1160 * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1163 1161 * updated when this microcode is present to flush this state.
1164 1162 *
1165 1163 * Primarily we need to flush this state whenever we transition from the kernel
1166 1164 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1167 1165 * little bit different. Here the structures are statically sized when a logical
1168 1166 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1169 1167 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1170 1168 * mwait, or another ACPI method. To perform these flushes, we call
1171 1169 * x86_md_clear() at all of these transition points.
1172 1170 *
1173 1171 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1174 1172 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1175 1173 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1176 1174 * a no-op.
1177 1175 *
1178 1176 * Unfortunately, with this issue hyperthreading rears its ugly head. In
↓ open down ↓ |
1132 lines elided |
↑ open up ↑ |
1179 1177 * particular, everything we've discussed above is only valid for a single
1180 1178 * thread executing on a core. In the case where you have hyper-threading
1181 1179 * present, this attack can be performed between threads. The theoretical fix
1182 1180 * for this is to ensure that both threads are always in the same security
1183 1181 * domain. This means that they are executing in the same ring and mutually
1184 1182 * trust each other. Practically speaking, this would mean that a system call
1185 1183 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1186 1184 * Rather than implement this, we recommend that one disables hyper-threading
1187 1185 * through the use of psradm -aS.
1188 1186 *
1187 + * TSX ASYNCHRONOUS ABORT
1188 + *
1189 + * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1190 + * behaves like MDS, but leverages Intel's transactional instructions as another
1191 + * vector. Effectively, when a transaction hits one of these cases (unmapped
1192 + * page, various cache snoop activity, etc.) then the same data can be exposed
1193 + * as in the case of MDS. This means that you can attack your twin.
1194 + *
1195 + * Intel has described that there are two different ways that we can mitigate
1196 + * this problem on affected processors:
1197 + *
1198 + * 1) We can use the same techniques used to deal with MDS. Flushing the
1199 + * microarchitectural buffers and disabling hyperthreading will mitigate
1200 + * this in the same way.
1201 + *
1202 + * 2) Using microcode to disable TSX.
1203 + *
1204 + * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1205 + * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1206 + * That's OK as we're already doing all such mitigations. On the other hand,
1207 + * processors with MDS_NO are all supposed to receive microcode updates that
1208 + * enumerate support for disabling TSX. In general, we'd rather use this method
1209 + * when available as it doesn't require disabling hyperthreading to be
1210 + * effective. Currently we basically are relying on microcode for processors
1211 + * that enumerate MDS_NO.
1212 + *
1213 + * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1214 + * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1215 + * different powers. The first allows us to cause all transactions to
1216 + * immediately abort. The second gives us a means of disabling TSX completely,
1217 + * which includes removing it from cpuid. If we have support for this in
1218 + * microcode during the first cpuid pass, then we'll disable TSX completely such
1219 + * that user land never has a chance to observe the bit. However, if we are late
1220 + * loading the microcode, then we must use the functionality to cause
1221 + * transactions to automatically abort. This is necessary for user land's sake.
1222 + * Once a program sees a cpuid bit, it must not be taken away.
1223 + *
1224 + * We track whether or not we should do this based on what cpuid pass we're in.
1225 + * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1226 + * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1227 + * should happen twice. Once in the normal cpuid_pass1() code and then a second
1228 + * time after we do the initial microcode update. As a result we need to be
1229 + * careful in cpuid_apply_tsx() to only use the MSR if we've loaded a suitable
1230 + * microcode on the current CPU (which happens prior to cpuid_pass_ucode()).
1231 + *
1232 + * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1233 + * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1234 + * unfortunate feature in a number of ways, and taking the opportunity to
1235 + * finally be able to turn it off is likely to be of benefit in the future.
1236 + *
1189 1237 * SUMMARY
1190 1238 *
1191 1239 * The following table attempts to summarize the mitigations for various issues
1192 1240 * and what's done in various places:
1193 1241 *
1194 1242 * - Spectre v1: Not currently mitigated
1195 1243 * - swapgs: lfences after swapgs paths
1196 1244 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1197 1245 * - Meltdown: Kernel Page Table Isolation
1198 1246 * - Spectre v3a: Updated CPU microcode
1199 1247 * - Spectre v4: Not currently mitigated
1200 1248 * - SpectreRSB: SMEP and RSB Stuffing
1201 1249 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1202 - * - MDS: x86_md_clear, requires microcode, disabling hyper threading
1250 + * - MDS: x86_md_clear, requires microcode, disabling SMT
1251 + * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1203 1252 *
1204 1253 * The following table indicates the x86 feature set bits that indicate that a
1205 1254 * given problem has been solved or a notable feature is present:
1206 1255 *
1207 1256 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1208 1257 * - MDS_NO: All forms of MDS
1258 + * - TAA_NO: TAA
1209 1259 */
1210 1260
1211 1261 #include <sys/types.h>
1212 1262 #include <sys/archsystm.h>
1213 1263 #include <sys/x86_archext.h>
1214 1264 #include <sys/kmem.h>
1215 1265 #include <sys/systm.h>
1216 1266 #include <sys/cmn_err.h>
1217 1267 #include <sys/sunddi.h>
1218 1268 #include <sys/sunndi.h>
1219 1269 #include <sys/cpuvar.h>
1220 1270 #include <sys/processor.h>
1221 1271 #include <sys/sysmacros.h>
1222 1272 #include <sys/pg.h>
1223 1273 #include <sys/fp.h>
1224 1274 #include <sys/controlregs.h>
1225 1275 #include <sys/bitmap.h>
1226 1276 #include <sys/auxv_386.h>
1227 1277 #include <sys/memnode.h>
1228 1278 #include <sys/pci_cfgspace.h>
1229 1279 #include <sys/comm_page.h>
1230 1280 #include <sys/mach_mmu.h>
1231 1281 #include <sys/ucode.h>
1232 1282 #include <sys/tsc.h>
1233 1283 #include <sys/kobj.h>
1234 1284 #include <sys/asm_misc.h>
1235 1285
1236 1286 #ifdef __xpv
1237 1287 #include <sys/hypervisor.h>
1238 1288 #else
1239 1289 #include <sys/ontrap.h>
1240 1290 #endif
1241 1291
1242 1292 uint_t x86_vendor = X86_VENDOR_IntelClone;
1243 1293 uint_t x86_type = X86_TYPE_OTHER;
1244 1294 uint_t x86_clflush_size = 0;
1245 1295
1246 1296 #if defined(__xpv)
1247 1297 int x86_use_pcid = 0;
1248 1298 int x86_use_invpcid = 0;
1249 1299 #else
1250 1300 int x86_use_pcid = -1;
1251 1301 int x86_use_invpcid = -1;
1252 1302 #endif
1253 1303
1254 1304 typedef enum {
↓ open down ↓ |
36 lines elided |
↑ open up ↑ |
1255 1305 X86_SPECTREV2_RETPOLINE,
1256 1306 X86_SPECTREV2_RETPOLINE_AMD,
1257 1307 X86_SPECTREV2_ENHANCED_IBRS,
1258 1308 X86_SPECTREV2_DISABLED
1259 1309 } x86_spectrev2_mitigation_t;
1260 1310
1261 1311 uint_t x86_disable_spectrev2 = 0;
1262 1312 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1263 1313 X86_SPECTREV2_RETPOLINE;
1264 1314
1315 +/*
1316 + * The mitigation status for TAA:
1317 + * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1318 + * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1319 + * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1320 + * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1321 + * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1322 + * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1323 + */
1324 +typedef enum {
1325 + X86_TAA_NOTHING,
1326 + X86_TAA_DISABLED,
1327 + X86_TAA_MD_CLEAR,
1328 + X86_TAA_TSX_FORCE_ABORT,
1329 + X86_TAA_TSX_DISABLE,
1330 + X86_TAA_HW_MITIGATED
1331 +} x86_taa_mitigation_t;
1332 +
1333 +uint_t x86_disable_taa = 0;
1334 +static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1335 +
1265 1336 uint_t pentiumpro_bug4046376;
1266 1337
1267 1338 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1268 1339
1269 1340 static char *x86_feature_names[NUM_X86_FEATURES] = {
1270 1341 "lgpg",
1271 1342 "tsc",
1272 1343 "msr",
1273 1344 "mtrr",
1274 1345 "pge",
1275 1346 "de",
1276 1347 "cmov",
1277 1348 "mmx",
1278 1349 "mca",
1279 1350 "pae",
1280 1351 "cv8",
1281 1352 "pat",
1282 1353 "sep",
1283 1354 "sse",
1284 1355 "sse2",
1285 1356 "htt",
1286 1357 "asysc",
1287 1358 "nx",
1288 1359 "sse3",
1289 1360 "cx16",
1290 1361 "cmp",
1291 1362 "tscp",
1292 1363 "mwait",
1293 1364 "sse4a",
1294 1365 "cpuid",
1295 1366 "ssse3",
1296 1367 "sse4_1",
1297 1368 "sse4_2",
1298 1369 "1gpg",
1299 1370 "clfsh",
1300 1371 "64",
1301 1372 "aes",
1302 1373 "pclmulqdq",
1303 1374 "xsave",
1304 1375 "avx",
1305 1376 "vmx",
1306 1377 "svm",
1307 1378 "topoext",
1308 1379 "f16c",
1309 1380 "rdrand",
1310 1381 "x2apic",
1311 1382 "avx2",
1312 1383 "bmi1",
1313 1384 "bmi2",
1314 1385 "fma",
1315 1386 "smep",
1316 1387 "smap",
1317 1388 "adx",
1318 1389 "rdseed",
1319 1390 "mpx",
1320 1391 "avx512f",
1321 1392 "avx512dq",
1322 1393 "avx512pf",
1323 1394 "avx512er",
1324 1395 "avx512cd",
1325 1396 "avx512bw",
1326 1397 "avx512vl",
1327 1398 "avx512fma",
1328 1399 "avx512vbmi",
1329 1400 "avx512_vpopcntdq",
1330 1401 "avx512_4vnniw",
1331 1402 "avx512_4fmaps",
1332 1403 "xsaveopt",
1333 1404 "xsavec",
1334 1405 "xsaves",
1335 1406 "sha",
1336 1407 "umip",
1337 1408 "pku",
1338 1409 "ospke",
1339 1410 "pcid",
1340 1411 "invpcid",
1341 1412 "ibrs",
1342 1413 "ibpb",
1343 1414 "stibp",
1344 1415 "ssbd",
1345 1416 "ssbd_virt",
1346 1417 "rdcl_no",
1347 1418 "ibrs_all",
1348 1419 "rsba",
1349 1420 "ssb_no",
1350 1421 "stibp_all",
1351 1422 "flush_cmd",
1352 1423 "l1d_vmentry_no",
1353 1424 "fsgsbase",
1354 1425 "clflushopt",
1355 1426 "clwb",
↓ open down ↓ |
81 lines elided |
↑ open up ↑ |
1356 1427 "monitorx",
1357 1428 "clzero",
1358 1429 "xop",
1359 1430 "fma4",
1360 1431 "tbm",
1361 1432 "avx512_vnni",
1362 1433 "amd_pcec",
1363 1434 "mb_clear",
1364 1435 "mds_no",
1365 1436 "core_thermal",
1366 - "pkg_thermal"
1437 + "pkg_thermal",
1438 + "tsx_ctrl",
1439 + "taa_no"
1367 1440 };
1368 1441
1369 1442 boolean_t
1370 1443 is_x86_feature(void *featureset, uint_t feature)
1371 1444 {
1372 1445 ASSERT(feature < NUM_X86_FEATURES);
1373 1446 return (BT_TEST((ulong_t *)featureset, feature));
1374 1447 }
1375 1448
1376 1449 void
1377 1450 add_x86_feature(void *featureset, uint_t feature)
1378 1451 {
1379 1452 ASSERT(feature < NUM_X86_FEATURES);
1380 1453 BT_SET((ulong_t *)featureset, feature);
1381 1454 }
1382 1455
1383 1456 void
1384 1457 remove_x86_feature(void *featureset, uint_t feature)
1385 1458 {
1386 1459 ASSERT(feature < NUM_X86_FEATURES);
1387 1460 BT_CLEAR((ulong_t *)featureset, feature);
1388 1461 }
1389 1462
1390 1463 boolean_t
1391 1464 compare_x86_featureset(void *setA, void *setB)
1392 1465 {
1393 1466 /*
1394 1467 * We assume that the unused bits of the bitmap are always zero.
1395 1468 */
1396 1469 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1397 1470 return (B_TRUE);
1398 1471 } else {
1399 1472 return (B_FALSE);
1400 1473 }
1401 1474 }
1402 1475
1403 1476 void
1404 1477 print_x86_featureset(void *featureset)
1405 1478 {
1406 1479 uint_t i;
1407 1480
1408 1481 for (i = 0; i < NUM_X86_FEATURES; i++) {
1409 1482 if (is_x86_feature(featureset, i)) {
1410 1483 cmn_err(CE_CONT, "?x86_feature: %s\n",
1411 1484 x86_feature_names[i]);
1412 1485 }
1413 1486 }
1414 1487 }
1415 1488
1416 1489 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1417 1490 static size_t xsave_state_size = 0;
1418 1491 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1419 1492 boolean_t xsave_force_disable = B_FALSE;
1420 1493 extern int disable_smap;
1421 1494
1422 1495 /*
1423 1496 * This is set to platform type we are running on.
1424 1497 */
1425 1498 static int platform_type = -1;
1426 1499
1427 1500 #if !defined(__xpv)
1428 1501 /*
1429 1502 * Variable to patch if hypervisor platform detection needs to be
1430 1503 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1431 1504 */
1432 1505 int enable_platform_detection = 1;
1433 1506 #endif
1434 1507
1435 1508 /*
1436 1509 * monitor/mwait info.
1437 1510 *
1438 1511 * size_actual and buf_actual are the real address and size allocated to get
1439 1512 * proper mwait_buf alignment. buf_actual and size_actual should be passed
1440 1513 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use
1441 1514 * processor cache-line alignment, but this is not guaranteed in the future.
1442 1515 */
1443 1516 struct mwait_info {
1444 1517 size_t mon_min; /* min size to avoid missed wakeups */
1445 1518 size_t mon_max; /* size to avoid false wakeups */
1446 1519 size_t size_actual; /* size actually allocated */
1447 1520 void *buf_actual; /* memory actually allocated */
1448 1521 uint32_t support; /* processor support of monitor/mwait */
1449 1522 };
1450 1523
1451 1524 /*
1452 1525 * xsave/xrstor info.
1453 1526 *
1454 1527 * This structure contains HW feature bits and the size of the xsave save area.
1455 1528 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1456 1529 * (xsave_state) to describe the xsave layout. However, at runtime the
1457 1530 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1458 1531 * xsave_state structure simply represents the legacy layout of the beginning
1459 1532 * of the xsave area.
1460 1533 */
1461 1534 struct xsave_info {
1462 1535 uint32_t xsav_hw_features_low; /* Supported HW features */
1463 1536 uint32_t xsav_hw_features_high; /* Supported HW features */
1464 1537 size_t xsav_max_size; /* max size save area for HW features */
1465 1538 size_t ymm_size; /* AVX: size of ymm save area */
1466 1539 size_t ymm_offset; /* AVX: offset for ymm save area */
1467 1540 size_t bndregs_size; /* MPX: size of bndregs save area */
1468 1541 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1469 1542 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1470 1543 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1471 1544 size_t opmask_size; /* AVX512: size of opmask save */
1472 1545 size_t opmask_offset; /* AVX512: offset for opmask save */
1473 1546 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1474 1547 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1475 1548 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1476 1549 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1477 1550 };
1478 1551
1479 1552
1480 1553 /*
1481 1554 * These constants determine how many of the elements of the
1482 1555 * cpuid we cache in the cpuid_info data structure; the
1483 1556 * remaining elements are accessible via the cpuid instruction.
1484 1557 */
1485 1558
1486 1559 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1487 1560 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */
1488 1561
1489 1562 /*
1490 1563 * See the big theory statement for a more detailed explanation of what some of
1491 1564 * these members mean.
1492 1565 */
1493 1566 struct cpuid_info {
1494 1567 uint_t cpi_pass; /* last pass completed */
1495 1568 /*
1496 1569 * standard function information
1497 1570 */
1498 1571 uint_t cpi_maxeax; /* fn 0: %eax */
1499 1572 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1500 1573 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1501 1574
1502 1575 uint_t cpi_family; /* fn 1: extended family */
1503 1576 uint_t cpi_model; /* fn 1: extended model */
1504 1577 uint_t cpi_step; /* fn 1: stepping */
1505 1578 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1506 1579 /* AMD: package/socket # */
1507 1580 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1508 1581 int cpi_clogid; /* fn 1: %ebx: thread # */
1509 1582 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1510 1583 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1511 1584 uint_t cpi_ncache; /* fn 2: number of elements */
1512 1585 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1513 1586 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1514 1587 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1515 1588 /* Intel fn: 4, AMD fn: 8000001d */
1516 1589 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
1517 1590 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1518 1591 /*
1519 1592 * extended function information
1520 1593 */
1521 1594 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1522 1595 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1523 1596 uint8_t cpi_pabits; /* fn 0x80000008: %eax */
1524 1597 uint8_t cpi_vabits; /* fn 0x80000008: %eax */
1525 1598 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1526 1599 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1527 1600
1528 1601 id_t cpi_coreid; /* same coreid => strands share core */
1529 1602 int cpi_pkgcoreid; /* core number within single package */
1530 1603 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1531 1604 /* Intel: fn 4: %eax[31-26] */
1532 1605
1533 1606 /*
1534 1607 * These values represent the number of bits that are required to store
1535 1608 * information about the number of cores and threads.
1536 1609 */
1537 1610 uint_t cpi_ncore_bits;
1538 1611 uint_t cpi_nthread_bits;
1539 1612 /*
1540 1613 * supported feature information
1541 1614 */
1542 1615 uint32_t cpi_support[6];
1543 1616 #define STD_EDX_FEATURES 0
1544 1617 #define AMD_EDX_FEATURES 1
1545 1618 #define TM_EDX_FEATURES 2
1546 1619 #define STD_ECX_FEATURES 3
1547 1620 #define AMD_ECX_FEATURES 4
1548 1621 #define STD_EBX_FEATURES 5
1549 1622 /*
1550 1623 * Synthesized information, where known.
1551 1624 */
1552 1625 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1553 1626 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1554 1627 uint32_t cpi_socket; /* Chip package/socket type */
1555 1628
1556 1629 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1557 1630 uint32_t cpi_apicid;
1558 1631 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1559 1632 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1560 1633 /* Intel: 1 */
1561 1634 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1562 1635 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1563 1636
1564 1637 struct xsave_info cpi_xsave; /* fn D: xsave/xrstor info */
1565 1638 };
1566 1639
1567 1640
1568 1641 static struct cpuid_info cpuid_info0;
1569 1642
1570 1643 /*
1571 1644 * These bit fields are defined by the Intel Application Note AP-485
1572 1645 * "Intel Processor Identification and the CPUID Instruction"
1573 1646 */
1574 1647 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1575 1648 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1576 1649 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1577 1650 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1578 1651 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1579 1652 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1580 1653
1581 1654 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1582 1655 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1583 1656 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1584 1657 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1585 1658 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1586 1659 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1587 1660 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1588 1661
1589 1662 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1590 1663 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1591 1664 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1592 1665 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1593 1666
1594 1667 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1595 1668 #define CPI_XMAXEAX_MAX 0x80000100
1596 1669 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1597 1670 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1598 1671
1599 1672 /*
1600 1673 * Function 4 (Deterministic Cache Parameters) macros
1601 1674 * Defined by Intel Application Note AP-485
1602 1675 */
1603 1676 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1604 1677 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1605 1678 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1606 1679 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1607 1680 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1608 1681 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1609 1682 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1610 1683
1611 1684 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1612 1685 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1613 1686 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1614 1687
1615 1688 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1616 1689
1617 1690 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
1618 1691
1619 1692
1620 1693 /*
1621 1694 * A couple of shorthand macros to identify "later" P6-family chips
1622 1695 * like the Pentium M and Core. First, the "older" P6-based stuff
1623 1696 * (loosely defined as "pre-Pentium-4"):
1624 1697 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1625 1698 */
1626 1699 #define IS_LEGACY_P6(cpi) ( \
1627 1700 cpi->cpi_family == 6 && \
1628 1701 (cpi->cpi_model == 1 || \
1629 1702 cpi->cpi_model == 3 || \
1630 1703 cpi->cpi_model == 5 || \
1631 1704 cpi->cpi_model == 6 || \
1632 1705 cpi->cpi_model == 7 || \
1633 1706 cpi->cpi_model == 8 || \
1634 1707 cpi->cpi_model == 0xA || \
1635 1708 cpi->cpi_model == 0xB) \
1636 1709 )
1637 1710
1638 1711 /* A "new F6" is everything with family 6 that's not the above */
1639 1712 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1640 1713
1641 1714 /* Extended family/model support */
1642 1715 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1643 1716 cpi->cpi_family >= 0xf)
1644 1717
1645 1718 /*
1646 1719 * Info for monitor/mwait idle loop.
1647 1720 *
1648 1721 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1649 1722 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1650 1723 * 2006.
1651 1724 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1652 1725 * Documentation Updates" #33633, Rev 2.05, December 2006.
1653 1726 */
1654 1727 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
1655 1728 #define MWAIT_EXTENSIONS (0x00000002) /* extension supported */
1656 1729 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
1657 1730 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1658 1731 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
1659 1732 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
1660 1733 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1661 1734 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1662 1735 /*
1663 1736 * Number of sub-cstates for a given c-state.
1664 1737 */
1665 1738 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
1666 1739 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1667 1740
1668 1741 /*
1669 1742 * XSAVE leaf 0xD enumeration
1670 1743 */
1671 1744 #define CPUID_LEAFD_2_YMM_OFFSET 576
1672 1745 #define CPUID_LEAFD_2_YMM_SIZE 256
1673 1746
1674 1747 /*
1675 1748 * Common extended leaf names to cut down on typos.
1676 1749 */
1677 1750 #define CPUID_LEAF_EXT_0 0x80000000
1678 1751 #define CPUID_LEAF_EXT_8 0x80000008
1679 1752 #define CPUID_LEAF_EXT_1d 0x8000001d
1680 1753 #define CPUID_LEAF_EXT_1e 0x8000001e
1681 1754
1682 1755 /*
1683 1756 * Functions we consume from cpuid_subr.c; don't publish these in a header
1684 1757 * file to try and keep people using the expected cpuid_* interfaces.
1685 1758 */
1686 1759 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1687 1760 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1688 1761 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1689 1762 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1690 1763 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1691 1764
1692 1765 /*
1693 1766 * Apply various platform-dependent restrictions where the
1694 1767 * underlying platform restrictions mean the CPU can be marked
1695 1768 * as less capable than its cpuid instruction would imply.
1696 1769 */
1697 1770 #if defined(__xpv)
1698 1771 static void
1699 1772 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1700 1773 {
1701 1774 switch (eax) {
1702 1775 case 1: {
1703 1776 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1704 1777 0 : CPUID_INTC_EDX_MCA;
1705 1778 cp->cp_edx &=
1706 1779 ~(mcamask |
1707 1780 CPUID_INTC_EDX_PSE |
1708 1781 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1709 1782 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1710 1783 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1711 1784 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1712 1785 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1713 1786 break;
1714 1787 }
1715 1788
1716 1789 case 0x80000001:
1717 1790 cp->cp_edx &=
1718 1791 ~(CPUID_AMD_EDX_PSE |
1719 1792 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1720 1793 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1721 1794 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1722 1795 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1723 1796 CPUID_AMD_EDX_TSCP);
1724 1797 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1725 1798 break;
1726 1799 default:
1727 1800 break;
1728 1801 }
1729 1802
1730 1803 switch (vendor) {
1731 1804 case X86_VENDOR_Intel:
1732 1805 switch (eax) {
1733 1806 case 4:
1734 1807 /*
1735 1808 * Zero out the (ncores-per-chip - 1) field
1736 1809 */
1737 1810 cp->cp_eax &= 0x03fffffff;
1738 1811 break;
1739 1812 default:
1740 1813 break;
1741 1814 }
1742 1815 break;
1743 1816 case X86_VENDOR_AMD:
1744 1817 switch (eax) {
1745 1818
1746 1819 case 0x80000001:
1747 1820 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1748 1821 break;
1749 1822
1750 1823 case CPUID_LEAF_EXT_8:
1751 1824 /*
1752 1825 * Zero out the (ncores-per-chip - 1) field
1753 1826 */
1754 1827 cp->cp_ecx &= 0xffffff00;
1755 1828 break;
1756 1829 default:
1757 1830 break;
1758 1831 }
1759 1832 break;
1760 1833 default:
1761 1834 break;
1762 1835 }
1763 1836 }
1764 1837 #else
1765 1838 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
1766 1839 #endif
1767 1840
1768 1841 /*
1769 1842 * Some undocumented ways of patching the results of the cpuid
1770 1843 * instruction to permit running Solaris 10 on future cpus that
1771 1844 * we don't currently support. Could be set to non-zero values
1772 1845 * via settings in eeprom.
1773 1846 */
1774 1847
1775 1848 uint32_t cpuid_feature_ecx_include;
1776 1849 uint32_t cpuid_feature_ecx_exclude;
1777 1850 uint32_t cpuid_feature_edx_include;
1778 1851 uint32_t cpuid_feature_edx_exclude;
1779 1852
1780 1853 /*
1781 1854 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1782 1855 */
1783 1856 void
1784 1857 cpuid_alloc_space(cpu_t *cpu)
1785 1858 {
1786 1859 /*
1787 1860 * By convention, cpu0 is the boot cpu, which is set up
1788 1861 * before memory allocation is available. All other cpus get
1789 1862 * their cpuid_info struct allocated here.
1790 1863 */
1791 1864 ASSERT(cpu->cpu_id != 0);
1792 1865 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1793 1866 cpu->cpu_m.mcpu_cpi =
1794 1867 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1795 1868 }
1796 1869
1797 1870 void
1798 1871 cpuid_free_space(cpu_t *cpu)
1799 1872 {
1800 1873 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1801 1874 int i;
1802 1875
1803 1876 ASSERT(cpi != NULL);
1804 1877 ASSERT(cpi != &cpuid_info0);
1805 1878
1806 1879 /*
1807 1880 * Free up any cache leaf related dynamic storage. The first entry was
1808 1881 * cached from the standard cpuid storage, so we should not free it.
1809 1882 */
1810 1883 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1811 1884 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1812 1885 if (cpi->cpi_cache_leaf_size > 0)
1813 1886 kmem_free(cpi->cpi_cache_leaves,
1814 1887 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1815 1888
1816 1889 kmem_free(cpi, sizeof (*cpi));
1817 1890 cpu->cpu_m.mcpu_cpi = NULL;
1818 1891 }
1819 1892
1820 1893 #if !defined(__xpv)
1821 1894 /*
1822 1895 * Determine the type of the underlying platform. This is used to customize
1823 1896 * initialization of various subsystems (e.g. TSC). determine_platform() must
1824 1897 * only ever be called once to prevent two processors from seeing different
1825 1898 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1826 1899 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1827 1900 */
1828 1901 void
1829 1902 determine_platform(void)
1830 1903 {
1831 1904 struct cpuid_regs cp;
1832 1905 uint32_t base;
1833 1906 uint32_t regs[4];
1834 1907 char *hvstr = (char *)regs;
1835 1908
1836 1909 ASSERT(platform_type == -1);
1837 1910
1838 1911 platform_type = HW_NATIVE;
1839 1912
1840 1913 if (!enable_platform_detection)
1841 1914 return;
1842 1915
1843 1916 /*
1844 1917 * If Hypervisor CPUID bit is set, try to determine hypervisor
1845 1918 * vendor signature, and set platform type accordingly.
1846 1919 *
1847 1920 * References:
1848 1921 * http://lkml.org/lkml/2008/10/1/246
1849 1922 * http://kb.vmware.com/kb/1009458
1850 1923 */
1851 1924 cp.cp_eax = 0x1;
1852 1925 (void) __cpuid_insn(&cp);
1853 1926 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1854 1927 cp.cp_eax = 0x40000000;
1855 1928 (void) __cpuid_insn(&cp);
1856 1929 regs[0] = cp.cp_ebx;
1857 1930 regs[1] = cp.cp_ecx;
1858 1931 regs[2] = cp.cp_edx;
1859 1932 regs[3] = 0;
1860 1933 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1861 1934 platform_type = HW_XEN_HVM;
1862 1935 return;
1863 1936 }
1864 1937 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1865 1938 platform_type = HW_VMWARE;
1866 1939 return;
1867 1940 }
1868 1941 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1869 1942 platform_type = HW_KVM;
1870 1943 return;
1871 1944 }
1872 1945 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1873 1946 platform_type = HW_BHYVE;
1874 1947 return;
1875 1948 }
1876 1949 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1877 1950 platform_type = HW_MICROSOFT;
1878 1951 } else {
1879 1952 /*
1880 1953 * Check older VMware hardware versions. VMware hypervisor is
1881 1954 * detected by performing an IN operation to VMware hypervisor
1882 1955 * port and checking that value returned in %ebx is VMware
1883 1956 * hypervisor magic value.
1884 1957 *
1885 1958 * References: http://kb.vmware.com/kb/1009458
1886 1959 */
1887 1960 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1888 1961 if (regs[1] == VMWARE_HVMAGIC) {
1889 1962 platform_type = HW_VMWARE;
1890 1963 return;
1891 1964 }
1892 1965 }
1893 1966
1894 1967 /*
1895 1968 * Check Xen hypervisor. In a fully virtualized domain,
1896 1969 * Xen's pseudo-cpuid function returns a string representing the
1897 1970 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1898 1971 * supported cpuid function. We need at least a (base + 2) leaf value
1899 1972 * to do what we want to do. Try different base values, since the
1900 1973 * hypervisor might use a different one depending on whether Hyper-V
1901 1974 * emulation is switched on by default or not.
1902 1975 */
1903 1976 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1904 1977 cp.cp_eax = base;
1905 1978 (void) __cpuid_insn(&cp);
1906 1979 regs[0] = cp.cp_ebx;
1907 1980 regs[1] = cp.cp_ecx;
1908 1981 regs[2] = cp.cp_edx;
1909 1982 regs[3] = 0;
1910 1983 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1911 1984 cp.cp_eax >= (base + 2)) {
1912 1985 platform_type &= ~HW_NATIVE;
1913 1986 platform_type |= HW_XEN_HVM;
1914 1987 return;
1915 1988 }
1916 1989 }
1917 1990 }
1918 1991
/*
 * Return the platform type recorded in platform_type. determine_platform()
 * must already have run; platform_type is asserted not to be -1.
 */
int
get_hwenv(void)
{
	ASSERT(platform_type != -1);
	return (platform_type);
}
1925 1998
/*
 * Non-__xpv build: always reports 0 (not a control domain).
 */
int
is_controldom(void)
{
	return (0);
}
1931 2004
1932 2005 #else
1933 2006
/*
 * __xpv build: the platform type is unconditionally HW_XEN_PV.
 */
int
get_hwenv(void)
{
	return (HW_XEN_PV);
}
1939 2012
/*
 * __xpv build: report the result of DOMAIN_IS_INITDOMAIN() on xen_info
 * (presumably non-zero when running as the initial/control domain).
 */
int
is_controldom(void)
{
	return (DOMAIN_IS_INITDOMAIN(xen_info));
}
1945 2018
1946 2019 #endif /* __xpv */
1947 2020
1948 2021 /*
1949 2022 * Make sure that we have gathered all of the CPUID leaves that we might need to
1950 2023 * determine topology. We assume that the standard leaf 1 has already been done
1951 2024 * and that xmaxeax has already been calculated.
1952 2025 */
1953 2026 static void
1954 2027 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1955 2028 {
1956 2029 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1957 2030
1958 2031 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1959 2032 struct cpuid_regs *cp;
1960 2033
1961 2034 cp = &cpi->cpi_extd[8];
1962 2035 cp->cp_eax = CPUID_LEAF_EXT_8;
1963 2036 (void) __cpuid_insn(cp);
1964 2037 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1965 2038 }
1966 2039
1967 2040 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1968 2041 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1969 2042 struct cpuid_regs *cp;
1970 2043
1971 2044 cp = &cpi->cpi_extd[0x1e];
1972 2045 cp->cp_eax = CPUID_LEAF_EXT_1e;
1973 2046 (void) __cpuid_insn(cp);
1974 2047 }
1975 2048 }
1976 2049
1977 2050 /*
1978 2051 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1979 2052 * it to everything else. If not, and we're on an AMD system where 8000001e is
1980 2053 * valid, then we use that. Othewrise, we fall back to the default value for the
1981 2054 * APIC ID in leaf 1.
1982 2055 */
1983 2056 static uint32_t
1984 2057 cpuid_gather_apicid(struct cpuid_info *cpi)
1985 2058 {
1986 2059 /*
1987 2060 * Leaf B changes based on the arguments to it. Beacuse we don't cache
1988 2061 * it, we need to gather it again.
1989 2062 */
1990 2063 if (cpi->cpi_maxeax >= 0xB) {
1991 2064 struct cpuid_regs regs;
1992 2065 struct cpuid_regs *cp;
1993 2066
1994 2067 cp = ®s;
1995 2068 cp->cp_eax = 0xB;
1996 2069 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1997 2070 (void) __cpuid_insn(cp);
1998 2071
1999 2072 if (cp->cp_ebx != 0) {
2000 2073 return (cp->cp_edx);
2001 2074 }
2002 2075 }
2003 2076
2004 2077 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2005 2078 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2006 2079 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2007 2080 return (cpi->cpi_extd[0x1e].cp_eax);
2008 2081 }
2009 2082
2010 2083 return (CPI_APIC_ID(cpi));
2011 2084 }
2012 2085
2013 2086 /*
2014 2087 * For AMD processors, attempt to calculate the number of chips and cores that
2015 2088 * exist. The way that we do this varies based on the generation, because the
2016 2089 * generations themselves have changed dramatically.
2017 2090 *
2018 2091 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2019 2092 * However, with the advent of family 17h (Zen) it actually tells us the number
2020 2093 * of threads, so we need to look at leaf 0x8000001e if available to determine
2021 2094 * its value. Otherwise, for all prior families, the number of enabled cores is
2022 2095 * the same as threads.
2023 2096 *
2024 2097 * If we do not have leaf 0x80000008, then we assume that this processor does
2025 2098 * not have anything. AMD's older CPUID specification says there's no reason to
2026 2099 * fall back to leaf 1.
2027 2100 *
2028 2101 * In some virtualization cases we will not have leaf 8000001e or it will be
2029 2102 * zero. When that happens we assume the number of threads is one.
2030 2103 */
2031 2104 static void
2032 2105 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2033 2106 {
2034 2107 uint_t nthreads, nthread_per_core;
2035 2108
2036 2109 nthreads = nthread_per_core = 1;
2037 2110
2038 2111 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2039 2112 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2040 2113 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2041 2114 nthreads = CPI_CPU_COUNT(cpi);
2042 2115 }
2043 2116
2044 2117 /*
2045 2118 * For us to have threads, and know about it, we have to be at least at
2046 2119 * family 17h and have the cpuid bit that says we have extended
2047 2120 * topology.
2048 2121 */
2049 2122 if (cpi->cpi_family >= 0x17 &&
2050 2123 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2051 2124 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2052 2125 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2053 2126 }
2054 2127
2055 2128 *ncpus = nthreads;
2056 2129 *ncores = nthreads / nthread_per_core;
2057 2130 }
2058 2131
2059 2132 /*
2060 2133 * Seed the initial values for the cores and threads for an Intel based
2061 2134 * processor. These values will be overwritten if we detect that the processor
2062 2135 * supports CPUID leaf 0xb.
2063 2136 */
2064 2137 static void
2065 2138 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2066 2139 {
2067 2140 /*
2068 2141 * Only seed the number of physical cores from the first level leaf 4
2069 2142 * information. The number of threads there indicate how many share the
2070 2143 * L1 cache, which may or may not have anything to do with the number of
2071 2144 * logical CPUs per core.
2072 2145 */
2073 2146 if (cpi->cpi_maxeax >= 4) {
2074 2147 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2075 2148 } else {
2076 2149 *ncores = 1;
2077 2150 }
2078 2151
2079 2152 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2080 2153 *ncpus = CPI_CPU_COUNT(cpi);
2081 2154 } else {
2082 2155 *ncpus = *ncores;
2083 2156 }
2084 2157 }
2085 2158
2086 2159 static boolean_t
2087 2160 cpuid_leafB_getids(cpu_t *cpu)
2088 2161 {
2089 2162 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2090 2163 struct cpuid_regs regs;
2091 2164 struct cpuid_regs *cp;
2092 2165
2093 2166 if (cpi->cpi_maxeax < 0xB)
2094 2167 return (B_FALSE);
2095 2168
2096 2169 cp = ®s;
2097 2170 cp->cp_eax = 0xB;
2098 2171 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2099 2172
2100 2173 (void) __cpuid_insn(cp);
2101 2174
2102 2175 /*
2103 2176 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2104 2177 * indicates that the extended topology enumeration leaf is
2105 2178 * available.
2106 2179 */
2107 2180 if (cp->cp_ebx != 0) {
2108 2181 uint32_t x2apic_id = 0;
2109 2182 uint_t coreid_shift = 0;
2110 2183 uint_t ncpu_per_core = 1;
2111 2184 uint_t chipid_shift = 0;
2112 2185 uint_t ncpu_per_chip = 1;
2113 2186 uint_t i;
2114 2187 uint_t level;
2115 2188
2116 2189 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2117 2190 cp->cp_eax = 0xB;
2118 2191 cp->cp_ecx = i;
2119 2192
2120 2193 (void) __cpuid_insn(cp);
2121 2194 level = CPI_CPU_LEVEL_TYPE(cp);
2122 2195
2123 2196 if (level == 1) {
2124 2197 x2apic_id = cp->cp_edx;
2125 2198 coreid_shift = BITX(cp->cp_eax, 4, 0);
2126 2199 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2127 2200 } else if (level == 2) {
2128 2201 x2apic_id = cp->cp_edx;
2129 2202 chipid_shift = BITX(cp->cp_eax, 4, 0);
2130 2203 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2131 2204 }
2132 2205 }
2133 2206
2134 2207 /*
2135 2208 * cpi_apicid is taken care of in cpuid_gather_apicid.
2136 2209 */
2137 2210 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2138 2211 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2139 2212 ncpu_per_core;
2140 2213 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2141 2214 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2142 2215 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2143 2216 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2144 2217 cpi->cpi_procnodeid = cpi->cpi_chipid;
2145 2218 cpi->cpi_compunitid = cpi->cpi_coreid;
2146 2219
2147 2220 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2148 2221 cpi->cpi_nthread_bits = coreid_shift;
2149 2222 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2150 2223 }
2151 2224
2152 2225 return (B_TRUE);
2153 2226 } else {
2154 2227 return (B_FALSE);
2155 2228 }
2156 2229 }
2157 2230
2158 2231 static void
2159 2232 cpuid_intel_getids(cpu_t *cpu, void *feature)
2160 2233 {
2161 2234 uint_t i;
2162 2235 uint_t chipid_shift = 0;
2163 2236 uint_t coreid_shift = 0;
2164 2237 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2165 2238
2166 2239 /*
2167 2240 * There are no compute units or processor nodes currently on Intel.
2168 2241 * Always set these to one.
2169 2242 */
2170 2243 cpi->cpi_procnodes_per_pkg = 1;
2171 2244 cpi->cpi_cores_per_compunit = 1;
2172 2245
2173 2246 /*
2174 2247 * If cpuid Leaf B is present, use that to try and get this information.
2175 2248 * It will be the most accurate for Intel CPUs.
2176 2249 */
2177 2250 if (cpuid_leafB_getids(cpu))
2178 2251 return;
2179 2252
2180 2253 /*
2181 2254 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2182 2255 * and ncore_per_chip. These represent the largest power of two values
2183 2256 * that we need to cover all of the IDs in the system. Therefore, we use
2184 2257 * those values to seed the number of bits needed to cover information
2185 2258 * in the case when leaf B is not available. These values will probably
2186 2259 * be larger than required, but that's OK.
2187 2260 */
2188 2261 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2189 2262 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2190 2263
2191 2264 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2192 2265 chipid_shift++;
2193 2266
2194 2267 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2195 2268 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2196 2269
2197 2270 if (is_x86_feature(feature, X86FSET_CMP)) {
2198 2271 /*
2199 2272 * Multi-core (and possibly multi-threaded)
2200 2273 * processors.
2201 2274 */
2202 2275 uint_t ncpu_per_core;
2203 2276 if (cpi->cpi_ncore_per_chip == 1)
2204 2277 ncpu_per_core = cpi->cpi_ncpu_per_chip;
2205 2278 else if (cpi->cpi_ncore_per_chip > 1)
2206 2279 ncpu_per_core = cpi->cpi_ncpu_per_chip /
2207 2280 cpi->cpi_ncore_per_chip;
2208 2281 /*
2209 2282 * 8bit APIC IDs on dual core Pentiums
2210 2283 * look like this:
2211 2284 *
2212 2285 * +-----------------------+------+------+
2213 2286 * | Physical Package ID | MC | HT |
2214 2287 * +-----------------------+------+------+
2215 2288 * <------- chipid -------->
2216 2289 * <------- coreid --------------->
2217 2290 * <--- clogid -->
2218 2291 * <------>
2219 2292 * pkgcoreid
2220 2293 *
2221 2294 * Where the number of bits necessary to
2222 2295 * represent MC and HT fields together equals
2223 2296 * to the minimum number of bits necessary to
2224 2297 * store the value of cpi->cpi_ncpu_per_chip.
2225 2298 * Of those bits, the MC part uses the number
2226 2299 * of bits necessary to store the value of
2227 2300 * cpi->cpi_ncore_per_chip.
2228 2301 */
2229 2302 for (i = 1; i < ncpu_per_core; i <<= 1)
2230 2303 coreid_shift++;
2231 2304 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2232 2305 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2233 2306 } else if (is_x86_feature(feature, X86FSET_HTT)) {
2234 2307 /*
2235 2308 * Single-core multi-threaded processors.
2236 2309 */
2237 2310 cpi->cpi_coreid = cpi->cpi_chipid;
2238 2311 cpi->cpi_pkgcoreid = 0;
2239 2312 } else {
2240 2313 /*
2241 2314 * Single-core single-thread processors.
2242 2315 */
2243 2316 cpi->cpi_coreid = cpu->cpu_id;
2244 2317 cpi->cpi_pkgcoreid = 0;
2245 2318 }
2246 2319 cpi->cpi_procnodeid = cpi->cpi_chipid;
2247 2320 cpi->cpi_compunitid = cpi->cpi_coreid;
2248 2321 }
2249 2322
2250 2323 /*
2251 2324 * Historically, AMD has had CMP chips with only a single thread per core.
2252 2325 * However, starting in family 17h (Zen), this has changed and they now have
2253 2326 * multiple threads. Our internal core id needs to be a unique value.
2254 2327 *
2255 2328 * To determine the core id of an AMD system, if we're from a family before 17h,
2256 2329 * then we just use the cpu id, as that gives us a good value that will be
2257 2330 * unique for each core. If instead, we're on family 17h or later, then we need
2258 2331 * to do something more complicated. CPUID leaf 0x8000001e can tell us
2259 2332 * how many threads are in the system. Based on that, we'll shift the APIC ID.
2260 2333 * We can't use the normal core id in that leaf as it's only unique within the
2261 2334 * socket, which is perfect for cpi_pkgcoreid, but not us.
2262 2335 */
2263 2336 static id_t
2264 2337 cpuid_amd_get_coreid(cpu_t *cpu)
2265 2338 {
2266 2339 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2267 2340
2268 2341 if (cpi->cpi_family >= 0x17 &&
2269 2342 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2270 2343 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2271 2344 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2272 2345 if (nthreads > 1) {
2273 2346 VERIFY3U(nthreads, ==, 2);
2274 2347 return (cpi->cpi_apicid >> 1);
2275 2348 }
2276 2349 }
2277 2350
2278 2351 return (cpu->cpu_id);
2279 2352 }
2280 2353
2281 2354 /*
2282 2355 * IDs on AMD is a more challenging task. This is notable because of the
2283 2356 * following two facts:
2284 2357 *
2285 2358 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
2286 2359 * also no way to get an actual unique core id from the system. As such, we
2287 2360 * synthesize this case by using cpu->cpu_id. This scheme does not,
2288 2361 * however, guarantee that sibling cores of a chip will have sequential
2289 2362 * coreids starting at a multiple of the number of cores per chip - that is
2290 2363 * usually the case, but if the ACPI MADT table is presented in a different
2291 2364 * order then we need to perform a few more gymnastics for the pkgcoreid.
2292 2365 *
2293 2366 * 2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups
2294 2367 * called compute units. These compute units share the L1I cache, L2 cache,
2295 2368 * and the FPU. To deal with this, a new topology leaf was added in
2296 2369 * 0x8000001e. However, parts of this leaf have different meanings
2297 2370 * once we get to family 0x17.
2298 2371 */
2299 2372
2300 2373 static void
2301 2374 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2302 2375 {
2303 2376 int i, first_half, coreidsz;
2304 2377 uint32_t nb_caps_reg;
2305 2378 uint_t node2_1;
2306 2379 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2307 2380 struct cpuid_regs *cp;
2308 2381
2309 2382 /*
2310 2383 * Calculate the core id (this comes from hardware in family 0x17 if it
2311 2384 * hasn't been stripped by virtualization). We always set the compute
2312 2385 * unit id to the same value. Also, initialize the default number of
2313 2386 * cores per compute unit and nodes per package. This will be
2314 2387 * overwritten when we know information about a particular family.
2315 2388 */
2316 2389 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2317 2390 cpi->cpi_compunitid = cpi->cpi_coreid;
2318 2391 cpi->cpi_cores_per_compunit = 1;
2319 2392 cpi->cpi_procnodes_per_pkg = 1;
2320 2393
2321 2394 /*
2322 2395 * To construct the logical ID, we need to determine how many APIC IDs
2323 2396 * are dedicated to the cores and threads. This is provided for us in
2324 2397 * 0x80000008. However, if it's not present (say due to virtualization),
2325 2398 * then we assume it's one. This should be present on all 64-bit AMD
2326 2399 * processors. It was added in family 0xf (Hammer).
2327 2400 */
2328 2401 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2329 2402 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2330 2403
2331 2404 /*
2332 2405 * In AMD parlance chip is really a node while illumos
2333 2406 * uses chip as equivalent to socket/package.
2334 2407 */
2335 2408 if (coreidsz == 0) {
2336 2409 /* Use legacy method */
2337 2410 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2338 2411 coreidsz++;
2339 2412 if (coreidsz == 0)
2340 2413 coreidsz = 1;
2341 2414 }
2342 2415 } else {
2343 2416 /* Assume single-core part */
2344 2417 coreidsz = 1;
2345 2418 }
2346 2419 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2347 2420
2348 2421 /*
2349 2422 * The package core ID varies depending on the family. While it may be
2350 2423 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2351 2424 * this value is the core id in the given node. For non-virtualized
2352 2425 * family 17h, we need to take the logical core id and shift off the
2353 2426 * threads like we do when getting the core id. Otherwise, we can use
2354 2427 * the clogid as is. When family 17h is virtualized, the clogid should
2355 2428 * be sufficient as if we don't have valid data in the leaf, then we
2356 2429 * won't think we have SMT, in which case the cpi_clogid should be
2357 2430 * sufficient.
2358 2431 */
2359 2432 if (cpi->cpi_family >= 0x17 &&
2360 2433 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2361 2434 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2362 2435 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2363 2436 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2364 2437 if (nthreads > 1) {
2365 2438 VERIFY3U(nthreads, ==, 2);
2366 2439 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2367 2440 } else {
2368 2441 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2369 2442 }
2370 2443 } else {
2371 2444 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2372 2445 }
2373 2446
2374 2447 /*
2375 2448 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2376 2449 * (bulldozer) or newer, then we can derive all of this from leaf
2377 2450 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2378 2451 */
2379 2452 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2380 2453 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2381 2454 cp = &cpi->cpi_extd[0x1e];
2382 2455
2383 2456 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2384 2457 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2385 2458
2386 2459 /*
2387 2460 * For Bulldozer-era CPUs, recalculate the compute unit
2388 2461 * information.
2389 2462 */
2390 2463 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2391 2464 cpi->cpi_cores_per_compunit =
2392 2465 BITX(cp->cp_ebx, 15, 8) + 1;
2393 2466 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2394 2467 (cpi->cpi_ncore_per_chip /
2395 2468 cpi->cpi_cores_per_compunit) *
2396 2469 (cpi->cpi_procnodeid /
2397 2470 cpi->cpi_procnodes_per_pkg);
2398 2471 }
2399 2472 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2400 2473 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2401 2474 } else if (cpi->cpi_family == 0x10) {
2402 2475 /*
2403 2476 * See if we are a multi-node processor.
2404 2477 * All processors in the system have the same number of nodes
2405 2478 */
2406 2479 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2407 2480 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2408 2481 /* Single-node */
2409 2482 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2410 2483 coreidsz);
2411 2484 } else {
2412 2485
2413 2486 /*
2414 2487 * Multi-node revision D (2 nodes per package
2415 2488 * are supported)
2416 2489 */
2417 2490 cpi->cpi_procnodes_per_pkg = 2;
2418 2491
2419 2492 first_half = (cpi->cpi_pkgcoreid <=
2420 2493 (cpi->cpi_ncore_per_chip/2 - 1));
2421 2494
2422 2495 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2423 2496 /* We are BSP */
2424 2497 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2425 2498 } else {
2426 2499
2427 2500 /* We are AP */
2428 2501 /* NodeId[2:1] bits to use for reading F3xe8 */
2429 2502 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2430 2503
2431 2504 nb_caps_reg =
2432 2505 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2433 2506
2434 2507 /*
2435 2508 * Check IntNodeNum bit (31:30, but bit 31 is
2436 2509 * always 0 on dual-node processors)
2437 2510 */
2438 2511 if (BITX(nb_caps_reg, 30, 30) == 0)
2439 2512 cpi->cpi_procnodeid = node2_1 +
2440 2513 !first_half;
2441 2514 else
2442 2515 cpi->cpi_procnodeid = node2_1 +
2443 2516 first_half;
2444 2517 }
2445 2518 }
2446 2519 } else {
2447 2520 cpi->cpi_procnodeid = 0;
2448 2521 }
2449 2522
2450 2523 cpi->cpi_chipid =
2451 2524 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2452 2525
2453 2526 cpi->cpi_ncore_bits = coreidsz;
2454 2527 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2455 2528 cpi->cpi_ncore_per_chip);
2456 2529 }
2457 2530
/*
 * Flush routine that does nothing. It is the initial target of the
 * spec_uarch_flush pointer and is installed again whenever no flush is needed
 * or none is available.
 */
static void
spec_uarch_flush_noop(void)
{
}
2462 2535
/*
 * Flush routine that writes IA32_FLUSH_CMD_L1D to MSR_IA32_FLUSH_CMD.
 *
 * When microcode is present that mitigates MDS, this wrmsr will also flush the
 * MDS-related micro-architectural state that would normally happen by calling
 * x86_md_clear().
 */
static void
spec_uarch_flush_msr(void)
{
	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
}
2473 2546
/*
 * This function pointer refers to a routine that will flush certain
 * micro-architectural state on the processor. This flush is used to mitigate
 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. It
 * can point to one of three functions:
 *
 * - A noop (spec_uarch_flush_noop), which is used because we either are
 *   vulnerable, but do not have microcode available to help deal with a fix,
 *   or because we aren't vulnerable. This is the initial value.
 *
 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
 *   mitigate MDS is present, also perform the equivalent of the MDS flush;
 *   however, it only flushes the MDS related micro-architectural state on the
 *   current hyperthread, it does not do anything for the twin.
 *
 * - x86_md_clear which will flush the MDS related state. This is done when we
 *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
 *   (RDCL_NO is set).
 */
void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2494 2567
2495 2568 static void
2496 2569 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2497 2570 {
2498 2571 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2499 2572
2500 2573 /*
2501 2574 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2502 2575 * has been fixed in hardware, it doesn't cover everything related to
2503 2576 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2504 2577 * need to mitigate this.
2505 2578 */
2506 2579 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2507 2580 is_x86_feature(featureset, X86FSET_MDS_NO)) {
2508 2581 return;
2509 2582 }
2510 2583
2511 2584 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2512 2585 const uint8_t nop = NOP_INSTR;
2513 2586 uint8_t *md = (uint8_t *)x86_md_clear;
2514 2587
2515 2588 *md = nop;
2516 2589 }
2517 2590
2518 2591 membar_producer();
2519 2592 }
2520 2593
2521 2594 static void
2522 2595 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2523 2596 {
2524 2597 boolean_t need_l1d, need_mds;
2525 2598 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2526 2599
2527 2600 /*
2528 2601 * If we're not on Intel or we've mitigated both RDCL and MDS in
2529 2602 * hardware, then there's nothing left for us to do for enabling the
2530 2603 * flush. We can also go ahead and say that SMT exclusion is
2531 2604 * unnecessary.
2532 2605 */
2533 2606 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2534 2607 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2535 2608 is_x86_feature(featureset, X86FSET_MDS_NO))) {
2536 2609 extern int smt_exclusion;
2537 2610 smt_exclusion = 0;
2538 2611 spec_uarch_flush = spec_uarch_flush_noop;
2539 2612 membar_producer();
2540 2613 return;
2541 2614 }
2542 2615
2543 2616 /*
2544 2617 * The locations where we need to perform an L1D flush are required both
2545 2618 * for mitigating L1TF and MDS. When verw support is present in
2546 2619 * microcode, then the L1D flush will take care of doing that as well.
2547 2620 * However, if we have a system where RDCL_NO is present, but we don't
2548 2621 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2549 2622 * L1D flush.
2550 2623 */
2551 2624 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2552 2625 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2553 2626 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2554 2627 need_l1d = B_TRUE;
2555 2628 } else {
2556 2629 need_l1d = B_FALSE;
2557 2630 }
2558 2631
2559 2632 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2560 2633 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2561 2634 need_mds = B_TRUE;
2562 2635 } else {
2563 2636 need_mds = B_FALSE;
2564 2637 }
2565 2638
2566 2639 if (need_l1d) {
2567 2640 spec_uarch_flush = spec_uarch_flush_msr;
2568 2641 } else if (need_mds) {
2569 2642 spec_uarch_flush = x86_md_clear;
2570 2643 } else {
2571 2644 /*
2572 2645 * We have no hardware mitigations available to us.
2573 2646 */
2574 2647 spec_uarch_flush = spec_uarch_flush_noop;
2575 2648 }
2576 2649 membar_producer();
2577 2650 }
2578 2651
2579 2652 /*
2580 2653 * We default to enabling RSB mitigations.
2581 2654 */
2582 2655 static void
2583 2656 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2584 2657 {
2585 2658 const uint8_t ret = RET_INSTR;
2586 2659 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2587 2660
2588 2661 switch (mit) {
2589 2662 case X86_SPECTREV2_ENHANCED_IBRS:
2590 2663 case X86_SPECTREV2_DISABLED:
2591 2664 *stuff = ret;
2592 2665 break;
2593 2666 default:
2594 2667 break;
2595 2668 }
2596 2669 }
2597 2670
2598 2671 static void
2599 2672 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2600 2673 {
2601 2674 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2602 2675 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2603 2676 "_r14", "_r15" };
2604 2677 const uint_t nthunks = ARRAY_SIZE(thunks);
2605 2678 const char *type;
2606 2679 uint_t i;
2607 2680
2608 2681 if (mit == x86_spectrev2_mitigation)
2609 2682 return;
2610 2683
2611 2684 switch (mit) {
2612 2685 case X86_SPECTREV2_RETPOLINE:
2613 2686 type = "gen";
2614 2687 break;
2615 2688 case X86_SPECTREV2_RETPOLINE_AMD:
2616 2689 type = "amd";
2617 2690 break;
2618 2691 case X86_SPECTREV2_ENHANCED_IBRS:
2619 2692 case X86_SPECTREV2_DISABLED:
2620 2693 type = "jmp";
2621 2694 break;
2622 2695 default:
2623 2696 panic("asked to updated retpoline state with unknown state!");
2624 2697 }
2625 2698
2626 2699 for (i = 0; i < nthunks; i++) {
2627 2700 uintptr_t source, dest;
2628 2701 int ssize, dsize;
2629 2702 char sourcebuf[64], destbuf[64];
2630 2703 size_t len;
2631 2704
2632 2705 (void) snprintf(destbuf, sizeof (destbuf),
2633 2706 "__x86_indirect_thunk%s", thunks[i]);
2634 2707 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2635 2708 "__x86_indirect_thunk_%s%s", type, thunks[i]);
2636 2709
2637 2710 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2638 2711 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2639 2712 VERIFY3U(source, !=, 0);
2640 2713 VERIFY3U(dest, !=, 0);
2641 2714 VERIFY3S(dsize, >=, ssize);
2642 2715 bcopy((void *)source, (void *)dest, ssize);
2643 2716 }
2644 2717 }
2645 2718
2646 2719 static void
2647 2720 cpuid_enable_enhanced_ibrs(void)
2648 2721 {
2649 2722 uint64_t val;
2650 2723
2651 2724 val = rdmsr(MSR_IA32_SPEC_CTRL);
2652 2725 val |= IA32_SPEC_CTRL_IBRS;
2653 2726 wrmsr(MSR_IA32_SPEC_CTRL, val);
2654 2727 }
2655 2728
2656 2729 #ifndef __xpv
2657 2730 /*
2658 2731 * Determine whether or not we can use the AMD optimized retpoline
2659 2732 * functionality. We use this when we know we're on an AMD system and we can
2660 2733 * successfully verify that lfence is dispatch serializing.
2661 2734 */
2662 2735 static boolean_t
2663 2736 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2664 2737 {
2665 2738 uint64_t val;
2666 2739 on_trap_data_t otd;
2667 2740
2668 2741 if (cpi->cpi_vendor != X86_VENDOR_AMD)
2669 2742 return (B_FALSE);
2670 2743
2671 2744 /*
2672 2745 * We need to determine whether or not lfence is serializing. It always
2673 2746 * is on families 0xf and 0x11. On others, it's controlled by
2674 2747 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2675 2748 * crazy old family, don't try and do anything.
2676 2749 */
2677 2750 if (cpi->cpi_family < 0xf)
2678 2751 return (B_FALSE);
2679 2752 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2680 2753 return (B_TRUE);
2681 2754
2682 2755 /*
2683 2756 * While it may be tempting to use get_hwenv(), there are no promises
2684 2757 * that a hypervisor will actually declare themselves to be so in a
2685 2758 * friendly way. As such, try to read and set the MSR. If we can then
2686 2759 * read back the value we set (it wasn't just set to zero), then we go
2687 2760 * for it.
2688 2761 */
2689 2762 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2690 2763 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2691 2764 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2692 2765 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2693 2766 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2694 2767 } else {
↓ open down ↓ |
1318 lines elided |
↑ open up ↑ |
2695 2768 val = 0;
2696 2769 }
2697 2770 no_trap();
2698 2771
2699 2772 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2700 2773 return (B_TRUE);
2701 2774 return (B_FALSE);
2702 2775 }
2703 2776 #endif /* !__xpv */
2704 2777
2778 +/*
2779 + * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2780 + * we can disable TSX, we do so.
2781 + *
2782 + * This determination is done only on the boot CPU, potentially after loading
2783 + * updated microcode.
2784 + */
2705 2785 static void
2786 +cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2787 +{
2788 + struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2789 +
2790 + VERIFY(cpu->cpu_id == 0);
2791 +
2792 + if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2793 + x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2794 + return;
2795 + }
2796 +
2797 + if (x86_disable_taa) {
2798 + x86_taa_mitigation = X86_TAA_DISABLED;
2799 + return;
2800 + }
2801 +
2802 + /*
2803 + * If we do not have the ability to disable TSX, then our only
2804 + * mitigation options are in hardware (TAA_NO), or by using our existing
2805 + * MDS mitigation as described above. The latter relies upon us having
2806 + * configured MDS mitigations correctly! This includes disabling SMT if
2807 + * we want cross-CPU-thread protection.
2808 + */
2809 + if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2810 + /*
2811 + * It's not clear whether any parts will enumerate TAA_NO
2812 + * *without* TSX_CTRL, but let's mark it as such if we see this.
2813 + */
2814 + if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2815 + x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2816 + return;
2817 + }
2818 +
2819 + if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2820 + !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2821 + x86_taa_mitigation = X86_TAA_MD_CLEAR;
2822 + } else {
2823 + x86_taa_mitigation = X86_TAA_NOTHING;
2824 + }
2825 + return;
2826 + }
2827 +
2828 + /*
2829 + * We have TSX_CTRL, but we can only fully disable TSX if we're early
2830 + * enough in boot.
2831 + *
2832 + * Otherwise, we'll fall back to causing transactions to abort as our
2833 + * mitigation. TSX-using code will always take the fallback path.
2834 + */
2835 + if (cpi->cpi_pass < 4) {
2836 + x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2837 + } else {
2838 + x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2839 + }
2840 +}
2841 +
2842 +/*
2843 + * As mentioned, we should only touch the MSR when we've got a suitable
2844 + * microcode loaded on this CPU.
2845 + */
2846 +static void
2847 +cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
2848 +{
2849 + uint64_t val;
2850 +
2851 + switch (taa) {
2852 + case X86_TAA_TSX_DISABLE:
2853 + if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2854 + return;
2855 + val = rdmsr(MSR_IA32_TSX_CTRL);
2856 + val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2857 + wrmsr(MSR_IA32_TSX_CTRL, val);
2858 + break;
2859 + case X86_TAA_TSX_FORCE_ABORT:
2860 + if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2861 + return;
2862 + val = rdmsr(MSR_IA32_TSX_CTRL);
2863 + val |= IA32_TSX_CTRL_RTM_DISABLE;
2864 + wrmsr(MSR_IA32_TSX_CTRL, val);
2865 + break;
2866 + case X86_TAA_HW_MITIGATED:
2867 + case X86_TAA_MD_CLEAR:
2868 + case X86_TAA_DISABLED:
2869 + case X86_TAA_NOTHING:
2870 + break;
2871 + }
2872 +}
2873 +
2874 +static void
2706 2875 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2707 2876 {
2708 2877 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2709 2878 x86_spectrev2_mitigation_t v2mit;
2710 2879
2711 2880 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2712 2881 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2713 2882 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2714 2883 add_x86_feature(featureset, X86FSET_IBPB);
2715 2884 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2716 2885 add_x86_feature(featureset, X86FSET_IBRS);
2717 2886 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2718 2887 add_x86_feature(featureset, X86FSET_STIBP);
2719 2888 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2720 2889 add_x86_feature(featureset, X86FSET_STIBP_ALL);
2721 2890 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2722 2891 add_x86_feature(featureset, X86FSET_SSBD);
2723 2892 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2724 2893 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2725 2894 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2726 2895 add_x86_feature(featureset, X86FSET_SSB_NO);
2727 2896 /*
2728 2897 * Don't enable enhanced IBRS unless we're told that we should
2729 2898 * prefer it and it has the same semantics as Intel. This is
2730 2899 * split into two bits rather than a single one.
2731 2900 */
2732 2901 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2733 2902 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2734 2903 add_x86_feature(featureset, X86FSET_IBRS_ALL);
2735 2904 }
2736 2905
2737 2906 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2738 2907 cpi->cpi_maxeax >= 7) {
2739 2908 struct cpuid_regs *ecp;
2740 2909 ecp = &cpi->cpi_std[7];
2741 2910
2742 2911 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2743 2912 add_x86_feature(featureset, X86FSET_MD_CLEAR);
2744 2913 }
2745 2914
2746 2915 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2747 2916 add_x86_feature(featureset, X86FSET_IBRS);
2748 2917 add_x86_feature(featureset, X86FSET_IBPB);
2749 2918 }
2750 2919
2751 2920 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2752 2921 add_x86_feature(featureset, X86FSET_STIBP);
2753 2922 }
2754 2923
2755 2924 /*
2756 2925 * Don't read the arch caps MSR on xpv where we lack the
2757 2926 * on_trap().
2758 2927 */
2759 2928 #ifndef __xpv
2760 2929 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2761 2930 on_trap_data_t otd;
2762 2931
2763 2932 /*
2764 2933 * Be paranoid and assume we'll get a #GP.
2765 2934 */
2766 2935 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2767 2936 uint64_t reg;
2768 2937
2769 2938 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2770 2939 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2771 2940 add_x86_feature(featureset,
2772 2941 X86FSET_RDCL_NO);
2773 2942 }
2774 2943 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2775 2944 add_x86_feature(featureset,
2776 2945 X86FSET_IBRS_ALL);
2777 2946 }
2778 2947 if (reg & IA32_ARCH_CAP_RSBA) {
2779 2948 add_x86_feature(featureset,
2780 2949 X86FSET_RSBA);
2781 2950 }
2782 2951 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2783 2952 add_x86_feature(featureset,
↓ open down ↓ |
68 lines elided |
↑ open up ↑ |
2784 2953 X86FSET_L1D_VM_NO);
2785 2954 }
2786 2955 if (reg & IA32_ARCH_CAP_SSB_NO) {
2787 2956 add_x86_feature(featureset,
2788 2957 X86FSET_SSB_NO);
2789 2958 }
2790 2959 if (reg & IA32_ARCH_CAP_MDS_NO) {
2791 2960 add_x86_feature(featureset,
2792 2961 X86FSET_MDS_NO);
2793 2962 }
2963 + if (reg & IA32_ARCH_CAP_TSX_CTRL) {
2964 + add_x86_feature(featureset,
2965 + X86FSET_TSX_CTRL);
2966 + }
2967 + if (reg & IA32_ARCH_CAP_TAA_NO) {
2968 + add_x86_feature(featureset,
2969 + X86FSET_TAA_NO);
2970 + }
2794 2971 }
2795 2972 no_trap();
2796 2973 }
2797 2974 #endif /* !__xpv */
2798 2975
2799 2976 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2800 2977 add_x86_feature(featureset, X86FSET_SSBD);
2801 2978
2802 2979 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2803 2980 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2804 2981 }
2805 2982
2983 + /*
2984 + * Take care of certain mitigations on the non-boot CPU. The boot CPU
2985 + * will have already run this function and determined what we need to
2986 + * do. This gives us a hook for per-HW thread mitigations such as
2987 + * enhanced IBRS, or disabling TSX.
2988 + */
2806 2989 if (cpu->cpu_id != 0) {
2807 2990 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2808 2991 cpuid_enable_enhanced_ibrs();
2809 2992 }
2993 +
2994 + cpuid_apply_tsx(x86_taa_mitigation, featureset);
2810 2995 return;
2811 2996 }
2812 2997
2813 2998 /*
2814 2999 * Go through and initialize various security mechanisms that we should
2815 - * only do on a single CPU. This includes Spectre V2, L1TF, and MDS.
3000 + * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3001 + * TAA.
2816 3002 */
2817 3003
2818 3004 /*
2819 3005 * By default we've come in with retpolines enabled. Check whether we
2820 3006 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2821 3007 * by default, but disabled if we are using enhanced IBRS.
2822 3008 */
2823 3009 if (x86_disable_spectrev2 != 0) {
2824 3010 v2mit = X86_SPECTREV2_DISABLED;
2825 3011 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2826 3012 cpuid_enable_enhanced_ibrs();
2827 3013 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2828 3014 #ifndef __xpv
2829 3015 } else if (cpuid_use_amd_retpoline(cpi)) {
2830 3016 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2831 3017 #endif /* !__xpv */
2832 3018 } else {
2833 3019 v2mit = X86_SPECTREV2_RETPOLINE;
2834 3020 }
2835 3021
2836 3022 cpuid_patch_retpolines(v2mit);
2837 3023 cpuid_patch_rsb(v2mit);
2838 3024 x86_spectrev2_mitigation = v2mit;
2839 3025 membar_producer();
2840 3026
2841 3027 /*
2842 3028 * We need to determine what changes are required for mitigating L1TF
2843 3029 * and MDS. If the CPU suffers from either of them, then SMT exclusion
2844 3030 * is required.
2845 3031 *
2846 3032 * If any of these are present, then we need to flush u-arch state at
2847 3033 * various points. For MDS, we need to do so whenever we change to a
2848 3034 * lesser privilege level or we are halting the CPU. For L1TF we need to
2849 3035 * flush the L1D cache at VM entry. When we have microcode that handles
2850 3036 * MDS, the L1D flush also clears the other u-arch state that the
2851 3037 * md_clear does.
2852 3038 */
2853 3039
2854 3040 /*
↓ open down ↓ |
29 lines elided |
↑ open up ↑ |
2855 3041 * Update whether or not we need to be taking explicit action against
2856 3042 * MDS.
2857 3043 */
2858 3044 cpuid_update_md_clear(cpu, featureset);
2859 3045
2860 3046 /*
2861 3047 * Determine whether SMT exclusion is required and whether or not we
2862 3048 * need to perform an l1d flush.
2863 3049 */
2864 3050 cpuid_update_l1d_flush(cpu, featureset);
3051 +
3052 + /*
3053 + * Determine what our mitigation strategy should be for TAA and then
3054 + * also apply TAA mitigations.
3055 + */
3056 + cpuid_update_tsx(cpu, featureset);
3057 + cpuid_apply_tsx(x86_taa_mitigation, featureset);
2865 3058 }
2866 3059
2867 3060 /*
2868 3061 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
2869 3062 */
2870 3063 void
2871 3064 setup_xfem(void)
2872 3065 {
2873 3066 uint64_t flags = XFEATURE_LEGACY_FP;
2874 3067
2875 3068 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2876 3069
2877 3070 if (is_x86_feature(x86_featureset, X86FSET_SSE))
2878 3071 flags |= XFEATURE_SSE;
2879 3072
2880 3073 if (is_x86_feature(x86_featureset, X86FSET_AVX))
2881 3074 flags |= XFEATURE_AVX;
2882 3075
2883 3076 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2884 3077 flags |= XFEATURE_AVX512;
2885 3078
2886 3079 set_xcr(XFEATURE_ENABLED_MASK, flags);
2887 3080
2888 3081 xsave_bv_all = flags;
2889 3082 }
2890 3083
2891 3084 static void
2892 3085 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2893 3086 {
2894 3087 struct cpuid_info *cpi;
2895 3088
2896 3089 cpi = cpu->cpu_m.mcpu_cpi;
2897 3090
2898 3091 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2899 3092 cpuid_gather_amd_topology_leaves(cpu);
2900 3093 }
2901 3094
2902 3095 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2903 3096
2904 3097 /*
2905 3098 * Before we can calculate the IDs that we should assign to this
2906 3099 * processor, we need to understand how many cores and threads it has.
2907 3100 */
2908 3101 switch (cpi->cpi_vendor) {
2909 3102 case X86_VENDOR_Intel:
2910 3103 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2911 3104 &cpi->cpi_ncore_per_chip);
2912 3105 break;
2913 3106 case X86_VENDOR_AMD:
2914 3107 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2915 3108 &cpi->cpi_ncore_per_chip);
2916 3109 break;
2917 3110 default:
2918 3111 /*
2919 3112 * If we have some other x86 compatible chip, it's not clear how
2920 3113 * they would behave. The most common case is virtualization
2921 3114 * today, though there are also 64-bit VIA chips. Assume that
2922 3115 * all we can get is the basic Leaf 1 HTT information.
2923 3116 */
2924 3117 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2925 3118 cpi->cpi_ncore_per_chip = 1;
2926 3119 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2927 3120 }
2928 3121 break;
2929 3122 }
2930 3123
2931 3124 /*
2932 3125 * Based on the calculated number of threads and cores, potentially
2933 3126 * assign the HTT and CMP features.
2934 3127 */
2935 3128 if (cpi->cpi_ncore_per_chip > 1) {
2936 3129 add_x86_feature(featureset, X86FSET_CMP);
2937 3130 }
2938 3131
2939 3132 if (cpi->cpi_ncpu_per_chip > 1 &&
2940 3133 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2941 3134 add_x86_feature(featureset, X86FSET_HTT);
2942 3135 }
2943 3136
2944 3137 /*
2945 3138 * Now that this has been set up, we need to go through and calculate all
2946 3139 * of the rest of the parameters that exist. If we think the CPU doesn't
2947 3140 * have either SMT (HTT) or CMP, then we basically go through and fake
2948 3141 * up information in some way. The most likely case for this is
2949 3142 * virtualization where we have a lot of partial topology information.
2950 3143 */
2951 3144 if (!is_x86_feature(featureset, X86FSET_HTT) &&
2952 3145 !is_x86_feature(featureset, X86FSET_CMP)) {
2953 3146 /*
2954 3147 * This is a single core, single-threaded processor.
2955 3148 */
2956 3149 cpi->cpi_procnodes_per_pkg = 1;
2957 3150 cpi->cpi_cores_per_compunit = 1;
2958 3151 cpi->cpi_compunitid = 0;
2959 3152 cpi->cpi_chipid = -1;
2960 3153 cpi->cpi_clogid = 0;
2961 3154 cpi->cpi_coreid = cpu->cpu_id;
2962 3155 cpi->cpi_pkgcoreid = 0;
2963 3156 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2964 3157 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2965 3158 } else {
2966 3159 cpi->cpi_procnodeid = cpi->cpi_chipid;
2967 3160 }
2968 3161 } else {
2969 3162 switch (cpi->cpi_vendor) {
2970 3163 case X86_VENDOR_Intel:
2971 3164 cpuid_intel_getids(cpu, featureset);
2972 3165 break;
2973 3166 case X86_VENDOR_AMD:
2974 3167 cpuid_amd_getids(cpu, featureset);
2975 3168 break;
2976 3169 default:
2977 3170 /*
2978 3171 * In this case, it's hard to say what we should do.
2979 3172 * We're going to model them to the OS as single core
2980 3173 * threads. We don't have a good identifier for them, so
2981 3174 * we're just going to use the cpu id all on a single
2982 3175 * chip.
2983 3176 *
2984 3177 * This case has historically been different from the
2985 3178 * case above where we don't have HTT or CMP. While they
2986 3179 * could be combined, we've opted to keep it separate to
2987 3180 * minimize the risk of topology changes in weird cases.
2988 3181 */
2989 3182 cpi->cpi_procnodes_per_pkg = 1;
2990 3183 cpi->cpi_cores_per_compunit = 1;
2991 3184 cpi->cpi_chipid = 0;
2992 3185 cpi->cpi_coreid = cpu->cpu_id;
2993 3186 cpi->cpi_clogid = cpu->cpu_id;
2994 3187 cpi->cpi_pkgcoreid = cpu->cpu_id;
2995 3188 cpi->cpi_procnodeid = cpi->cpi_chipid;
2996 3189 cpi->cpi_compunitid = cpi->cpi_coreid;
2997 3190 break;
2998 3191 }
2999 3192 }
3000 3193 }
3001 3194
3002 3195 /*
3003 3196 * Gather relevant CPU features from leaf 6 which covers thermal information. We
3004 3197 * always gather leaf 6 if it's supported; however, we only look for features on
3005 3198 * Intel systems as AMD does not currently define any of the features we look
3006 3199 * for below.
3007 3200 */
3008 3201 static void
3009 3202 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3010 3203 {
3011 3204 struct cpuid_regs *cp;
3012 3205 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3013 3206
3014 3207 if (cpi->cpi_maxeax < 6) {
3015 3208 return;
3016 3209 }
3017 3210
3018 3211 cp = &cpi->cpi_std[6];
3019 3212 cp->cp_eax = 6;
3020 3213 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3021 3214 (void) __cpuid_insn(cp);
3022 3215 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3023 3216
3024 3217 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3025 3218 return;
3026 3219 }
3027 3220
3028 3221 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3029 3222 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3030 3223 }
3031 3224
3032 3225 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3033 3226 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3034 3227 }
3035 3228 }
3036 3229
3037 3230 void
3038 3231 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3039 3232 {
3040 3233 uint32_t mask_ecx, mask_edx;
3041 3234 struct cpuid_info *cpi;
3042 3235 struct cpuid_regs *cp;
3043 3236 int xcpuid;
3044 3237 #if !defined(__xpv)
3045 3238 extern int idle_cpu_prefer_mwait;
3046 3239 #endif
3047 3240
3048 3241 /*
3049 3242 * Space statically allocated for BSP, ensure pointer is set
3050 3243 */
3051 3244 if (cpu->cpu_id == 0) {
3052 3245 if (cpu->cpu_m.mcpu_cpi == NULL)
3053 3246 cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3054 3247 }
3055 3248
3056 3249 add_x86_feature(featureset, X86FSET_CPUID);
3057 3250
3058 3251 cpi = cpu->cpu_m.mcpu_cpi;
3059 3252 ASSERT(cpi != NULL);
3060 3253 cp = &cpi->cpi_std[0];
3061 3254 cp->cp_eax = 0;
3062 3255 cpi->cpi_maxeax = __cpuid_insn(cp);
3063 3256 {
3064 3257 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3065 3258 *iptr++ = cp->cp_ebx;
3066 3259 *iptr++ = cp->cp_edx;
3067 3260 *iptr++ = cp->cp_ecx;
3068 3261 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3069 3262 }
3070 3263
3071 3264 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3072 3265 x86_vendor = cpi->cpi_vendor; /* for compatibility */
3073 3266
3074 3267 /*
3075 3268 * Limit the range in case of weird hardware
3076 3269 */
3077 3270 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3078 3271 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3079 3272 if (cpi->cpi_maxeax < 1)
3080 3273 goto pass1_done;
3081 3274
3082 3275 cp = &cpi->cpi_std[1];
3083 3276 cp->cp_eax = 1;
3084 3277 (void) __cpuid_insn(cp);
3085 3278
3086 3279 /*
3087 3280 * Extract identifying constants for easy access.
3088 3281 */
3089 3282 cpi->cpi_model = CPI_MODEL(cpi);
3090 3283 cpi->cpi_family = CPI_FAMILY(cpi);
3091 3284
3092 3285 if (cpi->cpi_family == 0xf)
3093 3286 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3094 3287
3095 3288 /*
3096 3289 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3097 3290 * Intel, and presumably everyone else, uses model == 0xf, as
3098 3291 * one would expect (max value means possible overflow). Sigh.
3099 3292 */
3100 3293
3101 3294 switch (cpi->cpi_vendor) {
3102 3295 case X86_VENDOR_Intel:
3103 3296 if (IS_EXTENDED_MODEL_INTEL(cpi))
3104 3297 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3105 3298 break;
3106 3299 case X86_VENDOR_AMD:
3107 3300 if (CPI_FAMILY(cpi) == 0xf)
3108 3301 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3109 3302 break;
3110 3303 default:
3111 3304 if (cpi->cpi_model == 0xf)
3112 3305 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3113 3306 break;
3114 3307 }
3115 3308
3116 3309 cpi->cpi_step = CPI_STEP(cpi);
3117 3310 cpi->cpi_brandid = CPI_BRANDID(cpi);
3118 3311
3119 3312 /*
3120 3313 * *default* assumptions:
3121 3314 * - believe %edx feature word
3122 3315 * - ignore %ecx feature word
3123 3316 * - 32-bit virtual and physical addressing
3124 3317 */
3125 3318 mask_edx = 0xffffffff;
3126 3319 mask_ecx = 0;
3127 3320
3128 3321 cpi->cpi_pabits = cpi->cpi_vabits = 32;
3129 3322
3130 3323 switch (cpi->cpi_vendor) {
3131 3324 case X86_VENDOR_Intel:
3132 3325 if (cpi->cpi_family == 5)
3133 3326 x86_type = X86_TYPE_P5;
3134 3327 else if (IS_LEGACY_P6(cpi)) {
3135 3328 x86_type = X86_TYPE_P6;
3136 3329 pentiumpro_bug4046376 = 1;
3137 3330 /*
3138 3331 * Clear the SEP bit when it was set erroneously
3139 3332 */
3140 3333 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3141 3334 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3142 3335 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3143 3336 x86_type = X86_TYPE_P4;
3144 3337 /*
3145 3338 * We don't currently depend on any of the %ecx
3146 3339 * features until Prescott, so we'll only check
3147 3340 * this from P4 onwards. We might want to revisit
3148 3341 * that idea later.
3149 3342 */
3150 3343 mask_ecx = 0xffffffff;
3151 3344 } else if (cpi->cpi_family > 0xf)
3152 3345 mask_ecx = 0xffffffff;
3153 3346 /*
3154 3347 * We don't support MONITOR/MWAIT if leaf 5 is not available
3155 3348 * to obtain the monitor linesize.
3156 3349 */
3157 3350 if (cpi->cpi_maxeax < 5)
3158 3351 mask_ecx &= ~CPUID_INTC_ECX_MON;
3159 3352 break;
3160 3353 case X86_VENDOR_IntelClone:
3161 3354 default:
3162 3355 break;
3163 3356 case X86_VENDOR_AMD:
3164 3357 #if defined(OPTERON_ERRATUM_108)
3165 3358 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3166 3359 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3167 3360 cpi->cpi_model = 0xc;
3168 3361 } else
3169 3362 #endif
3170 3363 if (cpi->cpi_family == 5) {
3171 3364 /*
3172 3365 * AMD K5 and K6
3173 3366 *
3174 3367 * These CPUs have an incomplete implementation
3175 3368 * of MCA/MCE which we mask away.
3176 3369 */
3177 3370 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3178 3371
3179 3372 /*
3180 3373 * Model 0 uses the wrong (APIC) bit
3181 3374 * to indicate PGE. Fix it here.
3182 3375 */
3183 3376 if (cpi->cpi_model == 0) {
3184 3377 if (cp->cp_edx & 0x200) {
3185 3378 cp->cp_edx &= ~0x200;
3186 3379 cp->cp_edx |= CPUID_INTC_EDX_PGE;
3187 3380 }
3188 3381 }
3189 3382
3190 3383 /*
3191 3384 * Early models had problems w/ MMX; disable.
3192 3385 */
3193 3386 if (cpi->cpi_model < 6)
3194 3387 mask_edx &= ~CPUID_INTC_EDX_MMX;
3195 3388 }
3196 3389
3197 3390 /*
3198 3391 * For newer families, SSE3 and CX16, at least, are valid;
3199 3392 * enable all
3200 3393 */
3201 3394 if (cpi->cpi_family >= 0xf)
3202 3395 mask_ecx = 0xffffffff;
3203 3396 /*
3204 3397 * We don't support MONITOR/MWAIT if leaf 5 is not available
3205 3398 * to obtain the monitor linesize.
3206 3399 */
3207 3400 if (cpi->cpi_maxeax < 5)
3208 3401 mask_ecx &= ~CPUID_INTC_ECX_MON;
3209 3402
3210 3403 #if !defined(__xpv)
3211 3404 /*
3212 3405 * AMD has not historically used MWAIT in the CPU's idle loop.
3213 3406 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3214 3407 * know for certain that in at least family 17h, per AMD, mwait
3215 3408 * is preferred. Families in-between are less certain.
3216 3409 */
3217 3410 if (cpi->cpi_family < 0x17) {
3218 3411 idle_cpu_prefer_mwait = 0;
3219 3412 }
3220 3413 #endif
3221 3414
3222 3415 break;
3223 3416 case X86_VENDOR_TM:
3224 3417 /*
3225 3418 * work around the NT workaround in CMS 4.1
3226 3419 */
3227 3420 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3228 3421 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3229 3422 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3230 3423 break;
3231 3424 case X86_VENDOR_Centaur:
3232 3425 /*
3233 3426 * work around the NT workarounds again
3234 3427 */
3235 3428 if (cpi->cpi_family == 6)
3236 3429 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3237 3430 break;
3238 3431 case X86_VENDOR_Cyrix:
3239 3432 /*
3240 3433 * We rely heavily on the probing in locore
3241 3434 * to actually figure out what parts, if any,
3242 3435 * of the Cyrix cpuid instruction to believe.
3243 3436 */
3244 3437 switch (x86_type) {
3245 3438 case X86_TYPE_CYRIX_486:
3246 3439 mask_edx = 0;
3247 3440 break;
3248 3441 case X86_TYPE_CYRIX_6x86:
3249 3442 mask_edx = 0;
3250 3443 break;
3251 3444 case X86_TYPE_CYRIX_6x86L:
3252 3445 mask_edx =
3253 3446 CPUID_INTC_EDX_DE |
3254 3447 CPUID_INTC_EDX_CX8;
3255 3448 break;
3256 3449 case X86_TYPE_CYRIX_6x86MX:
3257 3450 mask_edx =
3258 3451 CPUID_INTC_EDX_DE |
3259 3452 CPUID_INTC_EDX_MSR |
3260 3453 CPUID_INTC_EDX_CX8 |
3261 3454 CPUID_INTC_EDX_PGE |
3262 3455 CPUID_INTC_EDX_CMOV |
3263 3456 CPUID_INTC_EDX_MMX;
3264 3457 break;
3265 3458 case X86_TYPE_CYRIX_GXm:
3266 3459 mask_edx =
3267 3460 CPUID_INTC_EDX_MSR |
3268 3461 CPUID_INTC_EDX_CX8 |
3269 3462 CPUID_INTC_EDX_CMOV |
3270 3463 CPUID_INTC_EDX_MMX;
3271 3464 break;
3272 3465 case X86_TYPE_CYRIX_MediaGX:
3273 3466 break;
3274 3467 case X86_TYPE_CYRIX_MII:
3275 3468 case X86_TYPE_VIA_CYRIX_III:
3276 3469 mask_edx =
3277 3470 CPUID_INTC_EDX_DE |
3278 3471 CPUID_INTC_EDX_TSC |
3279 3472 CPUID_INTC_EDX_MSR |
3280 3473 CPUID_INTC_EDX_CX8 |
3281 3474 CPUID_INTC_EDX_PGE |
3282 3475 CPUID_INTC_EDX_CMOV |
3283 3476 CPUID_INTC_EDX_MMX;
3284 3477 break;
3285 3478 default:
3286 3479 break;
3287 3480 }
3288 3481 break;
3289 3482 }
3290 3483
3291 3484 #if defined(__xpv)
3292 3485 /*
3293 3486 * Do not support MONITOR/MWAIT under a hypervisor
3294 3487 */
3295 3488 mask_ecx &= ~CPUID_INTC_ECX_MON;
3296 3489 /*
3297 3490 * Do not support XSAVE under a hypervisor for now
3298 3491 */
3299 3492 xsave_force_disable = B_TRUE;
3300 3493
3301 3494 #endif /* __xpv */
3302 3495
3303 3496 if (xsave_force_disable) {
3304 3497 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3305 3498 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3306 3499 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3307 3500 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3308 3501 }
3309 3502
3310 3503 /*
3311 3504 * Now we've figured out the masks that determine
3312 3505 * which bits we choose to believe, apply the masks
3313 3506 * to the feature words, then map the kernel's view
3314 3507 * of these feature words into its feature word.
3315 3508 */
3316 3509 cp->cp_edx &= mask_edx;
3317 3510 cp->cp_ecx &= mask_ecx;
3318 3511
3319 3512 /*
3320 3513 * apply any platform restrictions (we don't call this
3321 3514 * immediately after __cpuid_insn here, because we need the
3322 3515 * workarounds applied above first)
3323 3516 */
3324 3517 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3325 3518
3326 3519 /*
3327 3520 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3328 3521 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3329 3522 */
3330 3523 if (cpi->cpi_maxeax >= 7) {
3331 3524 struct cpuid_regs *ecp;
3332 3525 ecp = &cpi->cpi_std[7];
3333 3526 ecp->cp_eax = 7;
3334 3527 ecp->cp_ecx = 0;
3335 3528 (void) __cpuid_insn(ecp);
3336 3529
3337 3530 /*
3338 3531 * If XSAVE has been disabled, just ignore all of the
3339 3532 * extended-save-area dependent flags here.
3340 3533 */
3341 3534 if (xsave_force_disable) {
3342 3535 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3343 3536 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3344 3537 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3345 3538 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3346 3539 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3347 3540 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3348 3541 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3349 3542 }
3350 3543
3351 3544 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3352 3545 add_x86_feature(featureset, X86FSET_SMEP);
3353 3546
3354 3547 /*
3355 3548 * We check disable_smap here in addition to in startup_smap()
3356 3549 * to ensure CPUs that aren't the boot CPU don't accidentally
3357 3550 * include it in the feature set and thus generate a mismatched
3358 3551 * x86 feature set across CPUs.
3359 3552 */
3360 3553 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3361 3554 disable_smap == 0)
3362 3555 add_x86_feature(featureset, X86FSET_SMAP);
3363 3556
3364 3557 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3365 3558 add_x86_feature(featureset, X86FSET_RDSEED);
3366 3559
3367 3560 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3368 3561 add_x86_feature(featureset, X86FSET_ADX);
3369 3562
3370 3563 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3371 3564 add_x86_feature(featureset, X86FSET_FSGSBASE);
3372 3565
3373 3566 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3374 3567 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3375 3568
3376 3569 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3377 3570 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3378 3571 add_x86_feature(featureset, X86FSET_INVPCID);
3379 3572
3380 3573 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3381 3574 add_x86_feature(featureset, X86FSET_MPX);
3382 3575
3383 3576 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3384 3577 add_x86_feature(featureset, X86FSET_CLWB);
3385 3578 }
3386 3579 }
3387 3580
3388 3581 /*
3389 3582 * fold in overrides from the "eeprom" mechanism
3390 3583 */
3391 3584 cp->cp_edx |= cpuid_feature_edx_include;
3392 3585 cp->cp_edx &= ~cpuid_feature_edx_exclude;
3393 3586
3394 3587 cp->cp_ecx |= cpuid_feature_ecx_include;
3395 3588 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3396 3589
3397 3590 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3398 3591 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3399 3592 }
3400 3593 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3401 3594 add_x86_feature(featureset, X86FSET_TSC);
3402 3595 }
3403 3596 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3404 3597 add_x86_feature(featureset, X86FSET_MSR);
3405 3598 }
3406 3599 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3407 3600 add_x86_feature(featureset, X86FSET_MTRR);
3408 3601 }
3409 3602 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3410 3603 add_x86_feature(featureset, X86FSET_PGE);
3411 3604 }
3412 3605 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3413 3606 add_x86_feature(featureset, X86FSET_CMOV);
3414 3607 }
3415 3608 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3416 3609 add_x86_feature(featureset, X86FSET_MMX);
3417 3610 }
3418 3611 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3419 3612 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3420 3613 add_x86_feature(featureset, X86FSET_MCA);
3421 3614 }
3422 3615 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3423 3616 add_x86_feature(featureset, X86FSET_PAE);
3424 3617 }
3425 3618 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3426 3619 add_x86_feature(featureset, X86FSET_CX8);
3427 3620 }
3428 3621 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3429 3622 add_x86_feature(featureset, X86FSET_CX16);
3430 3623 }
3431 3624 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3432 3625 add_x86_feature(featureset, X86FSET_PAT);
3433 3626 }
3434 3627 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3435 3628 add_x86_feature(featureset, X86FSET_SEP);
3436 3629 }
3437 3630 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3438 3631 /*
3439 3632 * In our implementation, fxsave/fxrstor
3440 3633 * are prerequisites before we'll even
3441 3634 * try and do SSE things.
3442 3635 */
3443 3636 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3444 3637 add_x86_feature(featureset, X86FSET_SSE);
3445 3638 }
3446 3639 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3447 3640 add_x86_feature(featureset, X86FSET_SSE2);
3448 3641 }
3449 3642 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3450 3643 add_x86_feature(featureset, X86FSET_SSE3);
3451 3644 }
3452 3645 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3453 3646 add_x86_feature(featureset, X86FSET_SSSE3);
3454 3647 }
3455 3648 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3456 3649 add_x86_feature(featureset, X86FSET_SSE4_1);
3457 3650 }
3458 3651 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3459 3652 add_x86_feature(featureset, X86FSET_SSE4_2);
3460 3653 }
3461 3654 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3462 3655 add_x86_feature(featureset, X86FSET_AES);
3463 3656 }
3464 3657 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3465 3658 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3466 3659 }
3467 3660
3468 3661 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3469 3662 add_x86_feature(featureset, X86FSET_SHA);
3470 3663
3471 3664 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3472 3665 add_x86_feature(featureset, X86FSET_UMIP);
3473 3666 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3474 3667 add_x86_feature(featureset, X86FSET_PKU);
3475 3668 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3476 3669 add_x86_feature(featureset, X86FSET_OSPKE);
3477 3670
3478 3671 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3479 3672 add_x86_feature(featureset, X86FSET_XSAVE);
3480 3673
3481 3674 /* We only test AVX & AVX512 when there is XSAVE */
3482 3675
3483 3676 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3484 3677 add_x86_feature(featureset,
3485 3678 X86FSET_AVX);
3486 3679
3487 3680 /*
3488 3681 * Intel says we can't check these without also
3489 3682 * checking AVX.
3490 3683 */
3491 3684 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3492 3685 add_x86_feature(featureset,
3493 3686 X86FSET_F16C);
3494 3687
3495 3688 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3496 3689 add_x86_feature(featureset,
3497 3690 X86FSET_FMA);
3498 3691
3499 3692 if (cpi->cpi_std[7].cp_ebx &
3500 3693 CPUID_INTC_EBX_7_0_BMI1)
3501 3694 add_x86_feature(featureset,
3502 3695 X86FSET_BMI1);
3503 3696
3504 3697 if (cpi->cpi_std[7].cp_ebx &
3505 3698 CPUID_INTC_EBX_7_0_BMI2)
3506 3699 add_x86_feature(featureset,
3507 3700 X86FSET_BMI2);
3508 3701
3509 3702 if (cpi->cpi_std[7].cp_ebx &
3510 3703 CPUID_INTC_EBX_7_0_AVX2)
3511 3704 add_x86_feature(featureset,
3512 3705 X86FSET_AVX2);
3513 3706 }
3514 3707
3515 3708 if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3516 3709 (cpi->cpi_std[7].cp_ebx &
3517 3710 CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3518 3711 add_x86_feature(featureset, X86FSET_AVX512F);
3519 3712
3520 3713 if (cpi->cpi_std[7].cp_ebx &
3521 3714 CPUID_INTC_EBX_7_0_AVX512DQ)
3522 3715 add_x86_feature(featureset,
3523 3716 X86FSET_AVX512DQ);
3524 3717 if (cpi->cpi_std[7].cp_ebx &
3525 3718 CPUID_INTC_EBX_7_0_AVX512IFMA)
3526 3719 add_x86_feature(featureset,
3527 3720 X86FSET_AVX512FMA);
3528 3721 if (cpi->cpi_std[7].cp_ebx &
3529 3722 CPUID_INTC_EBX_7_0_AVX512PF)
3530 3723 add_x86_feature(featureset,
3531 3724 X86FSET_AVX512PF);
3532 3725 if (cpi->cpi_std[7].cp_ebx &
3533 3726 CPUID_INTC_EBX_7_0_AVX512ER)
3534 3727 add_x86_feature(featureset,
3535 3728 X86FSET_AVX512ER);
3536 3729 if (cpi->cpi_std[7].cp_ebx &
3537 3730 CPUID_INTC_EBX_7_0_AVX512CD)
3538 3731 add_x86_feature(featureset,
3539 3732 X86FSET_AVX512CD);
3540 3733 if (cpi->cpi_std[7].cp_ebx &
3541 3734 CPUID_INTC_EBX_7_0_AVX512BW)
3542 3735 add_x86_feature(featureset,
3543 3736 X86FSET_AVX512BW);
3544 3737 if (cpi->cpi_std[7].cp_ebx &
3545 3738 CPUID_INTC_EBX_7_0_AVX512VL)
3546 3739 add_x86_feature(featureset,
3547 3740 X86FSET_AVX512VL);
3548 3741
3549 3742 if (cpi->cpi_std[7].cp_ecx &
3550 3743 CPUID_INTC_ECX_7_0_AVX512VBMI)
3551 3744 add_x86_feature(featureset,
3552 3745 X86FSET_AVX512VBMI);
3553 3746 if (cpi->cpi_std[7].cp_ecx &
3554 3747 CPUID_INTC_ECX_7_0_AVX512VNNI)
3555 3748 add_x86_feature(featureset,
3556 3749 X86FSET_AVX512VNNI);
3557 3750 if (cpi->cpi_std[7].cp_ecx &
3558 3751 CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3559 3752 add_x86_feature(featureset,
3560 3753 X86FSET_AVX512VPOPCDQ);
3561 3754
3562 3755 if (cpi->cpi_std[7].cp_edx &
3563 3756 CPUID_INTC_EDX_7_0_AVX5124NNIW)
3564 3757 add_x86_feature(featureset,
3565 3758 X86FSET_AVX512NNIW);
3566 3759 if (cpi->cpi_std[7].cp_edx &
3567 3760 CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3568 3761 add_x86_feature(featureset,
3569 3762 X86FSET_AVX512FMAPS);
3570 3763 }
3571 3764 }
3572 3765 }
3573 3766
3574 3767 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3575 3768 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3576 3769 add_x86_feature(featureset, X86FSET_PCID);
3577 3770 }
3578 3771 }
3579 3772
3580 3773 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3581 3774 add_x86_feature(featureset, X86FSET_X2APIC);
3582 3775 }
3583 3776 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3584 3777 add_x86_feature(featureset, X86FSET_DE);
3585 3778 }
3586 3779 #if !defined(__xpv)
3587 3780 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3588 3781
3589 3782 /*
3590 3783 * We require the CLFLUSH instruction for erratum workaround
3591 3784 * to use MONITOR/MWAIT.
3592 3785 */
3593 3786 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3594 3787 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3595 3788 add_x86_feature(featureset, X86FSET_MWAIT);
3596 3789 } else {
3597 3790 extern int idle_cpu_assert_cflush_monitor;
3598 3791
3599 3792 /*
3600 3793 * All processors we are aware of which have
3601 3794 * MONITOR/MWAIT also have CLFLUSH.
3602 3795 */
3603 3796 if (idle_cpu_assert_cflush_monitor) {
3604 3797 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3605 3798 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3606 3799 }
3607 3800 }
3608 3801 }
3609 3802 #endif /* __xpv */
3610 3803
3611 3804 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3612 3805 add_x86_feature(featureset, X86FSET_VMX);
3613 3806 }
3614 3807
3615 3808 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3616 3809 add_x86_feature(featureset, X86FSET_RDRAND);
3617 3810
3618 3811 /*
3619 3812 * Only need it first time, rest of the cpus would follow suit.
3620 3813 * we only capture this for the bootcpu.
3621 3814 */
3622 3815 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3623 3816 add_x86_feature(featureset, X86FSET_CLFSH);
3624 3817 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3625 3818 }
3626 3819 if (is_x86_feature(featureset, X86FSET_PAE))
3627 3820 cpi->cpi_pabits = 36;
3628 3821
3629 3822 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3630 3823 struct cpuid_regs r, *ecp;
3631 3824
3632 3825 ecp = &r;
3633 3826 ecp->cp_eax = 0xD;
3634 3827 ecp->cp_ecx = 1;
3635 3828 ecp->cp_edx = ecp->cp_ebx = 0;
3636 3829 (void) __cpuid_insn(ecp);
3637 3830
3638 3831 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3639 3832 add_x86_feature(featureset, X86FSET_XSAVEOPT);
3640 3833 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3641 3834 add_x86_feature(featureset, X86FSET_XSAVEC);
3642 3835 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3643 3836 add_x86_feature(featureset, X86FSET_XSAVES);
3644 3837 }
3645 3838
3646 3839 /*
3647 3840 * Work on the "extended" feature information, doing
3648 3841 * some basic initialization for cpuid_pass2()
3649 3842 */
3650 3843 xcpuid = 0;
3651 3844 switch (cpi->cpi_vendor) {
3652 3845 case X86_VENDOR_Intel:
3653 3846 /*
3654 3847 * On KVM we know we will have proper support for extended
3655 3848 * cpuid.
3656 3849 */
3657 3850 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3658 3851 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3659 3852 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3660 3853 xcpuid++;
3661 3854 break;
3662 3855 case X86_VENDOR_AMD:
3663 3856 if (cpi->cpi_family > 5 ||
3664 3857 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3665 3858 xcpuid++;
3666 3859 break;
3667 3860 case X86_VENDOR_Cyrix:
3668 3861 /*
3669 3862 * Only these Cyrix CPUs are -known- to support
3670 3863 * extended cpuid operations.
3671 3864 */
3672 3865 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3673 3866 x86_type == X86_TYPE_CYRIX_GXm)
3674 3867 xcpuid++;
3675 3868 break;
3676 3869 case X86_VENDOR_Centaur:
3677 3870 case X86_VENDOR_TM:
3678 3871 default:
3679 3872 xcpuid++;
3680 3873 break;
3681 3874 }
3682 3875
3683 3876 if (xcpuid) {
3684 3877 cp = &cpi->cpi_extd[0];
3685 3878 cp->cp_eax = CPUID_LEAF_EXT_0;
3686 3879 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3687 3880 }
3688 3881
3689 3882 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3690 3883
3691 3884 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3692 3885 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3693 3886
3694 3887 switch (cpi->cpi_vendor) {
3695 3888 case X86_VENDOR_Intel:
3696 3889 case X86_VENDOR_AMD:
3697 3890 if (cpi->cpi_xmaxeax < 0x80000001)
3698 3891 break;
3699 3892 cp = &cpi->cpi_extd[1];
3700 3893 cp->cp_eax = 0x80000001;
3701 3894 (void) __cpuid_insn(cp);
3702 3895
3703 3896 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3704 3897 cpi->cpi_family == 5 &&
3705 3898 cpi->cpi_model == 6 &&
3706 3899 cpi->cpi_step == 6) {
3707 3900 /*
3708 3901 * K6 model 6 uses bit 10 to indicate SYSC
3709 3902 * Later models use bit 11. Fix it here.
3710 3903 */
3711 3904 if (cp->cp_edx & 0x400) {
3712 3905 cp->cp_edx &= ~0x400;
3713 3906 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3714 3907 }
3715 3908 }
3716 3909
3717 3910 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3718 3911
3719 3912 /*
3720 3913 * Compute the additions to the kernel's feature word.
3721 3914 */
3722 3915 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3723 3916 add_x86_feature(featureset, X86FSET_NX);
3724 3917 }
3725 3918
3726 3919 /*
3727 3920 * Regardless whether or not we boot 64-bit,
3728 3921 * we should have a way to identify whether
3729 3922 * the CPU is capable of running 64-bit.
3730 3923 */
3731 3924 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3732 3925 add_x86_feature(featureset, X86FSET_64);
3733 3926 }
3734 3927
3735 3928 /* 1 GB large page - enable only for 64 bit kernel */
3736 3929 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3737 3930 add_x86_feature(featureset, X86FSET_1GPG);
3738 3931 }
3739 3932
3740 3933 if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3741 3934 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3742 3935 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3743 3936 add_x86_feature(featureset, X86FSET_SSE4A);
3744 3937 }
3745 3938
3746 3939 /*
3747 3940 * It's really tricky to support syscall/sysret in
3748 3941 * the i386 kernel; we rely on sysenter/sysexit
3749 3942 * instead. In the amd64 kernel, things are -way-
3750 3943 * better.
3751 3944 */
3752 3945 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3753 3946 add_x86_feature(featureset, X86FSET_ASYSC);
3754 3947 }
3755 3948
3756 3949 /*
3757 3950 * While we're thinking about system calls, note
3758 3951 * that AMD processors don't support sysenter
3759 3952 * in long mode at all, so don't try to program them.
3760 3953 */
3761 3954 if (x86_vendor == X86_VENDOR_AMD) {
3762 3955 remove_x86_feature(featureset, X86FSET_SEP);
3763 3956 }
3764 3957
3765 3958 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3766 3959 add_x86_feature(featureset, X86FSET_TSCP);
3767 3960 }
3768 3961
3769 3962 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3770 3963 add_x86_feature(featureset, X86FSET_SVM);
3771 3964 }
3772 3965
3773 3966 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3774 3967 add_x86_feature(featureset, X86FSET_TOPOEXT);
3775 3968 }
3776 3969
3777 3970 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3778 3971 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3779 3972 }
3780 3973
3781 3974 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3782 3975 add_x86_feature(featureset, X86FSET_XOP);
3783 3976 }
3784 3977
3785 3978 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3786 3979 add_x86_feature(featureset, X86FSET_FMA4);
3787 3980 }
3788 3981
3789 3982 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3790 3983 add_x86_feature(featureset, X86FSET_TBM);
3791 3984 }
3792 3985
3793 3986 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3794 3987 add_x86_feature(featureset, X86FSET_MONITORX);
3795 3988 }
3796 3989 break;
3797 3990 default:
3798 3991 break;
3799 3992 }
3800 3993
3801 3994 /*
3802 3995 * Get CPUID data about processor cores and hyperthreads.
3803 3996 */
3804 3997 switch (cpi->cpi_vendor) {
3805 3998 case X86_VENDOR_Intel:
3806 3999 if (cpi->cpi_maxeax >= 4) {
3807 4000 cp = &cpi->cpi_std[4];
3808 4001 cp->cp_eax = 4;
3809 4002 cp->cp_ecx = 0;
3810 4003 (void) __cpuid_insn(cp);
3811 4004 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3812 4005 }
3813 4006 /*FALLTHROUGH*/
3814 4007 case X86_VENDOR_AMD:
3815 4008 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3816 4009 break;
3817 4010 cp = &cpi->cpi_extd[8];
3818 4011 cp->cp_eax = CPUID_LEAF_EXT_8;
3819 4012 (void) __cpuid_insn(cp);
3820 4013 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3821 4014 cp);
3822 4015
3823 4016 /*
3824 4017 * AMD uses ebx for some extended functions.
3825 4018 */
3826 4019 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3827 4020 /*
3828 4021 * While we're here, check for the AMD "Error
3829 4022 * Pointer Zero/Restore" feature. This can be
3830 4023 * used to setup the FP save handlers
3831 4024 * appropriately.
3832 4025 */
3833 4026 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3834 4027 cpi->cpi_fp_amd_save = 0;
3835 4028 } else {
3836 4029 cpi->cpi_fp_amd_save = 1;
3837 4030 }
3838 4031
3839 4032 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3840 4033 add_x86_feature(featureset,
3841 4034 X86FSET_CLZERO);
3842 4035 }
3843 4036 }
3844 4037
3845 4038 /*
3846 4039 * Virtual and physical address limits from
3847 4040 * cpuid override previously guessed values.
3848 4041 */
3849 4042 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3850 4043 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3851 4044 break;
3852 4045 default:
3853 4046 break;
3854 4047 }
3855 4048
3856 4049 /*
3857 4050 * Get CPUID data about TSC Invariance in Deep C-State.
3858 4051 */
3859 4052 switch (cpi->cpi_vendor) {
3860 4053 case X86_VENDOR_Intel:
3861 4054 case X86_VENDOR_AMD:
3862 4055 if (cpi->cpi_maxeax >= 7) {
3863 4056 cp = &cpi->cpi_extd[7];
3864 4057 cp->cp_eax = 0x80000007;
3865 4058 cp->cp_ecx = 0;
3866 4059 (void) __cpuid_insn(cp);
3867 4060 }
3868 4061 break;
3869 4062 default:
3870 4063 break;
3871 4064 }
3872 4065 }
3873 4066
3874 4067 cpuid_pass1_topology(cpu, featureset);
3875 4068 cpuid_pass1_thermal(cpu, featureset);
3876 4069
3877 4070 /*
3878 4071 * Synthesize chip "revision" and socket type
3879 4072 */
3880 4073 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3881 4074 cpi->cpi_model, cpi->cpi_step);
3882 4075 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3883 4076 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3884 4077 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3885 4078 cpi->cpi_model, cpi->cpi_step);
3886 4079
3887 4080 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3888 4081 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3889 4082 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3890 4083 /* Special handling for AMD FP not necessary. */
3891 4084 cpi->cpi_fp_amd_save = 0;
3892 4085 } else {
3893 4086 cpi->cpi_fp_amd_save = 1;
3894 4087 }
3895 4088 }
3896 4089
3897 4090 /*
3898 4091 * Check the processor leaves that are used for security features.
3899 4092 */
3900 4093 cpuid_scan_security(cpu, featureset);
3901 4094
3902 4095 pass1_done:
3903 4096 cpi->cpi_pass = 1;
3904 4097 }
3905 4098
3906 4099 /*
3907 4100 * Make copies of the cpuid table entries we depend on, in
3908 4101 * part for ease of parsing now, in part so that we have only
3909 4102 * one place to correct any of it, in part for ease of
3910 4103 * later export to userland, and in part so we can look at
3911 4104 * this stuff in a crash dump.
3912 4105 */
3913 4106
3914 4107 /*ARGSUSED*/
3915 4108 void
3916 4109 cpuid_pass2(cpu_t *cpu)
3917 4110 {
3918 4111 uint_t n, nmax;
3919 4112 int i;
3920 4113 struct cpuid_regs *cp;
3921 4114 uint8_t *dp;
3922 4115 uint32_t *iptr;
3923 4116 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3924 4117
3925 4118 ASSERT(cpi->cpi_pass == 1);
3926 4119
3927 4120 if (cpi->cpi_maxeax < 1)
3928 4121 goto pass2_done;
3929 4122
3930 4123 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3931 4124 nmax = NMAX_CPI_STD;
3932 4125 /*
3933 4126 * (We already handled n == 0 and n == 1 in pass 1)
3934 4127 */
3935 4128 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3936 4129 /*
3937 4130 * leaves 6 and 7 were handled in pass 1
3938 4131 */
3939 4132 if (n == 6 || n == 7)
3940 4133 continue;
3941 4134
3942 4135 cp->cp_eax = n;
3943 4136
3944 4137 /*
3945 4138 * CPUID function 4 expects %ecx to be initialized
3946 4139 * with an index which indicates which cache to return
3947 4140 * information about. The OS is expected to call function 4
3948 4141 * with %ecx set to 0, 1, 2, ... until it returns with
3949 4142 * EAX[4:0] set to 0, which indicates there are no more
3950 4143 * caches.
3951 4144 *
3952 4145 * Here, populate cpi_std[4] with the information returned by
3953 4146 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3954 4147 * when dynamic memory allocation becomes available.
3955 4148 *
3956 4149 * Note: we need to explicitly initialize %ecx here, since
3957 4150 * function 4 may have been previously invoked.
3958 4151 */
3959 4152 if (n == 4)
3960 4153 cp->cp_ecx = 0;
3961 4154
3962 4155 (void) __cpuid_insn(cp);
3963 4156 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3964 4157 switch (n) {
3965 4158 case 2:
3966 4159 /*
3967 4160 * "the lower 8 bits of the %eax register
3968 4161 * contain a value that identifies the number
3969 4162 * of times the cpuid [instruction] has to be
3970 4163 * executed to obtain a complete image of the
3971 4164 * processor's caching systems."
3972 4165 *
3973 4166 * How *do* they make this stuff up?
3974 4167 */
3975 4168 cpi->cpi_ncache = sizeof (*cp) *
3976 4169 BITX(cp->cp_eax, 7, 0);
3977 4170 if (cpi->cpi_ncache == 0)
3978 4171 break;
3979 4172 cpi->cpi_ncache--; /* skip count byte */
3980 4173
3981 4174 /*
3982 4175 * Well, for now, rather than attempt to implement
3983 4176 * this slightly dubious algorithm, we just look
3984 4177 * at the first 15 ..
3985 4178 */
3986 4179 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3987 4180 cpi->cpi_ncache = sizeof (*cp) - 1;
3988 4181
3989 4182 dp = cpi->cpi_cacheinfo;
3990 4183 if (BITX(cp->cp_eax, 31, 31) == 0) {
3991 4184 uint8_t *p = (void *)&cp->cp_eax;
3992 4185 for (i = 1; i < 4; i++)
3993 4186 if (p[i] != 0)
3994 4187 *dp++ = p[i];
3995 4188 }
3996 4189 if (BITX(cp->cp_ebx, 31, 31) == 0) {
3997 4190 uint8_t *p = (void *)&cp->cp_ebx;
3998 4191 for (i = 0; i < 4; i++)
3999 4192 if (p[i] != 0)
4000 4193 *dp++ = p[i];
4001 4194 }
4002 4195 if (BITX(cp->cp_ecx, 31, 31) == 0) {
4003 4196 uint8_t *p = (void *)&cp->cp_ecx;
4004 4197 for (i = 0; i < 4; i++)
4005 4198 if (p[i] != 0)
4006 4199 *dp++ = p[i];
4007 4200 }
4008 4201 if (BITX(cp->cp_edx, 31, 31) == 0) {
4009 4202 uint8_t *p = (void *)&cp->cp_edx;
4010 4203 for (i = 0; i < 4; i++)
4011 4204 if (p[i] != 0)
4012 4205 *dp++ = p[i];
4013 4206 }
4014 4207 break;
4015 4208
4016 4209 case 3: /* Processor serial number, if PSN supported */
4017 4210 break;
4018 4211
4019 4212 case 4: /* Deterministic cache parameters */
4020 4213 break;
4021 4214
4022 4215 case 5: /* Monitor/Mwait parameters */
4023 4216 {
4024 4217 size_t mwait_size;
4025 4218
4026 4219 /*
4027 4220 * check cpi_mwait.support which was set in cpuid_pass1
4028 4221 */
4029 4222 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4030 4223 break;
4031 4224
4032 4225 /*
4033 4226 * Protect ourself from insane mwait line size.
4034 4227 * Workaround for incomplete hardware emulator(s).
4035 4228 */
4036 4229 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4037 4230 if (mwait_size < sizeof (uint32_t) ||
4038 4231 !ISP2(mwait_size)) {
4039 4232 #if DEBUG
4040 4233 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4041 4234 "size %ld", cpu->cpu_id, (long)mwait_size);
4042 4235 #endif
4043 4236 break;
4044 4237 }
4045 4238
4046 4239 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4047 4240 cpi->cpi_mwait.mon_max = mwait_size;
4048 4241 if (MWAIT_EXTENSION(cpi)) {
4049 4242 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4050 4243 if (MWAIT_INT_ENABLE(cpi))
4051 4244 cpi->cpi_mwait.support |=
4052 4245 MWAIT_ECX_INT_ENABLE;
4053 4246 }
4054 4247 break;
4055 4248 }
4056 4249 default:
4057 4250 break;
4058 4251 }
4059 4252 }
4060 4253
4061 4254 /*
4062 4255 * XSAVE enumeration
4063 4256 */
4064 4257 if (cpi->cpi_maxeax >= 0xD) {
4065 4258 struct cpuid_regs regs;
4066 4259 boolean_t cpuid_d_valid = B_TRUE;
4067 4260
4068 4261 cp = ®s;
4069 4262 cp->cp_eax = 0xD;
4070 4263 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4071 4264
4072 4265 (void) __cpuid_insn(cp);
4073 4266
4074 4267 /*
4075 4268 * Sanity checks for debug
4076 4269 */
4077 4270 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4078 4271 (cp->cp_eax & XFEATURE_SSE) == 0) {
4079 4272 cpuid_d_valid = B_FALSE;
4080 4273 }
4081 4274
4082 4275 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4083 4276 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4084 4277 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4085 4278
4086 4279 /*
4087 4280 * If the hw supports AVX, get the size and offset in the save
4088 4281 * area for the ymm state.
4089 4282 */
4090 4283 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4091 4284 cp->cp_eax = 0xD;
4092 4285 cp->cp_ecx = 2;
4093 4286 cp->cp_edx = cp->cp_ebx = 0;
4094 4287
4095 4288 (void) __cpuid_insn(cp);
4096 4289
4097 4290 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4098 4291 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4099 4292 cpuid_d_valid = B_FALSE;
4100 4293 }
4101 4294
4102 4295 cpi->cpi_xsave.ymm_size = cp->cp_eax;
4103 4296 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4104 4297 }
4105 4298
4106 4299 /*
4107 4300 * If the hw supports MPX, get the size and offset in the
4108 4301 * save area for BNDREGS and BNDCSR.
4109 4302 */
4110 4303 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4111 4304 cp->cp_eax = 0xD;
4112 4305 cp->cp_ecx = 3;
4113 4306 cp->cp_edx = cp->cp_ebx = 0;
4114 4307
4115 4308 (void) __cpuid_insn(cp);
4116 4309
4117 4310 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4118 4311 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4119 4312
4120 4313 cp->cp_eax = 0xD;
4121 4314 cp->cp_ecx = 4;
4122 4315 cp->cp_edx = cp->cp_ebx = 0;
4123 4316
4124 4317 (void) __cpuid_insn(cp);
4125 4318
4126 4319 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4127 4320 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4128 4321 }
4129 4322
4130 4323 /*
4131 4324 * If the hw supports AVX512, get the size and offset in the
4132 4325 * save area for the opmask registers and zmm state.
4133 4326 */
4134 4327 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4135 4328 cp->cp_eax = 0xD;
4136 4329 cp->cp_ecx = 5;
4137 4330 cp->cp_edx = cp->cp_ebx = 0;
4138 4331
4139 4332 (void) __cpuid_insn(cp);
4140 4333
4141 4334 cpi->cpi_xsave.opmask_size = cp->cp_eax;
4142 4335 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4143 4336
4144 4337 cp->cp_eax = 0xD;
4145 4338 cp->cp_ecx = 6;
4146 4339 cp->cp_edx = cp->cp_ebx = 0;
4147 4340
4148 4341 (void) __cpuid_insn(cp);
4149 4342
4150 4343 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4151 4344 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4152 4345
4153 4346 cp->cp_eax = 0xD;
4154 4347 cp->cp_ecx = 7;
4155 4348 cp->cp_edx = cp->cp_ebx = 0;
4156 4349
4157 4350 (void) __cpuid_insn(cp);
4158 4351
4159 4352 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4160 4353 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4161 4354 }
4162 4355
4163 4356 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4164 4357 xsave_state_size = 0;
4165 4358 } else if (cpuid_d_valid) {
4166 4359 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4167 4360 } else {
4168 4361 /* Broken CPUID 0xD, probably in HVM */
4169 4362 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4170 4363 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4171 4364 ", ymm_size = %d, ymm_offset = %d\n",
4172 4365 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4173 4366 cpi->cpi_xsave.xsav_hw_features_high,
4174 4367 (int)cpi->cpi_xsave.xsav_max_size,
4175 4368 (int)cpi->cpi_xsave.ymm_size,
4176 4369 (int)cpi->cpi_xsave.ymm_offset);
4177 4370
4178 4371 if (xsave_state_size != 0) {
4179 4372 /*
4180 4373 * This must be a non-boot CPU. We cannot
4181 4374 * continue, because boot cpu has already
4182 4375 * enabled XSAVE.
4183 4376 */
4184 4377 ASSERT(cpu->cpu_id != 0);
4185 4378 cmn_err(CE_PANIC, "cpu%d: we have already "
4186 4379 "enabled XSAVE on boot cpu, cannot "
4187 4380 "continue.", cpu->cpu_id);
4188 4381 } else {
4189 4382 /*
4190 4383 * If we reached here on the boot CPU, it's also
4191 4384 * almost certain that we'll reach here on the
4192 4385 * non-boot CPUs. When we're here on a boot CPU
4193 4386 * we should disable the feature, on a non-boot
4194 4387 * CPU we need to confirm that we have.
4195 4388 */
4196 4389 if (cpu->cpu_id == 0) {
4197 4390 remove_x86_feature(x86_featureset,
4198 4391 X86FSET_XSAVE);
4199 4392 remove_x86_feature(x86_featureset,
4200 4393 X86FSET_AVX);
4201 4394 remove_x86_feature(x86_featureset,
4202 4395 X86FSET_F16C);
4203 4396 remove_x86_feature(x86_featureset,
4204 4397 X86FSET_BMI1);
4205 4398 remove_x86_feature(x86_featureset,
4206 4399 X86FSET_BMI2);
4207 4400 remove_x86_feature(x86_featureset,
4208 4401 X86FSET_FMA);
4209 4402 remove_x86_feature(x86_featureset,
4210 4403 X86FSET_AVX2);
4211 4404 remove_x86_feature(x86_featureset,
4212 4405 X86FSET_MPX);
4213 4406 remove_x86_feature(x86_featureset,
4214 4407 X86FSET_AVX512F);
4215 4408 remove_x86_feature(x86_featureset,
4216 4409 X86FSET_AVX512DQ);
4217 4410 remove_x86_feature(x86_featureset,
4218 4411 X86FSET_AVX512PF);
4219 4412 remove_x86_feature(x86_featureset,
4220 4413 X86FSET_AVX512ER);
4221 4414 remove_x86_feature(x86_featureset,
4222 4415 X86FSET_AVX512CD);
4223 4416 remove_x86_feature(x86_featureset,
4224 4417 X86FSET_AVX512BW);
4225 4418 remove_x86_feature(x86_featureset,
4226 4419 X86FSET_AVX512VL);
4227 4420 remove_x86_feature(x86_featureset,
4228 4421 X86FSET_AVX512FMA);
4229 4422 remove_x86_feature(x86_featureset,
4230 4423 X86FSET_AVX512VBMI);
4231 4424 remove_x86_feature(x86_featureset,
4232 4425 X86FSET_AVX512VNNI);
4233 4426 remove_x86_feature(x86_featureset,
4234 4427 X86FSET_AVX512VPOPCDQ);
4235 4428 remove_x86_feature(x86_featureset,
4236 4429 X86FSET_AVX512NNIW);
4237 4430 remove_x86_feature(x86_featureset,
4238 4431 X86FSET_AVX512FMAPS);
4239 4432
4240 4433 CPI_FEATURES_ECX(cpi) &=
4241 4434 ~CPUID_INTC_ECX_XSAVE;
4242 4435 CPI_FEATURES_ECX(cpi) &=
4243 4436 ~CPUID_INTC_ECX_AVX;
4244 4437 CPI_FEATURES_ECX(cpi) &=
4245 4438 ~CPUID_INTC_ECX_F16C;
4246 4439 CPI_FEATURES_ECX(cpi) &=
4247 4440 ~CPUID_INTC_ECX_FMA;
4248 4441 CPI_FEATURES_7_0_EBX(cpi) &=
4249 4442 ~CPUID_INTC_EBX_7_0_BMI1;
4250 4443 CPI_FEATURES_7_0_EBX(cpi) &=
4251 4444 ~CPUID_INTC_EBX_7_0_BMI2;
4252 4445 CPI_FEATURES_7_0_EBX(cpi) &=
4253 4446 ~CPUID_INTC_EBX_7_0_AVX2;
4254 4447 CPI_FEATURES_7_0_EBX(cpi) &=
4255 4448 ~CPUID_INTC_EBX_7_0_MPX;
4256 4449 CPI_FEATURES_7_0_EBX(cpi) &=
4257 4450 ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4258 4451
4259 4452 CPI_FEATURES_7_0_ECX(cpi) &=
4260 4453 ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4261 4454
4262 4455 CPI_FEATURES_7_0_EDX(cpi) &=
4263 4456 ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4264 4457
4265 4458 xsave_force_disable = B_TRUE;
4266 4459 } else {
4267 4460 VERIFY(is_x86_feature(x86_featureset,
4268 4461 X86FSET_XSAVE) == B_FALSE);
4269 4462 }
4270 4463 }
4271 4464 }
4272 4465 }
4273 4466
4274 4467
4275 4468 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4276 4469 goto pass2_done;
4277 4470
4278 4471 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4279 4472 nmax = NMAX_CPI_EXTD;
4280 4473 /*
4281 4474 * Copy the extended properties, fixing them as we go.
4282 4475 * (We already handled n == 0 and n == 1 in pass 1)
4283 4476 */
4284 4477 iptr = (void *)cpi->cpi_brandstr;
4285 4478 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4286 4479 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4287 4480 (void) __cpuid_insn(cp);
4288 4481 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4289 4482 cp);
4290 4483 switch (n) {
4291 4484 case 2:
4292 4485 case 3:
4293 4486 case 4:
4294 4487 /*
4295 4488 * Extract the brand string
4296 4489 */
4297 4490 *iptr++ = cp->cp_eax;
4298 4491 *iptr++ = cp->cp_ebx;
4299 4492 *iptr++ = cp->cp_ecx;
4300 4493 *iptr++ = cp->cp_edx;
4301 4494 break;
4302 4495 case 5:
4303 4496 switch (cpi->cpi_vendor) {
4304 4497 case X86_VENDOR_AMD:
4305 4498 /*
4306 4499 * The Athlon and Duron were the first
4307 4500 * parts to report the sizes of the
4308 4501 * TLB for large pages. Before then,
4309 4502 * we don't trust the data.
4310 4503 */
4311 4504 if (cpi->cpi_family < 6 ||
4312 4505 (cpi->cpi_family == 6 &&
4313 4506 cpi->cpi_model < 1))
4314 4507 cp->cp_eax = 0;
4315 4508 break;
4316 4509 default:
4317 4510 break;
4318 4511 }
4319 4512 break;
4320 4513 case 6:
4321 4514 switch (cpi->cpi_vendor) {
4322 4515 case X86_VENDOR_AMD:
4323 4516 /*
4324 4517 * The Athlon and Duron were the first
4325 4518 * AMD parts with L2 TLB's.
4326 4519 * Before then, don't trust the data.
4327 4520 */
4328 4521 if (cpi->cpi_family < 6 ||
4329 4522 cpi->cpi_family == 6 &&
4330 4523 cpi->cpi_model < 1)
4331 4524 cp->cp_eax = cp->cp_ebx = 0;
4332 4525 /*
4333 4526 * AMD Duron rev A0 reports L2
4334 4527 * cache size incorrectly as 1K
4335 4528 * when it is really 64K
4336 4529 */
4337 4530 if (cpi->cpi_family == 6 &&
4338 4531 cpi->cpi_model == 3 &&
4339 4532 cpi->cpi_step == 0) {
4340 4533 cp->cp_ecx &= 0xffff;
4341 4534 cp->cp_ecx |= 0x400000;
4342 4535 }
4343 4536 break;
4344 4537 case X86_VENDOR_Cyrix: /* VIA C3 */
4345 4538 /*
4346 4539 * VIA C3 processors are a bit messed
4347 4540 * up w.r.t. encoding cache sizes in %ecx
4348 4541 */
4349 4542 if (cpi->cpi_family != 6)
4350 4543 break;
4351 4544 /*
4352 4545 * model 7 and 8 were incorrectly encoded
4353 4546 *
4354 4547 * xxx is model 8 really broken?
4355 4548 */
4356 4549 if (cpi->cpi_model == 7 ||
4357 4550 cpi->cpi_model == 8)
4358 4551 cp->cp_ecx =
4359 4552 BITX(cp->cp_ecx, 31, 24) << 16 |
4360 4553 BITX(cp->cp_ecx, 23, 16) << 12 |
4361 4554 BITX(cp->cp_ecx, 15, 8) << 8 |
4362 4555 BITX(cp->cp_ecx, 7, 0);
4363 4556 /*
4364 4557 * model 9 stepping 1 has wrong associativity
4365 4558 */
4366 4559 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4367 4560 cp->cp_ecx |= 8 << 12;
4368 4561 break;
4369 4562 case X86_VENDOR_Intel:
4370 4563 /*
4371 4564 * Extended L2 Cache features function.
4372 4565 * First appeared on Prescott.
4373 4566 */
4374 4567 default:
4375 4568 break;
4376 4569 }
4377 4570 break;
4378 4571 default:
4379 4572 break;
4380 4573 }
4381 4574 }
4382 4575
4383 4576 pass2_done:
4384 4577 cpi->cpi_pass = 2;
4385 4578 }
4386 4579
4387 4580 static const char *
4388 4581 intel_cpubrand(const struct cpuid_info *cpi)
4389 4582 {
4390 4583 int i;
4391 4584
4392 4585 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4393 4586 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4394 4587 return ("i486");
4395 4588
4396 4589 switch (cpi->cpi_family) {
4397 4590 case 5:
4398 4591 return ("Intel Pentium(r)");
4399 4592 case 6:
4400 4593 switch (cpi->cpi_model) {
4401 4594 uint_t celeron, xeon;
4402 4595 const struct cpuid_regs *cp;
4403 4596 case 0:
4404 4597 case 1:
4405 4598 case 2:
4406 4599 return ("Intel Pentium(r) Pro");
4407 4600 case 3:
4408 4601 case 4:
4409 4602 return ("Intel Pentium(r) II");
4410 4603 case 6:
4411 4604 return ("Intel Celeron(r)");
4412 4605 case 5:
4413 4606 case 7:
4414 4607 celeron = xeon = 0;
4415 4608 cp = &cpi->cpi_std[2]; /* cache info */
4416 4609
4417 4610 for (i = 1; i < 4; i++) {
4418 4611 uint_t tmp;
4419 4612
4420 4613 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4421 4614 if (tmp == 0x40)
4422 4615 celeron++;
4423 4616 if (tmp >= 0x44 && tmp <= 0x45)
4424 4617 xeon++;
4425 4618 }
4426 4619
4427 4620 for (i = 0; i < 2; i++) {
4428 4621 uint_t tmp;
4429 4622
4430 4623 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4431 4624 if (tmp == 0x40)
4432 4625 celeron++;
4433 4626 else if (tmp >= 0x44 && tmp <= 0x45)
4434 4627 xeon++;
4435 4628 }
4436 4629
4437 4630 for (i = 0; i < 4; i++) {
4438 4631 uint_t tmp;
4439 4632
4440 4633 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4441 4634 if (tmp == 0x40)
4442 4635 celeron++;
4443 4636 else if (tmp >= 0x44 && tmp <= 0x45)
4444 4637 xeon++;
4445 4638 }
4446 4639
4447 4640 for (i = 0; i < 4; i++) {
4448 4641 uint_t tmp;
4449 4642
4450 4643 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4451 4644 if (tmp == 0x40)
4452 4645 celeron++;
4453 4646 else if (tmp >= 0x44 && tmp <= 0x45)
4454 4647 xeon++;
4455 4648 }
4456 4649
4457 4650 if (celeron)
4458 4651 return ("Intel Celeron(r)");
4459 4652 if (xeon)
4460 4653 return (cpi->cpi_model == 5 ?
4461 4654 "Intel Pentium(r) II Xeon(tm)" :
4462 4655 "Intel Pentium(r) III Xeon(tm)");
4463 4656 return (cpi->cpi_model == 5 ?
4464 4657 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4465 4658 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4466 4659 default:
4467 4660 break;
4468 4661 }
4469 4662 default:
4470 4663 break;
4471 4664 }
4472 4665
4473 4666 /* BrandID is present if the field is nonzero */
4474 4667 if (cpi->cpi_brandid != 0) {
4475 4668 static const struct {
4476 4669 uint_t bt_bid;
4477 4670 const char *bt_str;
4478 4671 } brand_tbl[] = {
4479 4672 { 0x1, "Intel(r) Celeron(r)" },
4480 4673 { 0x2, "Intel(r) Pentium(r) III" },
4481 4674 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
4482 4675 { 0x4, "Intel(r) Pentium(r) III" },
4483 4676 { 0x6, "Mobile Intel(r) Pentium(r) III" },
4484 4677 { 0x7, "Mobile Intel(r) Celeron(r)" },
4485 4678 { 0x8, "Intel(r) Pentium(r) 4" },
4486 4679 { 0x9, "Intel(r) Pentium(r) 4" },
4487 4680 { 0xa, "Intel(r) Celeron(r)" },
4488 4681 { 0xb, "Intel(r) Xeon(tm)" },
4489 4682 { 0xc, "Intel(r) Xeon(tm) MP" },
4490 4683 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
4491 4684 { 0xf, "Mobile Intel(r) Celeron(r)" },
4492 4685 { 0x11, "Mobile Genuine Intel(r)" },
4493 4686 { 0x12, "Intel(r) Celeron(r) M" },
4494 4687 { 0x13, "Mobile Intel(r) Celeron(r)" },
4495 4688 { 0x14, "Intel(r) Celeron(r)" },
4496 4689 { 0x15, "Mobile Genuine Intel(r)" },
4497 4690 { 0x16, "Intel(r) Pentium(r) M" },
4498 4691 { 0x17, "Mobile Intel(r) Celeron(r)" }
4499 4692 };
4500 4693 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4501 4694 uint_t sgn;
4502 4695
4503 4696 sgn = (cpi->cpi_family << 8) |
4504 4697 (cpi->cpi_model << 4) | cpi->cpi_step;
4505 4698
4506 4699 for (i = 0; i < btblmax; i++)
4507 4700 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4508 4701 break;
4509 4702 if (i < btblmax) {
4510 4703 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4511 4704 return ("Intel(r) Celeron(r)");
4512 4705 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4513 4706 return ("Intel(r) Xeon(tm) MP");
4514 4707 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4515 4708 return ("Intel(r) Xeon(tm)");
4516 4709 return (brand_tbl[i].bt_str);
4517 4710 }
4518 4711 }
4519 4712
4520 4713 return (NULL);
4521 4714 }
4522 4715
4523 4716 static const char *
4524 4717 amd_cpubrand(const struct cpuid_info *cpi)
4525 4718 {
4526 4719 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4527 4720 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4528 4721 return ("i486 compatible");
4529 4722
4530 4723 switch (cpi->cpi_family) {
4531 4724 case 5:
4532 4725 switch (cpi->cpi_model) {
4533 4726 case 0:
4534 4727 case 1:
4535 4728 case 2:
4536 4729 case 3:
4537 4730 case 4:
4538 4731 case 5:
4539 4732 return ("AMD-K5(r)");
4540 4733 case 6:
4541 4734 case 7:
4542 4735 return ("AMD-K6(r)");
4543 4736 case 8:
4544 4737 return ("AMD-K6(r)-2");
4545 4738 case 9:
4546 4739 return ("AMD-K6(r)-III");
4547 4740 default:
4548 4741 return ("AMD (family 5)");
4549 4742 }
4550 4743 case 6:
4551 4744 switch (cpi->cpi_model) {
4552 4745 case 1:
4553 4746 return ("AMD-K7(tm)");
4554 4747 case 0:
4555 4748 case 2:
4556 4749 case 4:
4557 4750 return ("AMD Athlon(tm)");
4558 4751 case 3:
4559 4752 case 7:
4560 4753 return ("AMD Duron(tm)");
4561 4754 case 6:
4562 4755 case 8:
4563 4756 case 10:
4564 4757 /*
4565 4758 * Use the L2 cache size to distinguish
4566 4759 */
4567 4760 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4568 4761 "AMD Athlon(tm)" : "AMD Duron(tm)");
4569 4762 default:
4570 4763 return ("AMD (family 6)");
4571 4764 }
4572 4765 default:
4573 4766 break;
4574 4767 }
4575 4768
4576 4769 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4577 4770 cpi->cpi_brandid != 0) {
4578 4771 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4579 4772 case 3:
4580 4773 return ("AMD Opteron(tm) UP 1xx");
4581 4774 case 4:
4582 4775 return ("AMD Opteron(tm) DP 2xx");
4583 4776 case 5:
4584 4777 return ("AMD Opteron(tm) MP 8xx");
4585 4778 default:
4586 4779 return ("AMD Opteron(tm)");
4587 4780 }
4588 4781 }
4589 4782
4590 4783 return (NULL);
4591 4784 }
4592 4785
4593 4786 static const char *
4594 4787 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4595 4788 {
4596 4789 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4597 4790 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4598 4791 type == X86_TYPE_CYRIX_486)
4599 4792 return ("i486 compatible");
4600 4793
4601 4794 switch (type) {
4602 4795 case X86_TYPE_CYRIX_6x86:
4603 4796 return ("Cyrix 6x86");
4604 4797 case X86_TYPE_CYRIX_6x86L:
4605 4798 return ("Cyrix 6x86L");
4606 4799 case X86_TYPE_CYRIX_6x86MX:
4607 4800 return ("Cyrix 6x86MX");
4608 4801 case X86_TYPE_CYRIX_GXm:
4609 4802 return ("Cyrix GXm");
4610 4803 case X86_TYPE_CYRIX_MediaGX:
4611 4804 return ("Cyrix MediaGX");
4612 4805 case X86_TYPE_CYRIX_MII:
4613 4806 return ("Cyrix M2");
4614 4807 case X86_TYPE_VIA_CYRIX_III:
4615 4808 return ("VIA Cyrix M3");
4616 4809 default:
4617 4810 /*
4618 4811 * Have another wild guess ..
4619 4812 */
4620 4813 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4621 4814 return ("Cyrix 5x86");
4622 4815 else if (cpi->cpi_family == 5) {
4623 4816 switch (cpi->cpi_model) {
4624 4817 case 2:
4625 4818 return ("Cyrix 6x86"); /* Cyrix M1 */
4626 4819 case 4:
4627 4820 return ("Cyrix MediaGX");
4628 4821 default:
4629 4822 break;
4630 4823 }
4631 4824 } else if (cpi->cpi_family == 6) {
4632 4825 switch (cpi->cpi_model) {
4633 4826 case 0:
4634 4827 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4635 4828 case 5:
4636 4829 case 6:
4637 4830 case 7:
4638 4831 case 8:
4639 4832 case 9:
4640 4833 return ("VIA C3");
4641 4834 default:
4642 4835 break;
4643 4836 }
4644 4837 }
4645 4838 break;
4646 4839 }
4647 4840 return (NULL);
4648 4841 }
4649 4842
4650 4843 /*
4651 4844 * This only gets called in the case that the CPU extended
4652 4845 * feature brand string (0x80000002, 0x80000003, 0x80000004)
4653 4846 * aren't available, or contain null bytes for some reason.
4654 4847 */
4655 4848 static void
4656 4849 fabricate_brandstr(struct cpuid_info *cpi)
4657 4850 {
4658 4851 const char *brand = NULL;
4659 4852
4660 4853 switch (cpi->cpi_vendor) {
4661 4854 case X86_VENDOR_Intel:
4662 4855 brand = intel_cpubrand(cpi);
4663 4856 break;
4664 4857 case X86_VENDOR_AMD:
4665 4858 brand = amd_cpubrand(cpi);
4666 4859 break;
4667 4860 case X86_VENDOR_Cyrix:
4668 4861 brand = cyrix_cpubrand(cpi, x86_type);
4669 4862 break;
4670 4863 case X86_VENDOR_NexGen:
4671 4864 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4672 4865 brand = "NexGen Nx586";
4673 4866 break;
4674 4867 case X86_VENDOR_Centaur:
4675 4868 if (cpi->cpi_family == 5)
4676 4869 switch (cpi->cpi_model) {
4677 4870 case 4:
4678 4871 brand = "Centaur C6";
4679 4872 break;
4680 4873 case 8:
4681 4874 brand = "Centaur C2";
4682 4875 break;
4683 4876 case 9:
4684 4877 brand = "Centaur C3";
4685 4878 break;
4686 4879 default:
4687 4880 break;
4688 4881 }
4689 4882 break;
4690 4883 case X86_VENDOR_Rise:
4691 4884 if (cpi->cpi_family == 5 &&
4692 4885 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4693 4886 brand = "Rise mP6";
4694 4887 break;
4695 4888 case X86_VENDOR_SiS:
4696 4889 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4697 4890 brand = "SiS 55x";
4698 4891 break;
4699 4892 case X86_VENDOR_TM:
4700 4893 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4701 4894 brand = "Transmeta Crusoe TM3x00 or TM5x00";
4702 4895 break;
4703 4896 case X86_VENDOR_NSC:
4704 4897 case X86_VENDOR_UMC:
4705 4898 default:
4706 4899 break;
4707 4900 }
4708 4901 if (brand) {
4709 4902 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4710 4903 return;
4711 4904 }
4712 4905
4713 4906 /*
4714 4907 * If all else fails ...
4715 4908 */
4716 4909 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4717 4910 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4718 4911 cpi->cpi_model, cpi->cpi_step);
4719 4912 }
4720 4913
4721 4914 /*
4722 4915 * This routine is called just after kernel memory allocation
4723 4916 * becomes available on cpu0, and as part of mp_startup() on
4724 4917 * the other cpus.
4725 4918 *
4726 4919 * Fixup the brand string, and collect any information from cpuid
4727 4920 * that requires dynamically allocated storage to represent.
4728 4921 */
4729 4922 /*ARGSUSED*/
4730 4923 void
4731 4924 cpuid_pass3(cpu_t *cpu)
4732 4925 {
4733 4926 int i, max, shft, level, size;
4734 4927 struct cpuid_regs regs;
4735 4928 struct cpuid_regs *cp;
4736 4929 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4737 4930
4738 4931 ASSERT(cpi->cpi_pass == 2);
4739 4932
4740 4933 /*
4741 4934 * Deterministic cache parameters
4742 4935 *
4743 4936 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4744 4937 * values that are present are currently defined to be the same. This
4745 4938 * means we can use the same logic to parse it as long as we use the
4746 4939 * appropriate leaf to get the data. If you're updating this, make sure
4747 4940 * you're careful about which vendor supports which aspect.
4748 4941 *
4749 4942 * Take this opportunity to detect the number of threads sharing the
4750 4943 * last level cache, and construct a corresponding cache id. The
4751 4944 * respective cpuid_info members are initialized to the default case of
4752 4945 * "no last level cache sharing".
4753 4946 */
4754 4947 cpi->cpi_ncpu_shr_last_cache = 1;
4755 4948 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4756 4949
4757 4950 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4758 4951 (cpi->cpi_vendor == X86_VENDOR_AMD &&
4759 4952 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4760 4953 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4761 4954 uint32_t leaf;
4762 4955
4763 4956 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4764 4957 leaf = 4;
4765 4958 } else {
4766 4959 leaf = CPUID_LEAF_EXT_1d;
4767 4960 }
4768 4961
4769 4962 /*
4770 4963 * Find the # of elements (size) returned by the leaf and along
4771 4964 * the way detect last level cache sharing details.
4772 4965 */
4773 4966 bzero(®s, sizeof (regs));
4774 4967 cp = ®s;
4775 4968 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4776 4969 cp->cp_eax = leaf;
4777 4970 cp->cp_ecx = i;
4778 4971
4779 4972 (void) __cpuid_insn(cp);
4780 4973
4781 4974 if (CPI_CACHE_TYPE(cp) == 0)
4782 4975 break;
4783 4976 level = CPI_CACHE_LVL(cp);
4784 4977 if (level > max) {
4785 4978 max = level;
4786 4979 cpi->cpi_ncpu_shr_last_cache =
4787 4980 CPI_NTHR_SHR_CACHE(cp) + 1;
4788 4981 }
4789 4982 }
4790 4983 cpi->cpi_cache_leaf_size = size = i;
4791 4984
4792 4985 /*
4793 4986 * Allocate the cpi_cache_leaves array. The first element
4794 4987 * references the regs for the corresponding leaf with %ecx set
4795 4988 * to 0. This was gathered in cpuid_pass2().
4796 4989 */
4797 4990 if (size > 0) {
4798 4991 cpi->cpi_cache_leaves =
4799 4992 kmem_alloc(size * sizeof (cp), KM_SLEEP);
4800 4993 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4801 4994 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4802 4995 } else {
4803 4996 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4804 4997 }
4805 4998
4806 4999 /*
4807 5000 * Allocate storage to hold the additional regs
4808 5001 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4809 5002 *
4810 5003 * The regs for the leaf, %ecx == 0 has already
4811 5004 * been allocated as indicated above.
4812 5005 */
4813 5006 for (i = 1; i < size; i++) {
4814 5007 cp = cpi->cpi_cache_leaves[i] =
4815 5008 kmem_zalloc(sizeof (regs), KM_SLEEP);
4816 5009 cp->cp_eax = leaf;
4817 5010 cp->cp_ecx = i;
4818 5011
4819 5012 (void) __cpuid_insn(cp);
4820 5013 }
4821 5014 }
4822 5015 /*
4823 5016 * Determine the number of bits needed to represent
4824 5017 * the number of CPUs sharing the last level cache.
4825 5018 *
4826 5019 * Shift off that number of bits from the APIC id to
4827 5020 * derive the cache id.
4828 5021 */
4829 5022 shft = 0;
4830 5023 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4831 5024 shft++;
4832 5025 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4833 5026 }
4834 5027
4835 5028 /*
4836 5029 * Now fixup the brand string
4837 5030 */
4838 5031 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4839 5032 fabricate_brandstr(cpi);
4840 5033 } else {
4841 5034
4842 5035 /*
4843 5036 * If we successfully extracted a brand string from the cpuid
4844 5037 * instruction, clean it up by removing leading spaces and
4845 5038 * similar junk.
4846 5039 */
4847 5040 if (cpi->cpi_brandstr[0]) {
4848 5041 size_t maxlen = sizeof (cpi->cpi_brandstr);
4849 5042 char *src, *dst;
4850 5043
4851 5044 dst = src = (char *)cpi->cpi_brandstr;
4852 5045 src[maxlen - 1] = '\0';
4853 5046 /*
4854 5047 * strip leading spaces
4855 5048 */
4856 5049 while (*src == ' ')
4857 5050 src++;
4858 5051 /*
4859 5052 * Remove any 'Genuine' or "Authentic" prefixes
4860 5053 */
4861 5054 if (strncmp(src, "Genuine ", 8) == 0)
4862 5055 src += 8;
4863 5056 if (strncmp(src, "Authentic ", 10) == 0)
4864 5057 src += 10;
4865 5058
4866 5059 /*
4867 5060 * Now do an in-place copy.
4868 5061 * Map (R) to (r) and (TM) to (tm).
4869 5062 * The era of teletypes is long gone, and there's
4870 5063 * -really- no need to shout.
4871 5064 */
4872 5065 while (*src != '\0') {
4873 5066 if (src[0] == '(') {
4874 5067 if (strncmp(src + 1, "R)", 2) == 0) {
4875 5068 (void) strncpy(dst, "(r)", 3);
4876 5069 src += 3;
4877 5070 dst += 3;
4878 5071 continue;
4879 5072 }
4880 5073 if (strncmp(src + 1, "TM)", 3) == 0) {
4881 5074 (void) strncpy(dst, "(tm)", 4);
4882 5075 src += 4;
4883 5076 dst += 4;
4884 5077 continue;
4885 5078 }
4886 5079 }
4887 5080 *dst++ = *src++;
4888 5081 }
4889 5082 *dst = '\0';
4890 5083
4891 5084 /*
4892 5085 * Finally, remove any trailing spaces
4893 5086 */
4894 5087 while (--dst > cpi->cpi_brandstr)
4895 5088 if (*dst == ' ')
4896 5089 *dst = '\0';
4897 5090 else
4898 5091 break;
4899 5092 } else
4900 5093 fabricate_brandstr(cpi);
4901 5094 }
4902 5095 cpi->cpi_pass = 3;
4903 5096 }
4904 5097
4905 5098 /*
4906 5099 * This routine is called out of bind_hwcap() much later in the life
4907 5100 * of the kernel (post_startup()). The job of this routine is to resolve
4908 5101 * the hardware feature support and kernel support for those features into
4909 5102 * what we're actually going to tell applications via the aux vector.
4910 5103 */
4911 5104 void
4912 5105 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4913 5106 {
4914 5107 struct cpuid_info *cpi;
4915 5108 uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4916 5109
4917 5110 if (cpu == NULL)
4918 5111 cpu = CPU;
4919 5112 cpi = cpu->cpu_m.mcpu_cpi;
4920 5113
4921 5114 ASSERT(cpi->cpi_pass == 3);
4922 5115
4923 5116 if (cpi->cpi_maxeax >= 1) {
4924 5117 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4925 5118 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4926 5119 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4927 5120
4928 5121 *edx = CPI_FEATURES_EDX(cpi);
4929 5122 *ecx = CPI_FEATURES_ECX(cpi);
4930 5123 *ebx = CPI_FEATURES_7_0_EBX(cpi);
4931 5124
4932 5125 /*
4933 5126 * [these require explicit kernel support]
4934 5127 */
4935 5128 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4936 5129 *edx &= ~CPUID_INTC_EDX_SEP;
4937 5130
4938 5131 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4939 5132 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4940 5133 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4941 5134 *edx &= ~CPUID_INTC_EDX_SSE2;
4942 5135
4943 5136 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4944 5137 *edx &= ~CPUID_INTC_EDX_HTT;
4945 5138
4946 5139 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4947 5140 *ecx &= ~CPUID_INTC_ECX_SSE3;
4948 5141
4949 5142 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4950 5143 *ecx &= ~CPUID_INTC_ECX_SSSE3;
4951 5144 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4952 5145 *ecx &= ~CPUID_INTC_ECX_SSE4_1;
4953 5146 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4954 5147 *ecx &= ~CPUID_INTC_ECX_SSE4_2;
4955 5148 if (!is_x86_feature(x86_featureset, X86FSET_AES))
4956 5149 *ecx &= ~CPUID_INTC_ECX_AES;
4957 5150 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4958 5151 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4959 5152 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4960 5153 *ecx &= ~(CPUID_INTC_ECX_XSAVE |
4961 5154 CPUID_INTC_ECX_OSXSAVE);
4962 5155 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4963 5156 *ecx &= ~CPUID_INTC_ECX_AVX;
4964 5157 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4965 5158 *ecx &= ~CPUID_INTC_ECX_F16C;
4966 5159 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4967 5160 *ecx &= ~CPUID_INTC_ECX_FMA;
4968 5161 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4969 5162 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4970 5163 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4971 5164 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4972 5165 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4973 5166 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4974 5167 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4975 5168 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4976 5169 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4977 5170 *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4978 5171
4979 5172 /*
4980 5173 * [no explicit support required beyond x87 fp context]
4981 5174 */
4982 5175 if (!fpu_exists)
4983 5176 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4984 5177
4985 5178 /*
4986 5179 * Now map the supported feature vector to things that we
4987 5180 * think userland will care about.
4988 5181 */
4989 5182 if (*edx & CPUID_INTC_EDX_SEP)
4990 5183 hwcap_flags |= AV_386_SEP;
4991 5184 if (*edx & CPUID_INTC_EDX_SSE)
4992 5185 hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4993 5186 if (*edx & CPUID_INTC_EDX_SSE2)
4994 5187 hwcap_flags |= AV_386_SSE2;
4995 5188 if (*ecx & CPUID_INTC_ECX_SSE3)
4996 5189 hwcap_flags |= AV_386_SSE3;
4997 5190 if (*ecx & CPUID_INTC_ECX_SSSE3)
4998 5191 hwcap_flags |= AV_386_SSSE3;
4999 5192 if (*ecx & CPUID_INTC_ECX_SSE4_1)
5000 5193 hwcap_flags |= AV_386_SSE4_1;
5001 5194 if (*ecx & CPUID_INTC_ECX_SSE4_2)
5002 5195 hwcap_flags |= AV_386_SSE4_2;
5003 5196 if (*ecx & CPUID_INTC_ECX_MOVBE)
5004 5197 hwcap_flags |= AV_386_MOVBE;
5005 5198 if (*ecx & CPUID_INTC_ECX_AES)
5006 5199 hwcap_flags |= AV_386_AES;
5007 5200 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5008 5201 hwcap_flags |= AV_386_PCLMULQDQ;
5009 5202 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5010 5203 (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5011 5204 hwcap_flags |= AV_386_XSAVE;
5012 5205
5013 5206 if (*ecx & CPUID_INTC_ECX_AVX) {
5014 5207 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5015 5208 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5016 5209
5017 5210 hwcap_flags |= AV_386_AVX;
5018 5211 if (*ecx & CPUID_INTC_ECX_F16C)
5019 5212 hwcap_flags_2 |= AV_386_2_F16C;
5020 5213 if (*ecx & CPUID_INTC_ECX_FMA)
5021 5214 hwcap_flags_2 |= AV_386_2_FMA;
5022 5215
5023 5216 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5024 5217 hwcap_flags_2 |= AV_386_2_BMI1;
5025 5218 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5026 5219 hwcap_flags_2 |= AV_386_2_BMI2;
5027 5220 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5028 5221 hwcap_flags_2 |= AV_386_2_AVX2;
5029 5222 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5030 5223 hwcap_flags_2 |= AV_386_2_AVX512F;
5031 5224 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5032 5225 hwcap_flags_2 |= AV_386_2_AVX512DQ;
5033 5226 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5034 5227 hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5035 5228 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5036 5229 hwcap_flags_2 |= AV_386_2_AVX512PF;
5037 5230 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5038 5231 hwcap_flags_2 |= AV_386_2_AVX512ER;
5039 5232 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5040 5233 hwcap_flags_2 |= AV_386_2_AVX512CD;
5041 5234 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5042 5235 hwcap_flags_2 |= AV_386_2_AVX512BW;
5043 5236 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5044 5237 hwcap_flags_2 |= AV_386_2_AVX512VL;
5045 5238
5046 5239 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5047 5240 hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5048 5241 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5049 5242 hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5050 5243 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5051 5244 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5052 5245
5053 5246 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5054 5247 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5055 5248 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5056 5249 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5057 5250 }
5058 5251 }
5059 5252 if (*ecx & CPUID_INTC_ECX_VMX)
5060 5253 hwcap_flags |= AV_386_VMX;
5061 5254 if (*ecx & CPUID_INTC_ECX_POPCNT)
5062 5255 hwcap_flags |= AV_386_POPCNT;
5063 5256 if (*edx & CPUID_INTC_EDX_FPU)
5064 5257 hwcap_flags |= AV_386_FPU;
5065 5258 if (*edx & CPUID_INTC_EDX_MMX)
5066 5259 hwcap_flags |= AV_386_MMX;
5067 5260
5068 5261 if (*edx & CPUID_INTC_EDX_TSC)
5069 5262 hwcap_flags |= AV_386_TSC;
5070 5263 if (*edx & CPUID_INTC_EDX_CX8)
5071 5264 hwcap_flags |= AV_386_CX8;
5072 5265 if (*edx & CPUID_INTC_EDX_CMOV)
5073 5266 hwcap_flags |= AV_386_CMOV;
5074 5267 if (*ecx & CPUID_INTC_ECX_CX16)
5075 5268 hwcap_flags |= AV_386_CX16;
5076 5269
5077 5270 if (*ecx & CPUID_INTC_ECX_RDRAND)
5078 5271 hwcap_flags_2 |= AV_386_2_RDRAND;
5079 5272 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5080 5273 hwcap_flags_2 |= AV_386_2_ADX;
5081 5274 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5082 5275 hwcap_flags_2 |= AV_386_2_RDSEED;
5083 5276 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5084 5277 hwcap_flags_2 |= AV_386_2_SHA;
5085 5278 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5086 5279 hwcap_flags_2 |= AV_386_2_FSGSBASE;
5087 5280 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5088 5281 hwcap_flags_2 |= AV_386_2_CLWB;
5089 5282 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5090 5283 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5091 5284
5092 5285 }
5093 5286 /*
5094 5287 * Check a few miscilaneous features.
5095 5288 */
5096 5289 if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5097 5290 hwcap_flags_2 |= AV_386_2_CLZERO;
5098 5291
5099 5292 if (cpi->cpi_xmaxeax < 0x80000001)
5100 5293 goto pass4_done;
5101 5294
5102 5295 switch (cpi->cpi_vendor) {
5103 5296 struct cpuid_regs cp;
5104 5297 uint32_t *edx, *ecx;
5105 5298
5106 5299 case X86_VENDOR_Intel:
5107 5300 /*
5108 5301 * Seems like Intel duplicated what we necessary
5109 5302 * here to make the initial crop of 64-bit OS's work.
5110 5303 * Hopefully, those are the only "extended" bits
5111 5304 * they'll add.
5112 5305 */
5113 5306 /*FALLTHROUGH*/
5114 5307
5115 5308 case X86_VENDOR_AMD:
5116 5309 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5117 5310 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5118 5311
5119 5312 *edx = CPI_FEATURES_XTD_EDX(cpi);
5120 5313 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5121 5314
5122 5315 /*
5123 5316 * [these features require explicit kernel support]
5124 5317 */
5125 5318 switch (cpi->cpi_vendor) {
5126 5319 case X86_VENDOR_Intel:
5127 5320 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5128 5321 *edx &= ~CPUID_AMD_EDX_TSCP;
5129 5322 break;
5130 5323
5131 5324 case X86_VENDOR_AMD:
5132 5325 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5133 5326 *edx &= ~CPUID_AMD_EDX_TSCP;
5134 5327 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5135 5328 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5136 5329 break;
5137 5330
5138 5331 default:
5139 5332 break;
5140 5333 }
5141 5334
5142 5335 /*
5143 5336 * [no explicit support required beyond
5144 5337 * x87 fp context and exception handlers]
5145 5338 */
5146 5339 if (!fpu_exists)
5147 5340 *edx &= ~(CPUID_AMD_EDX_MMXamd |
5148 5341 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5149 5342
5150 5343 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5151 5344 *edx &= ~CPUID_AMD_EDX_NX;
5152 5345 #if !defined(__amd64)
5153 5346 *edx &= ~CPUID_AMD_EDX_LM;
5154 5347 #endif
5155 5348 /*
5156 5349 * Now map the supported feature vector to
5157 5350 * things that we think userland will care about.
5158 5351 */
5159 5352 #if defined(__amd64)
5160 5353 if (*edx & CPUID_AMD_EDX_SYSC)
5161 5354 hwcap_flags |= AV_386_AMD_SYSC;
5162 5355 #endif
5163 5356 if (*edx & CPUID_AMD_EDX_MMXamd)
5164 5357 hwcap_flags |= AV_386_AMD_MMX;
5165 5358 if (*edx & CPUID_AMD_EDX_3DNow)
5166 5359 hwcap_flags |= AV_386_AMD_3DNow;
5167 5360 if (*edx & CPUID_AMD_EDX_3DNowx)
5168 5361 hwcap_flags |= AV_386_AMD_3DNowx;
5169 5362 if (*ecx & CPUID_AMD_ECX_SVM)
5170 5363 hwcap_flags |= AV_386_AMD_SVM;
5171 5364
5172 5365 switch (cpi->cpi_vendor) {
5173 5366 case X86_VENDOR_AMD:
5174 5367 if (*edx & CPUID_AMD_EDX_TSCP)
5175 5368 hwcap_flags |= AV_386_TSCP;
5176 5369 if (*ecx & CPUID_AMD_ECX_AHF64)
5177 5370 hwcap_flags |= AV_386_AHF;
5178 5371 if (*ecx & CPUID_AMD_ECX_SSE4A)
5179 5372 hwcap_flags |= AV_386_AMD_SSE4A;
5180 5373 if (*ecx & CPUID_AMD_ECX_LZCNT)
5181 5374 hwcap_flags |= AV_386_AMD_LZCNT;
5182 5375 if (*ecx & CPUID_AMD_ECX_MONITORX)
5183 5376 hwcap_flags_2 |= AV_386_2_MONITORX;
5184 5377 break;
5185 5378
5186 5379 case X86_VENDOR_Intel:
5187 5380 if (*edx & CPUID_AMD_EDX_TSCP)
5188 5381 hwcap_flags |= AV_386_TSCP;
5189 5382 if (*ecx & CPUID_AMD_ECX_LZCNT)
5190 5383 hwcap_flags |= AV_386_AMD_LZCNT;
5191 5384 /*
5192 5385 * Aarrgh.
5193 5386 * Intel uses a different bit in the same word.
5194 5387 */
5195 5388 if (*ecx & CPUID_INTC_ECX_AHF64)
5196 5389 hwcap_flags |= AV_386_AHF;
5197 5390 break;
5198 5391
5199 5392 default:
5200 5393 break;
5201 5394 }
5202 5395 break;
5203 5396
5204 5397 case X86_VENDOR_TM:
5205 5398 cp.cp_eax = 0x80860001;
5206 5399 (void) __cpuid_insn(&cp);
5207 5400 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5208 5401 break;
5209 5402
5210 5403 default:
5211 5404 break;
5212 5405 }
5213 5406
5214 5407 pass4_done:
5215 5408 cpi->cpi_pass = 4;
5216 5409 if (hwcap_out != NULL) {
5217 5410 hwcap_out[0] = hwcap_flags;
5218 5411 hwcap_out[1] = hwcap_flags_2;
5219 5412 }
5220 5413 }
5221 5414
5222 5415
5223 5416 /*
5224 5417 * Simulate the cpuid instruction using the data we previously
5225 5418 * captured about this CPU. We try our best to return the truth
5226 5419 * about the hardware, independently of kernel support.
5227 5420 */
5228 5421 uint32_t
5229 5422 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5230 5423 {
5231 5424 struct cpuid_info *cpi;
5232 5425 struct cpuid_regs *xcp;
5233 5426
5234 5427 if (cpu == NULL)
5235 5428 cpu = CPU;
5236 5429 cpi = cpu->cpu_m.mcpu_cpi;
5237 5430
5238 5431 ASSERT(cpuid_checkpass(cpu, 3));
5239 5432
5240 5433 /*
5241 5434 * CPUID data is cached in two separate places: cpi_std for standard
5242 5435 * CPUID leaves , and cpi_extd for extended CPUID leaves.
5243 5436 */
5244 5437 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5245 5438 xcp = &cpi->cpi_std[cp->cp_eax];
5246 5439 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5247 5440 cp->cp_eax <= cpi->cpi_xmaxeax &&
5248 5441 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5249 5442 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5250 5443 } else {
5251 5444 /*
5252 5445 * The caller is asking for data from an input parameter which
5253 5446 * the kernel has not cached. In this case we go fetch from
5254 5447 * the hardware and return the data directly to the user.
5255 5448 */
5256 5449 return (__cpuid_insn(cp));
5257 5450 }
5258 5451
5259 5452 cp->cp_eax = xcp->cp_eax;
5260 5453 cp->cp_ebx = xcp->cp_ebx;
5261 5454 cp->cp_ecx = xcp->cp_ecx;
5262 5455 cp->cp_edx = xcp->cp_edx;
5263 5456 return (cp->cp_eax);
5264 5457 }
5265 5458
5266 5459 int
5267 5460 cpuid_checkpass(cpu_t *cpu, int pass)
5268 5461 {
5269 5462 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5270 5463 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5271 5464 }
5272 5465
5273 5466 int
5274 5467 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5275 5468 {
5276 5469 ASSERT(cpuid_checkpass(cpu, 3));
5277 5470
5278 5471 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5279 5472 }
5280 5473
5281 5474 int
5282 5475 cpuid_is_cmt(cpu_t *cpu)
5283 5476 {
5284 5477 if (cpu == NULL)
5285 5478 cpu = CPU;
5286 5479
5287 5480 ASSERT(cpuid_checkpass(cpu, 1));
5288 5481
5289 5482 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5290 5483 }
5291 5484
5292 5485 /*
5293 5486 * AMD and Intel both implement the 64-bit variant of the syscall
5294 5487 * instruction (syscallq), so if there's -any- support for syscall,
5295 5488 * cpuid currently says "yes, we support this".
5296 5489 *
5297 5490 * However, Intel decided to -not- implement the 32-bit variant of the
5298 5491 * syscall instruction, so we provide a predicate to allow our caller
5299 5492 * to test that subtlety here.
5300 5493 *
5301 5494 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5302 5495 * even in the case where the hardware would in fact support it.
5303 5496 */
5304 5497 /*ARGSUSED*/
5305 5498 int
5306 5499 cpuid_syscall32_insn(cpu_t *cpu)
5307 5500 {
5308 5501 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5309 5502
5310 5503 #if !defined(__xpv)
5311 5504 if (cpu == NULL)
5312 5505 cpu = CPU;
5313 5506
5314 5507 /*CSTYLED*/
5315 5508 {
5316 5509 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5317 5510
5318 5511 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5319 5512 cpi->cpi_xmaxeax >= 0x80000001 &&
5320 5513 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5321 5514 return (1);
5322 5515 }
5323 5516 #endif
5324 5517 return (0);
5325 5518 }
5326 5519
5327 5520 int
5328 5521 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5329 5522 {
5330 5523 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5331 5524
5332 5525 static const char fmt[] =
5333 5526 "x86 (%s %X family %d model %d step %d clock %d MHz)";
5334 5527 static const char fmt_ht[] =
5335 5528 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5336 5529
5337 5530 ASSERT(cpuid_checkpass(cpu, 1));
5338 5531
5339 5532 if (cpuid_is_cmt(cpu))
5340 5533 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5341 5534 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5342 5535 cpi->cpi_family, cpi->cpi_model,
5343 5536 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5344 5537 return (snprintf(s, n, fmt,
5345 5538 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5346 5539 cpi->cpi_family, cpi->cpi_model,
5347 5540 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5348 5541 }
5349 5542
5350 5543 const char *
5351 5544 cpuid_getvendorstr(cpu_t *cpu)
5352 5545 {
5353 5546 ASSERT(cpuid_checkpass(cpu, 1));
5354 5547 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5355 5548 }
5356 5549
5357 5550 uint_t
5358 5551 cpuid_getvendor(cpu_t *cpu)
5359 5552 {
5360 5553 ASSERT(cpuid_checkpass(cpu, 1));
5361 5554 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5362 5555 }
5363 5556
5364 5557 uint_t
5365 5558 cpuid_getfamily(cpu_t *cpu)
5366 5559 {
5367 5560 ASSERT(cpuid_checkpass(cpu, 1));
5368 5561 return (cpu->cpu_m.mcpu_cpi->cpi_family);
5369 5562 }
5370 5563
5371 5564 uint_t
5372 5565 cpuid_getmodel(cpu_t *cpu)
5373 5566 {
5374 5567 ASSERT(cpuid_checkpass(cpu, 1));
5375 5568 return (cpu->cpu_m.mcpu_cpi->cpi_model);
5376 5569 }
5377 5570
5378 5571 uint_t
5379 5572 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5380 5573 {
5381 5574 ASSERT(cpuid_checkpass(cpu, 1));
5382 5575 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5383 5576 }
5384 5577
5385 5578 uint_t
5386 5579 cpuid_get_ncore_per_chip(cpu_t *cpu)
5387 5580 {
5388 5581 ASSERT(cpuid_checkpass(cpu, 1));
5389 5582 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5390 5583 }
5391 5584
5392 5585 uint_t
5393 5586 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5394 5587 {
5395 5588 ASSERT(cpuid_checkpass(cpu, 2));
5396 5589 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5397 5590 }
5398 5591
5399 5592 id_t
5400 5593 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5401 5594 {
5402 5595 ASSERT(cpuid_checkpass(cpu, 2));
5403 5596 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5404 5597 }
5405 5598
5406 5599 uint_t
5407 5600 cpuid_getstep(cpu_t *cpu)
5408 5601 {
5409 5602 ASSERT(cpuid_checkpass(cpu, 1));
5410 5603 return (cpu->cpu_m.mcpu_cpi->cpi_step);
5411 5604 }
5412 5605
5413 5606 uint_t
5414 5607 cpuid_getsig(struct cpu *cpu)
5415 5608 {
5416 5609 ASSERT(cpuid_checkpass(cpu, 1));
5417 5610 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5418 5611 }
5419 5612
5420 5613 uint32_t
5421 5614 cpuid_getchiprev(struct cpu *cpu)
5422 5615 {
5423 5616 ASSERT(cpuid_checkpass(cpu, 1));
5424 5617 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5425 5618 }
5426 5619
5427 5620 const char *
5428 5621 cpuid_getchiprevstr(struct cpu *cpu)
5429 5622 {
5430 5623 ASSERT(cpuid_checkpass(cpu, 1));
5431 5624 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5432 5625 }
5433 5626
5434 5627 uint32_t
5435 5628 cpuid_getsockettype(struct cpu *cpu)
5436 5629 {
5437 5630 ASSERT(cpuid_checkpass(cpu, 1));
5438 5631 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5439 5632 }
5440 5633
5441 5634 const char *
5442 5635 cpuid_getsocketstr(cpu_t *cpu)
5443 5636 {
5444 5637 static const char *socketstr = NULL;
5445 5638 struct cpuid_info *cpi;
5446 5639
5447 5640 ASSERT(cpuid_checkpass(cpu, 1));
5448 5641 cpi = cpu->cpu_m.mcpu_cpi;
5449 5642
5450 5643 /* Assume that socket types are the same across the system */
5451 5644 if (socketstr == NULL)
5452 5645 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5453 5646 cpi->cpi_model, cpi->cpi_step);
5454 5647
5455 5648
5456 5649 return (socketstr);
5457 5650 }
5458 5651
5459 5652 int
5460 5653 cpuid_get_chipid(cpu_t *cpu)
5461 5654 {
5462 5655 ASSERT(cpuid_checkpass(cpu, 1));
5463 5656
5464 5657 if (cpuid_is_cmt(cpu))
5465 5658 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5466 5659 return (cpu->cpu_id);
5467 5660 }
5468 5661
5469 5662 id_t
5470 5663 cpuid_get_coreid(cpu_t *cpu)
5471 5664 {
5472 5665 ASSERT(cpuid_checkpass(cpu, 1));
5473 5666 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5474 5667 }
5475 5668
5476 5669 int
5477 5670 cpuid_get_pkgcoreid(cpu_t *cpu)
5478 5671 {
5479 5672 ASSERT(cpuid_checkpass(cpu, 1));
5480 5673 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5481 5674 }
5482 5675
5483 5676 int
5484 5677 cpuid_get_clogid(cpu_t *cpu)
5485 5678 {
5486 5679 ASSERT(cpuid_checkpass(cpu, 1));
5487 5680 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5488 5681 }
5489 5682
5490 5683 int
5491 5684 cpuid_get_cacheid(cpu_t *cpu)
5492 5685 {
5493 5686 ASSERT(cpuid_checkpass(cpu, 1));
5494 5687 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5495 5688 }
5496 5689
5497 5690 uint_t
5498 5691 cpuid_get_procnodeid(cpu_t *cpu)
5499 5692 {
5500 5693 ASSERT(cpuid_checkpass(cpu, 1));
5501 5694 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5502 5695 }
5503 5696
5504 5697 uint_t
5505 5698 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5506 5699 {
5507 5700 ASSERT(cpuid_checkpass(cpu, 1));
5508 5701 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5509 5702 }
5510 5703
5511 5704 uint_t
5512 5705 cpuid_get_compunitid(cpu_t *cpu)
5513 5706 {
5514 5707 ASSERT(cpuid_checkpass(cpu, 1));
5515 5708 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5516 5709 }
5517 5710
5518 5711 uint_t
5519 5712 cpuid_get_cores_per_compunit(cpu_t *cpu)
5520 5713 {
5521 5714 ASSERT(cpuid_checkpass(cpu, 1));
5522 5715 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5523 5716 }
5524 5717
5525 5718 /*ARGSUSED*/
5526 5719 int
5527 5720 cpuid_have_cr8access(cpu_t *cpu)
5528 5721 {
5529 5722 #if defined(__amd64)
5530 5723 return (1);
5531 5724 #else
5532 5725 struct cpuid_info *cpi;
5533 5726
5534 5727 ASSERT(cpu != NULL);
5535 5728 cpi = cpu->cpu_m.mcpu_cpi;
5536 5729 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5537 5730 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5538 5731 return (1);
5539 5732 return (0);
5540 5733 #endif
5541 5734 }
5542 5735
5543 5736 uint32_t
5544 5737 cpuid_get_apicid(cpu_t *cpu)
5545 5738 {
5546 5739 ASSERT(cpuid_checkpass(cpu, 1));
5547 5740 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5548 5741 return (UINT32_MAX);
5549 5742 } else {
5550 5743 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5551 5744 }
5552 5745 }
5553 5746
5554 5747 void
5555 5748 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5556 5749 {
5557 5750 struct cpuid_info *cpi;
5558 5751
5559 5752 if (cpu == NULL)
5560 5753 cpu = CPU;
5561 5754 cpi = cpu->cpu_m.mcpu_cpi;
5562 5755
5563 5756 ASSERT(cpuid_checkpass(cpu, 1));
5564 5757
5565 5758 if (pabits)
5566 5759 *pabits = cpi->cpi_pabits;
5567 5760 if (vabits)
5568 5761 *vabits = cpi->cpi_vabits;
5569 5762 }
5570 5763
5571 5764 size_t
5572 5765 cpuid_get_xsave_size()
5573 5766 {
5574 5767 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5575 5768 sizeof (struct xsave_state)));
5576 5769 }
5577 5770
5578 5771 /*
5579 5772 * Return true if the CPUs on this system require 'pointer clearing' for the
5580 5773 * floating point error pointer exception handling. In the past, this has been
5581 5774 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5582 5775 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5583 5776 * feature bit and is reflected in the cpi_fp_amd_save member.
5584 5777 */
5585 5778 boolean_t
5586 5779 cpuid_need_fp_excp_handling()
5587 5780 {
5588 5781 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5589 5782 cpuid_info0.cpi_fp_amd_save != 0);
5590 5783 }
5591 5784
5592 5785 /*
5593 5786 * Returns the number of data TLB entries for a corresponding
5594 5787 * pagesize. If it can't be computed, or isn't known, the
5595 5788 * routine returns zero. If you ask about an architecturally
5596 5789 * impossible pagesize, the routine will panic (so that the
5597 5790 * hat implementor knows that things are inconsistent.)
5598 5791 */
5599 5792 uint_t
5600 5793 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5601 5794 {
5602 5795 struct cpuid_info *cpi;
5603 5796 uint_t dtlb_nent = 0;
5604 5797
5605 5798 if (cpu == NULL)
5606 5799 cpu = CPU;
5607 5800 cpi = cpu->cpu_m.mcpu_cpi;
5608 5801
5609 5802 ASSERT(cpuid_checkpass(cpu, 1));
5610 5803
5611 5804 /*
5612 5805 * Check the L2 TLB info
5613 5806 */
5614 5807 if (cpi->cpi_xmaxeax >= 0x80000006) {
5615 5808 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5616 5809
5617 5810 switch (pagesize) {
5618 5811
5619 5812 case 4 * 1024:
5620 5813 /*
5621 5814 * All zero in the top 16 bits of the register
5622 5815 * indicates a unified TLB. Size is in low 16 bits.
5623 5816 */
5624 5817 if ((cp->cp_ebx & 0xffff0000) == 0)
5625 5818 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5626 5819 else
5627 5820 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5628 5821 break;
5629 5822
5630 5823 case 2 * 1024 * 1024:
5631 5824 if ((cp->cp_eax & 0xffff0000) == 0)
5632 5825 dtlb_nent = cp->cp_eax & 0x0000ffff;
5633 5826 else
5634 5827 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5635 5828 break;
5636 5829
5637 5830 default:
5638 5831 panic("unknown L2 pagesize");
5639 5832 /*NOTREACHED*/
5640 5833 }
5641 5834 }
5642 5835
5643 5836 if (dtlb_nent != 0)
5644 5837 return (dtlb_nent);
5645 5838
5646 5839 /*
5647 5840 * No L2 TLB support for this size, try L1.
5648 5841 */
5649 5842 if (cpi->cpi_xmaxeax >= 0x80000005) {
5650 5843 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5651 5844
5652 5845 switch (pagesize) {
5653 5846 case 4 * 1024:
5654 5847 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5655 5848 break;
5656 5849 case 2 * 1024 * 1024:
5657 5850 dtlb_nent = BITX(cp->cp_eax, 23, 16);
5658 5851 break;
5659 5852 default:
5660 5853 panic("unknown L1 d-TLB pagesize");
5661 5854 /*NOTREACHED*/
5662 5855 }
5663 5856 }
5664 5857
5665 5858 return (dtlb_nent);
5666 5859 }
5667 5860
5668 5861 /*
5669 5862 * Return 0 if the erratum is not present or not applicable, positive
5670 5863 * if it is, and negative if the status of the erratum is unknown.
5671 5864 *
5672 5865 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5673 5866 * Processors" #25759, Rev 3.57, August 2005
5674 5867 */
5675 5868 int
5676 5869 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5677 5870 {
5678 5871 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5679 5872 uint_t eax;
5680 5873
5681 5874 /*
5682 5875 * Bail out if this CPU isn't an AMD CPU, or if it's
5683 5876 * a legacy (32-bit) AMD CPU.
5684 5877 */
5685 5878 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5686 5879 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5687 5880 cpi->cpi_family == 6) {
5688 5881 return (0);
5689 5882 }
5690 5883
5691 5884 eax = cpi->cpi_std[1].cp_eax;
5692 5885
5693 5886 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
5694 5887 #define SH_B3(eax) (eax == 0xf51)
5695 5888 #define B(eax) (SH_B0(eax) || SH_B3(eax))
5696 5889
5697 5890 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
5698 5891
5699 5892 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5700 5893 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5701 5894 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
5702 5895 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5703 5896
5704 5897 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5705 5898 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
5706 5899 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
5707 5900 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5708 5901
5709 5902 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5710 5903 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
5711 5904 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
5712 5905 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
5713 5906 #define BH_E4(eax) (eax == 0x20fb1)
5714 5907 #define SH_E5(eax) (eax == 0x20f42)
5715 5908 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
5716 5909 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
5717 5910 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5718 5911 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5719 5912 DH_E6(eax) || JH_E6(eax))
5720 5913
5721 5914 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5722 5915 #define DR_B0(eax) (eax == 0x100f20)
5723 5916 #define DR_B1(eax) (eax == 0x100f21)
5724 5917 #define DR_BA(eax) (eax == 0x100f2a)
5725 5918 #define DR_B2(eax) (eax == 0x100f22)
5726 5919 #define DR_B3(eax) (eax == 0x100f23)
5727 5920 #define RB_C0(eax) (eax == 0x100f40)
5728 5921
5729 5922 switch (erratum) {
5730 5923 case 1:
5731 5924 return (cpi->cpi_family < 0x10);
5732 5925 case 51: /* what does the asterisk mean? */
5733 5926 return (B(eax) || SH_C0(eax) || CG(eax));
5734 5927 case 52:
5735 5928 return (B(eax));
5736 5929 case 57:
5737 5930 return (cpi->cpi_family <= 0x11);
5738 5931 case 58:
5739 5932 return (B(eax));
5740 5933 case 60:
5741 5934 return (cpi->cpi_family <= 0x11);
5742 5935 case 61:
5743 5936 case 62:
5744 5937 case 63:
5745 5938 case 64:
5746 5939 case 65:
5747 5940 case 66:
5748 5941 case 68:
5749 5942 case 69:
5750 5943 case 70:
5751 5944 case 71:
5752 5945 return (B(eax));
5753 5946 case 72:
5754 5947 return (SH_B0(eax));
5755 5948 case 74:
5756 5949 return (B(eax));
5757 5950 case 75:
5758 5951 return (cpi->cpi_family < 0x10);
5759 5952 case 76:
5760 5953 return (B(eax));
5761 5954 case 77:
5762 5955 return (cpi->cpi_family <= 0x11);
5763 5956 case 78:
5764 5957 return (B(eax) || SH_C0(eax));
5765 5958 case 79:
5766 5959 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5767 5960 case 80:
5768 5961 case 81:
5769 5962 case 82:
5770 5963 return (B(eax));
5771 5964 case 83:
5772 5965 return (B(eax) || SH_C0(eax) || CG(eax));
5773 5966 case 85:
5774 5967 return (cpi->cpi_family < 0x10);
5775 5968 case 86:
5776 5969 return (SH_C0(eax) || CG(eax));
5777 5970 case 88:
5778 5971 #if !defined(__amd64)
5779 5972 return (0);
5780 5973 #else
5781 5974 return (B(eax) || SH_C0(eax));
5782 5975 #endif
5783 5976 case 89:
5784 5977 return (cpi->cpi_family < 0x10);
5785 5978 case 90:
5786 5979 return (B(eax) || SH_C0(eax) || CG(eax));
5787 5980 case 91:
5788 5981 case 92:
5789 5982 return (B(eax) || SH_C0(eax));
5790 5983 case 93:
5791 5984 return (SH_C0(eax));
5792 5985 case 94:
5793 5986 return (B(eax) || SH_C0(eax) || CG(eax));
5794 5987 case 95:
5795 5988 #if !defined(__amd64)
5796 5989 return (0);
5797 5990 #else
5798 5991 return (B(eax) || SH_C0(eax));
5799 5992 #endif
5800 5993 case 96:
5801 5994 return (B(eax) || SH_C0(eax) || CG(eax));
5802 5995 case 97:
5803 5996 case 98:
5804 5997 return (SH_C0(eax) || CG(eax));
5805 5998 case 99:
5806 5999 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5807 6000 case 100:
5808 6001 return (B(eax) || SH_C0(eax));
5809 6002 case 101:
5810 6003 case 103:
5811 6004 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5812 6005 case 104:
5813 6006 return (SH_C0(eax) || CG(eax) || D0(eax));
5814 6007 case 105:
5815 6008 case 106:
5816 6009 case 107:
5817 6010 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5818 6011 case 108:
5819 6012 return (DH_CG(eax));
5820 6013 case 109:
5821 6014 return (SH_C0(eax) || CG(eax) || D0(eax));
5822 6015 case 110:
5823 6016 return (D0(eax) || EX(eax));
5824 6017 case 111:
5825 6018 return (CG(eax));
5826 6019 case 112:
5827 6020 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5828 6021 case 113:
5829 6022 return (eax == 0x20fc0);
5830 6023 case 114:
5831 6024 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5832 6025 case 115:
5833 6026 return (SH_E0(eax) || JH_E1(eax));
5834 6027 case 116:
5835 6028 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5836 6029 case 117:
5837 6030 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5838 6031 case 118:
5839 6032 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5840 6033 JH_E6(eax));
5841 6034 case 121:
5842 6035 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5843 6036 case 122:
5844 6037 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5845 6038 case 123:
5846 6039 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5847 6040 case 131:
5848 6041 return (cpi->cpi_family < 0x10);
5849 6042 case 6336786:
5850 6043
5851 6044 /*
5852 6045 * Test for AdvPowerMgmtInfo.TscPStateInvariant
5853 6046 * if this is a K8 family or newer processor. We're testing for
5854 6047 * this 'erratum' to determine whether or not we have a constant
5855 6048 * TSC.
5856 6049 *
5857 6050 * Our current fix for this is to disable the C1-Clock ramping.
5858 6051 * However, this doesn't work on newer processor families nor
5859 6052 * does it work when virtualized as those devices don't exist.
5860 6053 */
5861 6054 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5862 6055 return (0);
5863 6056 }
5864 6057
5865 6058 if (CPI_FAMILY(cpi) == 0xf) {
5866 6059 struct cpuid_regs regs;
5867 6060 regs.cp_eax = 0x80000007;
5868 6061 (void) __cpuid_insn(®s);
5869 6062 return (!(regs.cp_edx & 0x100));
5870 6063 }
5871 6064 return (0);
5872 6065 case 6323525:
5873 6066 /*
5874 6067 * This erratum (K8 #147) is not present on family 10 and newer.
5875 6068 */
5876 6069 if (cpi->cpi_family >= 0x10) {
5877 6070 return (0);
5878 6071 }
5879 6072 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5880 6073 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5881 6074
5882 6075 case 6671130:
5883 6076 /*
5884 6077 * check for processors (pre-Shanghai) that do not provide
5885 6078 * optimal management of 1gb ptes in its tlb.
5886 6079 */
5887 6080 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5888 6081
5889 6082 case 298:
5890 6083 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5891 6084 DR_B2(eax) || RB_C0(eax));
5892 6085
5893 6086 case 721:
5894 6087 #if defined(__amd64)
5895 6088 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5896 6089 #else
5897 6090 return (0);
5898 6091 #endif
5899 6092
5900 6093 default:
5901 6094 return (-1);
5902 6095
5903 6096 }
5904 6097 }
5905 6098
5906 6099 /*
5907 6100 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5908 6101 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5909 6102 */
5910 6103 int
5911 6104 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5912 6105 {
5913 6106 struct cpuid_info *cpi;
5914 6107 uint_t osvwid;
5915 6108 static int osvwfeature = -1;
5916 6109 uint64_t osvwlength;
5917 6110
5918 6111
5919 6112 cpi = cpu->cpu_m.mcpu_cpi;
5920 6113
5921 6114 /* confirm OSVW supported */
5922 6115 if (osvwfeature == -1) {
5923 6116 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5924 6117 } else {
5925 6118 /* assert that osvw feature setting is consistent on all cpus */
5926 6119 ASSERT(osvwfeature ==
5927 6120 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5928 6121 }
5929 6122 if (!osvwfeature)
5930 6123 return (-1);
5931 6124
5932 6125 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5933 6126
5934 6127 switch (erratum) {
5935 6128 case 298: /* osvwid is 0 */
5936 6129 osvwid = 0;
5937 6130 if (osvwlength <= (uint64_t)osvwid) {
5938 6131 /* osvwid 0 is unknown */
5939 6132 return (-1);
5940 6133 }
5941 6134
5942 6135 /*
5943 6136 * Check the OSVW STATUS MSR to determine the state
5944 6137 * of the erratum where:
5945 6138 * 0 - fixed by HW
5946 6139 * 1 - BIOS has applied the workaround when BIOS
5947 6140 * workaround is available. (Or for other errata,
5948 6141 * OS workaround is required.)
5949 6142 * For a value of 1, caller will confirm that the
5950 6143 * erratum 298 workaround has indeed been applied by BIOS.
5951 6144 *
5952 6145 * A 1 may be set in cpus that have a HW fix
5953 6146 * in a mixed cpu system. Regarding erratum 298:
5954 6147 * In a multiprocessor platform, the workaround above
5955 6148 * should be applied to all processors regardless of
5956 6149 * silicon revision when an affected processor is
5957 6150 * present.
5958 6151 */
5959 6152
5960 6153 return (rdmsr(MSR_AMD_OSVW_STATUS +
5961 6154 (osvwid / OSVW_ID_CNT_PER_MSR)) &
5962 6155 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5963 6156
5964 6157 default:
5965 6158 return (-1);
5966 6159 }
5967 6160 }
5968 6161
5969 6162 static const char assoc_str[] = "associativity";
5970 6163 static const char line_str[] = "line-size";
5971 6164 static const char size_str[] = "size";
5972 6165
5973 6166 static void
5974 6167 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5975 6168 uint32_t val)
5976 6169 {
5977 6170 char buf[128];
5978 6171
5979 6172 /*
5980 6173 * ndi_prop_update_int() is used because it is desirable for
5981 6174 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
5982 6175 */
5983 6176 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5984 6177 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5985 6178 }
5986 6179
/*
 * Intel-style cache/tlb description
 *
 * Standard cpuid level 2 gives a randomly ordered
 * selection of tags that index into a table that describes
 * cache and tlb properties.
 */

/* Labels used as property-name prefixes for each kind of cache or TLB. */
static const char l1_icache_str[] = "l1-icache";
static const char l1_dcache_str[] = "l1-dcache";
static const char l2_cache_str[] = "l2-cache";
static const char l3_cache_str[] = "l3-cache";
static const char itlb4k_str[] = "itlb-4K";
static const char dtlb4k_str[] = "dtlb-4K";
static const char itlb2M_str[] = "itlb-2M";
static const char itlb4M_str[] = "itlb-4M";
static const char dtlb4M_str[] = "dtlb-4M";
static const char dtlb24_str[] = "dtlb0-2M-4M";
static const char itlb424_str[] = "itlb-4K-2M-4M";
static const char itlb24_str[] = "itlb-2M-4M";
static const char dtlb44_str[] = "dtlb-4K-4M";
static const char sl1_dcache_str[] = "sectored-l1-dcache";
static const char sl2_cache_str[] = "sectored-l2-cache";
static const char itrace_str[] = "itrace-cache";
static const char sl3_cache_str[] = "sectored-l3-cache";
static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";

/*
 * One row per descriptor byte: the descriptor code, its associativity,
 * its line size (0 where the row carries none, as for the TLB rows), its
 * size, and the label under which it is reported.  A row whose ct_code is
 * 0 terminates a table.
 */
static const struct cachetab {
	uint8_t		ct_code;
	uint8_t		ct_assoc;
	uint16_t	ct_line_size;
	size_t		ct_size;
	const char	*ct_label;
} intel_ctab[] = {
	/*
	 * maintain descending order!
	 *
	 * Codes ignored - Reason
	 * ----------------------
	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
	 * f0H/f1H - Currently we do not interpret prefetch size by design
	 */
	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str },
	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str },
	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str },
	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str },
	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str },
	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str },
	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str },
	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str },
	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str },
	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str },
	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str },
	{ 0xd0, 4, 64, 512*1024, l3_cache_str },
	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str },
	{ 0xc0, 4, 0, 8, dtlb44_str },
	{ 0xba, 4, 0, 64, dtlb4k_str },
	{ 0xb4, 4, 0, 256, dtlb4k_str },
	{ 0xb3, 4, 0, 128, dtlb4k_str },
	{ 0xb2, 4, 0, 64, itlb4k_str },
	{ 0xb0, 4, 0, 128, itlb4k_str },
	{ 0x87, 8, 64, 1024*1024, l2_cache_str },
	{ 0x86, 4, 64, 512*1024, l2_cache_str },
	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str },
	{ 0x84, 8, 32, 1024*1024, l2_cache_str },
	{ 0x83, 8, 32, 512*1024, l2_cache_str },
	{ 0x82, 8, 32, 256*1024, l2_cache_str },
	{ 0x80, 8, 64, 512*1024, l2_cache_str },
	{ 0x7f, 2, 64, 512*1024, l2_cache_str },
	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str },
	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str },
	{ 0x7b, 8, 64, 512*1024, sl2_cache_str },
	{ 0x7a, 8, 64, 256*1024, sl2_cache_str },
	{ 0x79, 8, 64, 128*1024, sl2_cache_str },
	{ 0x78, 8, 64, 1024*1024, l2_cache_str },
	{ 0x73, 8, 0, 64*1024, itrace_str },
	{ 0x72, 8, 0, 32*1024, itrace_str },
	{ 0x71, 8, 0, 16*1024, itrace_str },
	{ 0x70, 8, 0, 12*1024, itrace_str },
	{ 0x68, 4, 64, 32*1024, sl1_dcache_str },
	{ 0x67, 4, 64, 16*1024, sl1_dcache_str },
	{ 0x66, 4, 64, 8*1024, sl1_dcache_str },
	{ 0x60, 8, 64, 16*1024, sl1_dcache_str },
	{ 0x5d, 0, 0, 256, dtlb44_str },
	{ 0x5c, 0, 0, 128, dtlb44_str },
	{ 0x5b, 0, 0, 64, dtlb44_str },
	{ 0x5a, 4, 0, 32, dtlb24_str },
	{ 0x59, 0, 0, 16, dtlb4k_str },
	{ 0x57, 4, 0, 16, dtlb4k_str },
	{ 0x56, 4, 0, 16, dtlb4M_str },
	{ 0x55, 0, 0, 7, itlb24_str },
	{ 0x52, 0, 0, 256, itlb424_str },
	{ 0x51, 0, 0, 128, itlb424_str },
	{ 0x50, 0, 0, 64, itlb424_str },
	{ 0x4f, 0, 0, 32, itlb4k_str },
	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str },
	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str },
	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str },
	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str },
	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str },
	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str },
	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str },
	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str },
	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str },
	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str },
	{ 0x44, 4, 32, 1024*1024, l2_cache_str },
	{ 0x43, 4, 32, 512*1024, l2_cache_str },
	{ 0x42, 4, 32, 256*1024, l2_cache_str },
	{ 0x41, 4, 32, 128*1024, l2_cache_str },
	{ 0x3e, 4, 64, 512*1024, sl2_cache_str },
	{ 0x3d, 6, 64, 384*1024, sl2_cache_str },
	{ 0x3c, 4, 64, 256*1024, sl2_cache_str },
	{ 0x3b, 2, 64, 128*1024, sl2_cache_str },
	{ 0x3a, 6, 64, 192*1024, sl2_cache_str },
	{ 0x39, 4, 64, 128*1024, sl2_cache_str },
	{ 0x30, 8, 64, 32*1024, l1_icache_str },
	{ 0x2c, 8, 64, 32*1024, l1_dcache_str },
	{ 0x29, 8, 64, 4096*1024, sl3_cache_str },
	{ 0x25, 8, 64, 2048*1024, sl3_cache_str },
	{ 0x23, 8, 64, 1024*1024, sl3_cache_str },
	{ 0x22, 4, 64, 512*1024, sl3_cache_str },
	{ 0x0e, 6, 64, 24*1024, l1_dcache_str },
	{ 0x0d, 4, 32, 16*1024, l1_dcache_str },
	{ 0x0c, 4, 32, 16*1024, l1_dcache_str },
	{ 0x0b, 4, 0, 4, itlb4M_str },
	{ 0x0a, 2, 32, 8*1024, l1_dcache_str },
	{ 0x08, 4, 32, 16*1024, l1_icache_str },
	{ 0x06, 4, 32, 8*1024, l1_icache_str },
	{ 0x05, 4, 0, 32, dtlb4M_str },
	{ 0x04, 4, 0, 8, dtlb4M_str },
	{ 0x03, 4, 0, 64, dtlb4k_str },
	{ 0x02, 4, 0, 2, itlb4M_str },
	{ 0x01, 4, 0, 32, itlb4k_str },
	{ 0 }
};
6122 6315
6123 6316 static const struct cachetab cyrix_ctab[] = {
6124 6317 { 0x70, 4, 0, 32, "tlb-4K" },
6125 6318 { 0x80, 4, 16, 16*1024, "l1-cache" },
6126 6319 { 0 }
6127 6320 };
6128 6321
6129 6322 /*
6130 6323 * Search a cache table for a matching entry
6131 6324 */
6132 6325 static const struct cachetab *
6133 6326 find_cacheent(const struct cachetab *ct, uint_t code)
6134 6327 {
6135 6328 if (code != 0) {
6136 6329 for (; ct->ct_code != 0; ct++)
6137 6330 if (ct->ct_code <= code)
6138 6331 break;
6139 6332 if (ct->ct_code == code)
6140 6333 return (ct);
6141 6334 }
6142 6335 return (NULL);
6143 6336 }
6144 6337
6145 6338 /*
6146 6339 * Populate cachetab entry with L2 or L3 cache-information using
6147 6340 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6148 6341 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6149 6342 * information is found.
6150 6343 */
6151 6344 static int
6152 6345 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6153 6346 {
6154 6347 uint32_t level, i;
6155 6348 int ret = 0;
6156 6349
6157 6350 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6158 6351 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6159 6352
6160 6353 if (level == 2 || level == 3) {
6161 6354 ct->ct_assoc =
6162 6355 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6163 6356 ct->ct_line_size =
6164 6357 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6165 6358 ct->ct_size = ct->ct_assoc *
6166 6359 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6167 6360 ct->ct_line_size *
6168 6361 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6169 6362
6170 6363 if (level == 2) {
6171 6364 ct->ct_label = l2_cache_str;
6172 6365 } else if (level == 3) {
6173 6366 ct->ct_label = l3_cache_str;
6174 6367 }
6175 6368 ret = 1;
6176 6369 }
6177 6370 }
6178 6371
6179 6372 return (ret);
6180 6373 }
6181 6374
6182 6375 /*
6183 6376 * Walk the cacheinfo descriptor, applying 'func' to every valid element
6184 6377 * The walk is terminated if the walker returns non-zero.
6185 6378 */
6186 6379 static void
6187 6380 intel_walk_cacheinfo(struct cpuid_info *cpi,
6188 6381 void *arg, int (*func)(void *, const struct cachetab *))
6189 6382 {
6190 6383 const struct cachetab *ct;
6191 6384 struct cachetab des_49_ct, des_b1_ct;
6192 6385 uint8_t *dp;
6193 6386 int i;
6194 6387
6195 6388 if ((dp = cpi->cpi_cacheinfo) == NULL)
6196 6389 return;
6197 6390 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6198 6391 /*
6199 6392 * For overloaded descriptor 0x49 we use cpuid function 4
6200 6393 * if supported by the current processor, to create
6201 6394 * cache information.
6202 6395 * For overloaded descriptor 0xb1 we use X86_PAE flag
6203 6396 * to disambiguate the cache information.
6204 6397 */
6205 6398 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6206 6399 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6207 6400 ct = &des_49_ct;
6208 6401 } else if (*dp == 0xb1) {
6209 6402 des_b1_ct.ct_code = 0xb1;
6210 6403 des_b1_ct.ct_assoc = 4;
6211 6404 des_b1_ct.ct_line_size = 0;
6212 6405 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6213 6406 des_b1_ct.ct_size = 8;
6214 6407 des_b1_ct.ct_label = itlb2M_str;
6215 6408 } else {
6216 6409 des_b1_ct.ct_size = 4;
6217 6410 des_b1_ct.ct_label = itlb4M_str;
6218 6411 }
6219 6412 ct = &des_b1_ct;
6220 6413 } else {
6221 6414 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6222 6415 continue;
6223 6416 }
6224 6417 }
6225 6418
6226 6419 if (func(arg, ct) != 0) {
6227 6420 break;
6228 6421 }
6229 6422 }
6230 6423 }
6231 6424
6232 6425 /*
6233 6426 * (Like the Intel one, except for Cyrix CPUs)
6234 6427 */
6235 6428 static void
6236 6429 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6237 6430 void *arg, int (*func)(void *, const struct cachetab *))
6238 6431 {
6239 6432 const struct cachetab *ct;
6240 6433 uint8_t *dp;
6241 6434 int i;
6242 6435
6243 6436 if ((dp = cpi->cpi_cacheinfo) == NULL)
6244 6437 return;
6245 6438 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6246 6439 /*
6247 6440 * Search Cyrix-specific descriptor table first ..
6248 6441 */
6249 6442 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6250 6443 if (func(arg, ct) != 0)
6251 6444 break;
6252 6445 continue;
6253 6446 }
6254 6447 /*
6255 6448 * .. else fall back to the Intel one
6256 6449 */
6257 6450 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6258 6451 if (func(arg, ct) != 0)
6259 6452 break;
6260 6453 continue;
6261 6454 }
6262 6455 }
6263 6456 }
6264 6457
6265 6458 /*
6266 6459 * A cacheinfo walker that adds associativity, line-size, and size properties
6267 6460 * to the devinfo node it is passed as an argument.
6268 6461 */
6269 6462 static int
6270 6463 add_cacheent_props(void *arg, const struct cachetab *ct)
6271 6464 {
6272 6465 dev_info_t *devi = arg;
6273 6466
6274 6467 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6275 6468 if (ct->ct_line_size != 0)
6276 6469 add_cache_prop(devi, ct->ct_label, line_str,
6277 6470 ct->ct_line_size);
6278 6471 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6279 6472 return (0);
6280 6473 }
6281 6474
6282 6475
6283 6476 static const char fully_assoc[] = "fully-associative?";
6284 6477
6285 6478 /*
6286 6479 * AMD style cache/tlb description
6287 6480 *
6288 6481 * Extended functions 5 and 6 directly describe properties of
6289 6482 * tlbs and various cache levels.
6290 6483 */
6291 6484 static void
6292 6485 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6293 6486 {
6294 6487 switch (assoc) {
6295 6488 case 0: /* reserved; ignore */
6296 6489 break;
6297 6490 default:
6298 6491 add_cache_prop(devi, label, assoc_str, assoc);
6299 6492 break;
6300 6493 case 0xff:
6301 6494 add_cache_prop(devi, label, fully_assoc, 1);
6302 6495 break;
6303 6496 }
6304 6497 }
6305 6498
6306 6499 static void
6307 6500 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6308 6501 {
6309 6502 if (size == 0)
6310 6503 return;
6311 6504 add_cache_prop(devi, label, size_str, size);
6312 6505 add_amd_assoc(devi, label, assoc);
6313 6506 }
6314 6507
6315 6508 static void
6316 6509 add_amd_cache(dev_info_t *devi, const char *label,
6317 6510 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6318 6511 {
6319 6512 if (size == 0 || line_size == 0)
6320 6513 return;
6321 6514 add_amd_assoc(devi, label, assoc);
6322 6515 /*
6323 6516 * Most AMD parts have a sectored cache. Multiple cache lines are
6324 6517 * associated with each tag. A sector consists of all cache lines
6325 6518 * associated with a tag. For example, the AMD K6-III has a sector
6326 6519 * size of 2 cache lines per tag.
6327 6520 */
6328 6521 if (lines_per_tag != 0)
6329 6522 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6330 6523 add_cache_prop(devi, label, line_str, line_size);
6331 6524 add_cache_prop(devi, label, size_str, size * 1024);
6332 6525 }
6333 6526
6334 6527 static void
6335 6528 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6336 6529 {
6337 6530 switch (assoc) {
6338 6531 case 0: /* off */
6339 6532 break;
6340 6533 case 1:
6341 6534 case 2:
6342 6535 case 4:
6343 6536 add_cache_prop(devi, label, assoc_str, assoc);
6344 6537 break;
6345 6538 case 6:
6346 6539 add_cache_prop(devi, label, assoc_str, 8);
6347 6540 break;
6348 6541 case 8:
6349 6542 add_cache_prop(devi, label, assoc_str, 16);
6350 6543 break;
6351 6544 case 0xf:
6352 6545 add_cache_prop(devi, label, fully_assoc, 1);
6353 6546 break;
6354 6547 default: /* reserved; ignore */
6355 6548 break;
6356 6549 }
6357 6550 }
6358 6551
6359 6552 static void
6360 6553 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6361 6554 {
6362 6555 if (size == 0 || assoc == 0)
6363 6556 return;
6364 6557 add_amd_l2_assoc(devi, label, assoc);
6365 6558 add_cache_prop(devi, label, size_str, size);
6366 6559 }
6367 6560
/*
 * Publish the properties that describe an L2 cache reported in the AMD
 * format: encoded associativity (through add_amd_l2_assoc()), an optional
 * "lines-per-tag" value, the line size and the total size.
 *
 * Nothing is published when `size`, `assoc` or `line_size` is zero.
 * `size` is multiplied by 1024 before it is stored.
 */
6368 6561 static void
6369 6562 add_amd_l2_cache(dev_info_t *devi, const char *label,
6370 6563 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6371 6564 {
6372 6565 if (size == 0 || assoc == 0 || line_size == 0)
6373 6566 return;
6374 6567 add_amd_l2_assoc(devi, label, assoc);
6375 6568 if (lines_per_tag != 0)
6376 6569 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6377 6570 add_cache_prop(devi, label, line_str, line_size);
6378 6571 add_cache_prop(devi, label, size_str, size * 1024);
6379 6572 }
6380 6573
/*
 * Decode the AMD-format cache and TLB descriptions held in the saved
 * extended cpuid leaves and publish them as properties on `devi`.
 *
 * cpi_extd[5] supplies the L1 TLB and L1 cache data; cpi_extd[6] supplies
 * the L2 TLB and L2 cache data. Each leaf is only consulted when
 * cpi_xmaxeax shows that it exists (0x80000005 and 0x80000006
 * respectively); the function returns as soon as a leaf is unavailable.
 */
6381 6574 static void
6382 6575 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6383 6576 {
6384 6577 struct cpuid_regs *cp;
6385 6578
6386 6579 if (cpi->cpi_xmaxeax < 0x80000005)
6387 6580 return;
6388 6581 cp = &cpi->cpi_extd[5];
6389 6582
6390 6583 /*
6391 6584 * 4M/2M L1 TLB configuration
6392 6585 *
6393 6586 * We report the size for 2M pages because AMD uses two
6394 6587 * TLB entries for one 4M page.
6395 6588 */
6396 6589 add_amd_tlb(devi, "dtlb-2M",
6397 6590 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6398 6591 add_amd_tlb(devi, "itlb-2M",
6399 6592 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6400 6593
6401 6594 /*
6402 6595 * 4K L1 TLB configuration
6403 6596 */
6404 6597
6405 6598 switch (cpi->cpi_vendor) {
6406 6599 uint_t nentries;
6407 6600 case X86_VENDOR_TM:
6408 6601 if (cpi->cpi_family >= 5) {
6409 6602 /*
6410 6603 * Crusoe processors have 256 TLB entries, but
6411 6604 * cpuid data format constrains them to only
6412 6605 * reporting 255 of them.
6413 6606 */
6414 6607 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6415 6608 nentries = 256;
6416 6609 /*
6417 6610 * Crusoe processors also have a unified TLB
6418 6611 */
6419 6612 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6420 6613 nentries);
6421 6614 break;
6422 6615 }
6423 6616 /*FALLTHROUGH*/
6424 6617 default:
6425 6618 add_amd_tlb(devi, itlb4k_str,
6426 6619 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6427 6620 add_amd_tlb(devi, dtlb4k_str,
6428 6621 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6429 6622 break;
6430 6623 }
6431 6624
6432 6625 /*
6433 6626 * data L1 cache configuration
6434 6627 */
6435 6628
6436 6629 add_amd_cache(devi, l1_dcache_str,
6437 6630 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6438 6631 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6439 6632
6440 6633 /*
6441 6634 * code L1 cache configuration
6442 6635 */
6443 6636
6444 6637 add_amd_cache(devi, l1_icache_str,
6445 6638 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6446 6639 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6447 6640
6448 6641 if (cpi->cpi_xmaxeax < 0x80000006)
6449 6642 return;
6450 6643 cp = &cpi->cpi_extd[6];
6451 6644
6452 6645 /* Check for a unified L2 TLB for large pages */
6453 6646
6454 6647 if (BITX(cp->cp_eax, 31, 16) == 0)
6455 6648 add_amd_l2_tlb(devi, "l2-tlb-2M",
6456 6649 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6457 6650 else {
6458 6651 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6459 6652 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6460 6653 add_amd_l2_tlb(devi, "l2-itlb-2M",
6461 6654 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6462 6655 }
6463 6656
6464 6657 /* Check for a unified L2 TLB for 4K pages */
6465 6658
/*
 * The 4K page description lives in %ebx; %eax describes the large-page
 * TLB handled above. Decode every 4K field from %ebx so the properties
 * agree with the register that the unified/split test examines.
 */
6466 6659 if (BITX(cp->cp_ebx, 31, 16) == 0) {
6467 6660 add_amd_l2_tlb(devi, "l2-tlb-4K",
6468 6661 BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6469 6662 } else {
6470 6663 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6471 6664 BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
6472 6665 add_amd_l2_tlb(devi, "l2-itlb-4K",
6473 6666 BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6474 6667 }
6475 6668
6476 6669 add_amd_l2_cache(devi, l2_cache_str,
6477 6670 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6478 6671 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6479 6672 }
6480 6673
6481 6674 /*
6482 6675 * There are two basic ways that the x86 world describes its cache
6483 6676 * and tlb architecture - Intel's way and AMD's way.
6484 6677 *
6485 6678 * Return which flavor of cache architecture we should use. The result
 * is one of X86_VENDOR_Intel, X86_VENDOR_AMD or X86_VENDOR_Cyrix, used here
 * as a format selector rather than as the CPU's actual vendor, or -1 when
 * no usable cache description is available.
6486 6679 */
6487 6680 static int
6488 6681 x86_which_cacheinfo(struct cpuid_info *cpi)
6489 6682 {
6490 6683 switch (cpi->cpi_vendor) {
6491 6684 case X86_VENDOR_Intel:
/* The Intel format requires standard leaf 2. */
6492 6685 if (cpi->cpi_maxeax >= 2)
6493 6686 return (X86_VENDOR_Intel);
6494 6687 break;
6495 6688 case X86_VENDOR_AMD:
6496 6689 /*
6497 6690 * The K5 model 1 was the first part from AMD that reported
6498 6691 * cache sizes via extended cpuid functions.
6499 6692 */
6500 6693 if (cpi->cpi_family > 5 ||
6501 6694 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6502 6695 return (X86_VENDOR_AMD);
6503 6696 break;
6504 6697 case X86_VENDOR_TM:
6505 6698 if (cpi->cpi_family >= 5)
6506 6699 return (X86_VENDOR_AMD);
6507 6700 /*FALLTHROUGH*/
6508 6701 default:
6509 6702 /*
6510 6703 * If they have extended CPU data for 0x80000005
6511 6704 * then we assume they have AMD-format cache
6512 6705 * information.
6513 6706 *
6514 6707 * If not, and the vendor happens to be Cyrix,
6515 6708 * then try our Cyrix-specific handler.
6516 6709 *
6517 6710 * If we're not Cyrix, then assume we're using Intel's
6518 6711 * table-driven format instead.
6519 6712 */
6520 6713 if (cpi->cpi_xmaxeax >= 0x80000005)
6521 6714 return (X86_VENDOR_AMD);
6522 6715 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6523 6716 return (X86_VENDOR_Cyrix);
6524 6717 else if (cpi->cpi_maxeax >= 2)
6525 6718 return (X86_VENDOR_Intel);
6526 6719 break;
6527 6720 }
6528 6721 return (-1);
6529 6722 }
6530 6723
/*
 * Attach identification properties for a CPU to its device node.
 *
 * `dip` is used as a dev_info_t pointer, `cpu_id` is stored as the "reg"
 * property, and `cpi` supplies the saved cpuid data.
 *
 * "device_type", "reg" and (when cpu_freq is positive) the clock
 * properties are always set. The function then returns early if
 * X86FSET_CPUID is not in x86_featureset, and again after "vendor-id" if
 * cpi_maxeax is zero. Otherwise it continues with family/model/stepping, a
 * series of properties whose creation depends on the vendor and family,
 * the brand string, and finally the cache and TLB description in whichever
 * format x86_which_cacheinfo() selects.
 */
6531 6724 void
6532 6725 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6533 6726 struct cpuid_info *cpi)
6534 6727 {
6535 6728 dev_info_t *cpu_devi;
6536 6729 int create;
6537 6730
6538 6731 cpu_devi = (dev_info_t *)dip;
6539 6732
6540 6733 /* device_type */
6541 6734 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6542 6735 "device_type", "cpu");
6543 6736
6544 6737 /* reg */
6545 6738 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6546 6739 "reg", cpu_id);
6547 6740
6548 6741 /* cpu-mhz, and clock-frequency */
6549 6742 if (cpu_freq > 0) {
6550 6743 long long mul;
6551 6744
6552 6745 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6553 6746 "cpu-mhz", cpu_freq);
/* "clock-frequency" is an int property; skip it if it would not fit. */
6554 6747 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6555 6748 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6556 6749 "clock-frequency", (int)mul);
6557 6750 }
6558 6751
6559 6752 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6560 6753 return;
6561 6754 }
6562 6755
6563 6756 /* vendor-id */
6564 6757 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6565 6758 "vendor-id", cpi->cpi_vendorstr);
6566 6759
6567 6760 if (cpi->cpi_maxeax == 0) {
6568 6761 return;
6569 6762 }
6570 6763
6571 6764 /*
6572 6765 * family, model, and step
6573 6766 */
6574 6767 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6575 6768 "family", CPI_FAMILY(cpi));
6576 6769 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6577 6770 "cpu-model", CPI_MODEL(cpi));
6578 6771 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6579 6772 "stepping-id", CPI_STEP(cpi));
6580 6773
/*
 * Each section below follows the same shape: a switch on the vendor
 * decides whether the property applies (`create`), then the property
 * is published only when `create` is non-zero.
 */
6581 6774 /* type */
6582 6775 switch (cpi->cpi_vendor) {
6583 6776 case X86_VENDOR_Intel:
6584 6777 create = 1;
6585 6778 break;
6586 6779 default:
6587 6780 create = 0;
6588 6781 break;
6589 6782 }
6590 6783 if (create)
6591 6784 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6592 6785 "type", CPI_TYPE(cpi));
6593 6786
6594 6787 /* ext-family */
6595 6788 switch (cpi->cpi_vendor) {
6596 6789 case X86_VENDOR_Intel:
6597 6790 case X86_VENDOR_AMD:
6598 6791 create = cpi->cpi_family >= 0xf;
6599 6792 break;
6600 6793 default:
6601 6794 create = 0;
6602 6795 break;
6603 6796 }
6604 6797 if (create)
6605 6798 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6606 6799 "ext-family", CPI_FAMILY_XTD(cpi));
6607 6800
6608 6801 /* ext-model */
6609 6802 switch (cpi->cpi_vendor) {
6610 6803 case X86_VENDOR_Intel:
6611 6804 create = IS_EXTENDED_MODEL_INTEL(cpi);
6612 6805 break;
6613 6806 case X86_VENDOR_AMD:
6614 6807 create = CPI_FAMILY(cpi) == 0xf;
6615 6808 break;
6616 6809 default:
6617 6810 create = 0;
6618 6811 break;
6619 6812 }
6620 6813 if (create)
6621 6814 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6622 6815 "ext-model", CPI_MODEL_XTD(cpi));
6623 6816
6624 6817 /* generation */
6625 6818 switch (cpi->cpi_vendor) {
6626 6819 case X86_VENDOR_AMD:
6627 6820 /*
6628 6821 * AMD K5 model 1 was the first part to support this
6629 6822 */
6630 6823 create = cpi->cpi_xmaxeax >= 0x80000001;
6631 6824 break;
6632 6825 default:
6633 6826 create = 0;
6634 6827 break;
6635 6828 }
6636 6829 if (create)
6637 6830 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6638 6831 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6639 6832
6640 6833 /* brand-id */
6641 6834 switch (cpi->cpi_vendor) {
6642 6835 case X86_VENDOR_Intel:
6643 6836 /*
6644 6837 * brand id first appeared on Pentium III Xeon model 8,
6645 6838 * and Celeron model 8 processors and Opteron
6646 6839 */
6647 6840 create = cpi->cpi_family > 6 ||
6648 6841 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6649 6842 break;
6650 6843 case X86_VENDOR_AMD:
6651 6844 create = cpi->cpi_family >= 0xf;
6652 6845 break;
6653 6846 default:
6654 6847 create = 0;
6655 6848 break;
6656 6849 }
/* A brand id of zero is not published even on eligible parts. */
6657 6850 if (create && cpi->cpi_brandid != 0) {
6658 6851 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6659 6852 "brand-id", cpi->cpi_brandid);
6660 6853 }
6661 6854
6662 6855 /* chunks, and apic-id */
6663 6856 switch (cpi->cpi_vendor) {
6664 6857 /*
6665 6858 * first available on Pentium IV and Opteron (K8)
6666 6859 */
6667 6860 case X86_VENDOR_Intel:
6668 6861 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6669 6862 break;
6670 6863 case X86_VENDOR_AMD:
6671 6864 create = cpi->cpi_family >= 0xf;
6672 6865 break;
6673 6866 default:
6674 6867 create = 0;
6675 6868 break;
6676 6869 }
6677 6870 if (create) {
6678 6871 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6679 6872 "chunks", CPI_CHUNKS(cpi));
6680 6873 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6681 6874 "apic-id", cpi->cpi_apicid);
/* "chip#" and "clog#" are only published for a non-negative chip id. */
6682 6875 if (cpi->cpi_chipid >= 0) {
6683 6876 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6684 6877 "chip#", cpi->cpi_chipid);
6685 6878 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6686 6879 "clog#", cpi->cpi_clogid);
6687 6880 }
6688 6881 }
6689 6882
6690 6883 /* cpuid-features */
6691 6884 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6692 6885 "cpuid-features", CPI_FEATURES_EDX(cpi));
6693 6886
6694 6887
6695 6888 /* cpuid-features-ecx */
6696 6889 switch (cpi->cpi_vendor) {
6697 6890 case X86_VENDOR_Intel:
6698 6891 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6699 6892 break;
6700 6893 case X86_VENDOR_AMD:
6701 6894 create = cpi->cpi_family >= 0xf;
6702 6895 break;
6703 6896 default:
6704 6897 create = 0;
6705 6898 break;
6706 6899 }
6707 6900 if (create)
6708 6901 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6709 6902 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6710 6903
6711 6904 /* ext-cpuid-features */
6712 6905 switch (cpi->cpi_vendor) {
6713 6906 case X86_VENDOR_Intel:
6714 6907 case X86_VENDOR_AMD:
6715 6908 case X86_VENDOR_Cyrix:
6716 6909 case X86_VENDOR_TM:
6717 6910 case X86_VENDOR_Centaur:
6718 6911 create = cpi->cpi_xmaxeax >= 0x80000001;
6719 6912 break;
6720 6913 default:
6721 6914 create = 0;
6722 6915 break;
6723 6916 }
6724 6917 if (create) {
6725 6918 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6726 6919 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6727 6920 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6728 6921 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6729 6922 }
6730 6923
6731 6924 /*
6732 6925 * Brand String first appeared in Intel Pentium IV, AMD K5
6733 6926 * model 1, and Cyrix GXm. On earlier models we try and
6734 6927 * simulate something similar .. so this string should always
6735 6928 * say -something- about the processor, however lame.
6736 6929 */
6737 6930 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6738 6931 "brand-string", cpi->cpi_brandstr);
6739 6932
6740 6933 /*
6741 6934 * Finally, cache and tlb information
6742 6935 */
6743 6936 switch (x86_which_cacheinfo(cpi)) {
6744 6937 case X86_VENDOR_Intel:
6745 6938 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6746 6939 break;
6747 6940 case X86_VENDOR_Cyrix:
6748 6941 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6749 6942 break;
6750 6943 case X86_VENDOR_AMD:
6751 6944 amd_cache_info(cpi, cpu_devi);
6752 6945 break;
6753 6946 default:
6754 6947 break;
6755 6948 }
6756 6949 }
6757 6950
/*
 * Carrier for an L2 cache query. The three pointers are optional output
 * locations (the code that fills them checks each for NULL first);
 * l2i_ret holds the value the query ultimately returns.
 */
6758 6951 struct l2info {
/* where to store the cache size, or NULL */
6759 6952 int *l2i_csz;
/* where to store the line size, or NULL */
6760 6953 int *l2i_lsz;
/* where to store the associativity, or NULL */
6761 6954 int *l2i_assoc;
/* result: set to the cache size once an L2 cache has been found */
6762 6955 int l2i_ret;
6763 6956 };
6764 6957
6765 6958 /*
6766 6959 * A cacheinfo walker that fetches the size, line-size and associativity
6767 6960 * of the L2 cache
 *
 * `arg` is a struct l2info; `ct` is the cache table entry being visited.
 * Entries are recognised by comparing ct_label against l2_cache_str and
 * sl2_cache_str with `!=`, i.e. by pointer identity rather than by string
 * contents. Returns 0 to keep walking or 1 once an L2 entry has been
 * copied out.
6768 6961 */
6769 6962 static int
6770 6963 intel_l2cinfo(void *arg, const struct cachetab *ct)
6771 6964 {
6772 6965 struct l2info *l2i = arg;
6773 6966 int *ip;
6774 6967
6775 6968 if (ct->ct_label != l2_cache_str &&
6776 6969 ct->ct_label != sl2_cache_str)
6777 6970 return (0); /* not an L2 -- keep walking */
6778 6971
/* Each output pointer is optional; only non-NULL ones are written. */
6779 6972 if ((ip = l2i->l2i_csz) != NULL)
6780 6973 *ip = ct->ct_size;
6781 6974 if ((ip = l2i->l2i_lsz) != NULL)
6782 6975 *ip = ct->ct_line_size;
6783 6976 if ((ip = l2i->l2i_assoc) != NULL)
6784 6977 *ip = ct->ct_assoc;
6785 6978 l2i->l2i_ret = ct->ct_size;
6786 6979 return (1); /* was an L2 -- terminate walk */
6787 6980 }
6788 6981
6789 6982 /*
6790 6983 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6791 6984 *
6792 6985 * Unlike the associativity for the L1 cache and tlb where the 8 bit
6793 6986 * value is the associativity, the associativity for the L2 cache and
6794 6987 * tlb is encoded in the following table. The 4 bit L2 value serves as
6795 6988 * an index into the amd_afd[] array to determine the associativity.
6796 6989 * -1 is undefined. 0 is fully associative.
 *
 * The table has 16 entries, one for every possible value of the 4 bit
 * field, so any such value is a valid index.
6797 6990 */
6798 6991
6799 6992 static int amd_afd[] =
6800 6993 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
6801 6994
/*
 * Fill in `l2i` from the AMD-format L2 cache description in cpi_extd[6].
 *
 * Does nothing when cpi_xmaxeax is below 0x80000006, or when either the
 * associativity field (%ecx bits 15:12) or the size field (%ecx bits
 * 31:16) is zero. Otherwise the size is multiplied by 1024, the
 * associativity is looked up in amd_afd[], the line size is taken from
 * %ecx bits 7:0, and l2i_ret is set to the computed size.
 */
6802 6995 static void
6803 6996 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6804 6997 {
6805 6998 struct cpuid_regs *cp;
6806 6999 uint_t size, assoc;
6807 7000 int i;
6808 7001 int *ip;
6809 7002
6810 7003 if (cpi->cpi_xmaxeax < 0x80000006)
6811 7004 return;
6812 7005 cp = &cpi->cpi_extd[6];
6813 7006
6814 7007 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6815 7008 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6816 7009 uint_t cachesz = size * 1024;
6817 7010 assoc = amd_afd[i];
6818 7011
/*
 * `assoc` is unsigned, so this compares against -1 converted to
 * uint_t; it fires for the table's "undefined" entries.
 */
6819 7012 ASSERT(assoc != -1);
6820 7013
/* Each output pointer is optional; only non-NULL ones are written. */
6821 7014 if ((ip = l2i->l2i_csz) != NULL)
6822 7015 *ip = cachesz;
6823 7016 if ((ip = l2i->l2i_lsz) != NULL)
6824 7017 *ip = BITX(cp->cp_ecx, 7, 0);
6825 7018 if ((ip = l2i->l2i_assoc) != NULL)
6826 7019 *ip = assoc;
6827 7020 l2i->l2i_ret = cachesz;
6828 7021 }
6829 7022 }
6830 7023
/*
 * Look up the L2 cache parameters of `cpu`.
 *
 * `csz`, `lsz` and `assoc` receive the cache size, line size and
 * associativity; the helpers that fill them skip any pointer that is
 * NULL. The lookup uses whichever description format
 * x86_which_cacheinfo() selects.
 *
 * Returns the value the helpers leave in l2i_ret: the L2 cache size when
 * one was found, otherwise the initial value of -1.
 */
6831 7024 int
6832 7025 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6833 7026 {
6834 7027 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6835 7028 struct l2info __l2info, *l2i = &__l2info;
6836 7029
6837 7030 l2i->l2i_csz = csz;
6838 7031 l2i->l2i_lsz = lsz;
6839 7032 l2i->l2i_assoc = assoc;
/* Stays -1 unless one of the helpers below finds an L2 cache. */
6840 7033 l2i->l2i_ret = -1;
6841 7034
6842 7035 switch (x86_which_cacheinfo(cpi)) {
6843 7036 case X86_VENDOR_Intel:
6844 7037 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6845 7038 break;
6846 7039 case X86_VENDOR_Cyrix:
6847 7040 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6848 7041 break;
6849 7042 case X86_VENDOR_AMD:
6850 7043 amd_l2cacheinfo(cpi, l2i);
6851 7044 break;
6852 7045 default:
6853 7046 break;
6854 7047 }
6855 7048 return (l2i->l2i_ret);
6856 7049 }
6857 7050
6858 7051 #if !defined(__xpv)
6859 7052
/*
 * Allocate a zeroed buffer aligned to the monitor size for `cpu`, store
 * MWAIT_RUNNING in its first word and return a pointer to it.
 *
 * Returns NULL when the monitor size (cpi_mwait.mon_max) is zero. The
 * address and size of the underlying allocation are recorded in `cpu`'s
 * cpi_mwait.buf_actual and cpi_mwait.size_actual.
 *
 * NOTE(review): the monitor size is read from CPU (the executing CPU)
 * while the bookkeeping is written to the `cpu` argument -- presumably
 * because `cpu` may not have its own cpuid data yet; confirm with callers.
 */
6860 7053 uint32_t *
6861 7054 cpuid_mwait_alloc(cpu_t *cpu)
6862 7055 {
6863 7056 uint32_t *ret;
6864 7057 size_t mwait_size;
6865 7058
6866 7059 ASSERT(cpuid_checkpass(CPU, 2));
6867 7060
6868 7061 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6869 7062 if (mwait_size == 0)
6870 7063 return (NULL);
6871 7064
6872 7065 /*
6873 7066 * kmem_alloc() returns cache line size aligned data for mwait_size
6874 7067 * allocations. mwait_size is currently cache line sized. Neither
6875 7068 * of these implementation details are guaranteed to be true in the
6876 7069 * future.
6877 7070 *
6878 7071 * First try allocating mwait_size as kmem_alloc() currently returns
6879 7072 * correctly aligned memory. If kmem_alloc() does not return
6880 7073 * mwait_size aligned memory, then use mwait_size ROUNDUP.
6881 7074 *
6882 7075 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6883 7076 * decide to free this memory.
6884 7077 */
6885 7078 ret = kmem_zalloc(mwait_size, KM_SLEEP);
/* Already aligned: rounding up leaves the address unchanged. */
6886 7079 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6887 7080 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6888 7081 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6889 7082 *ret = MWAIT_RUNNING;
6890 7083 return (ret);
6891 7084 } else {
/*
 * Not aligned: allocate twice the size and return the aligned
 * address inside it, remembering the real allocation for the free.
 */
6892 7085 kmem_free(ret, mwait_size);
6893 7086 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6894 7087 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6895 7088 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6896 7089 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6897 7090 *ret = MWAIT_RUNNING;
6898 7091 return (ret);
6899 7092 }
6900 7093 }
6901 7094
/*
 * Release the mwait buffer recorded for `cpu`, if any.
 *
 * Does nothing when `cpu` has no cpuid data. The buffer is freed only
 * when both a non-NULL address and a positive size were recorded; the
 * recorded address and size are then reset to NULL and 0 in all cases.
 */
6902 7095 void
6903 7096 cpuid_mwait_free(cpu_t *cpu)
6904 7097 {
6905 7098 if (cpu->cpu_m.mcpu_cpi == NULL) {
6906 7099 return;
6907 7100 }
6908 7101
6909 7102 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6910 7103 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6911 7104 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6912 7105 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6913 7106 }
6914 7107
6915 7108 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6916 7109 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6917 7110 }
6918 7111
/*
 * Overwrite the start of tsc_read() with the code sequence that matches
 * the TSC access method named by `flag`, then record `flag` in tsc_type.
 *
 * For each recognised flag the bytes between the corresponding
 * `_start` and `_end` symbols are copied over tsc_read. Any other value
 * is reported through cmn_err() at CE_PANIC level.
 */
6919 7112 void
6920 7113 patch_tsc_read(int flag)
6921 7114 {
6922 7115 size_t cnt;
6923 7116
6924 7117 switch (flag) {
6925 7118 case TSC_NONE:
6926 7119 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6927 7120 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6928 7121 break;
6929 7122 case TSC_RDTSC_MFENCE:
6930 7123 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6931 7124 (void) memcpy((void *)tsc_read,
6932 7125 (void *)&_tsc_mfence_start, cnt);
6933 7126 break;
6934 7127 case TSC_RDTSC_LFENCE:
6935 7128 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6936 7129 (void) memcpy((void *)tsc_read,
6937 7130 (void *)&_tsc_lfence_start, cnt);
6938 7131 break;
6939 7132 case TSC_TSCP:
6940 7133 cnt = &_tscp_end - &_tscp_start;
6941 7134 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6942 7135 break;
6943 7136 default:
6944 7137 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6945 7138 cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
6946 7139 break;
6947 7140 }
6948 7141 tsc_type = flag;
6949 7142 }
6950 7143
/*
 * Report whether the TSC-invariance bit that this file uses to qualify
 * deep C-states is set on the executing CPU.
 *
 * Returns 0 when cpuid is unavailable, when the vendor is not Intel, or
 * when extended leaf 0x80000007 does not exist. Otherwise the leaf is
 * queried and the CPUID_TSC_CSTATE_INVARIANCE bit of %edx is returned
 * (non-zero when set).
 */
6951 7144 int
6952 7145 cpuid_deep_cstates_supported(void)
6953 7146 {
6954 7147 struct cpuid_info *cpi;
6955 7148 struct cpuid_regs regs;
6956 7149
6957 7150 ASSERT(cpuid_checkpass(CPU, 1));
6958 7151
6959 7152 cpi = CPU->cpu_m.mcpu_cpi;
6960 7153
6961 7154 if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6962 7155 return (0);
6963 7156
6964 7157 switch (cpi->cpi_vendor) {
6965 7158 case X86_VENDOR_Intel:
6966 7159 if (cpi->cpi_xmaxeax < 0x80000007)
6967 7160 return (0);
6968 7161
6969 7162 /*
6970 7163 * TSC run at a constant rate in all ACPI C-states?
6971 7164 */
6972 7165 regs.cp_eax = 0x80000007;
/* Pass the address of the local register block to the cpuid helper. */
6973 7166 (void) __cpuid_insn(&regs);
6974 7167 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6975 7168
6976 7169 default:
6977 7170 return (0);
6978 7171 }
6979 7172 }
6980 7173
6981 7174 #endif /* !__xpv */
6982 7175
/*
 * Apply CPU workarounds after startup. The only one here, compiled out
 * when __xpv is defined, disables the AMD C1E state: on an AMD CPU the
 * MSR_AMD_INT_PENDING_CMP_HALT register is read and, if its ACTONCMPHALT
 * field is non-zero, written back with that field cleared.
 */
6983 7176 void
6984 7177 post_startup_cpu_fixups(void)
6985 7178 {
6986 7179 #ifndef __xpv
6987 7180 /*
6988 7181 * Some AMD processors support C1E state. Entering this state will
6989 7182 * cause the local APIC timer to stop, which we can't deal with at
6990 7183 * this time.
6991 7184 */
6992 7185 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6993 7186 on_trap_data_t otd;
6994 7187 uint64_t reg;
6995 7188
/*
 * The MSR accesses run under on_trap() protection, presumably so
 * that a fault on a part lacking this MSR is survivable --
 * confirm against the on_trap() contract.
 */
6996 7189 if (!on_trap(&otd, OT_DATA_ACCESS)) {
6997 7190 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6998 7191 /* Disable C1E state if it is enabled by BIOS */
6999 7192 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7000 7193 AMD_ACTONCMPHALT_MASK) {
7001 7194 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7002 7195 AMD_ACTONCMPHALT_SHIFT);
7003 7196 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7004 7197 }
7005 7198 }
7006 7199 no_trap();
7007 7200 }
7008 7201 #endif /* !__xpv */
7009 7202 }
7010 7203
/*
 * Resolve the x86_use_pcid and x86_use_invpcid settings and, if PCID is
 * to be used, set CR4_PCIDE in %cr4.
 *
 * A setting of -1 is replaced by whether the corresponding feature
 * (X86FSET_PCID / X86FSET_INVPCID) is present in x86_featureset; any
 * other value is left as it was found.
 */
7011 7204 void
7012 7205 enable_pcid(void)
7013 7206 {
7014 7207 if (x86_use_pcid == -1)
7015 7208 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7016 7209
7017 7210 if (x86_use_invpcid == -1) {
7018 7211 x86_use_invpcid = is_x86_feature(x86_featureset,
7019 7212 X86FSET_INVPCID);
7020 7213 }
7021 7214
7022 7215 if (!x86_use_pcid)
7023 7216 return;
7024 7217
7025 7218 /*
7026 7219 * Intel say that on setting PCIDE, it immediately starts using the PCID
7027 7220 * bits; better make sure there's nothing there.
7028 7221 */
7029 7222 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7030 7223
7031 7224 setcr4(getcr4() | CR4_PCIDE);
7032 7225 }
7033 7226
7034 7227 /*
7035 7228 * Setup necessary registers to enable XSAVE feature on this processor.
7036 7229 * This function needs to be called early enough, so that no xsave/xrstor
7037 7230 * ops will execute on the processor before the MSRs are properly set up.
7038 7231 *
7039 7232 * Current implementation has the following assumption:
7040 7233 * - cpuid_pass1() is done, so that X86 features are known.
7041 7234 * - fpu_probe() is done, so that fp_save_mech is chosen.
 *
 * The steps taken are: set CR4_OSXSAVE in %cr4, set the OSXSAVE bit in
 * `cpu`'s saved copy of cpuid leaf 1 %ecx, and call setup_xfem().
7042 7235 */
7043 7236 void
7044 7237 xsave_setup_msr(cpu_t *cpu)
7045 7238 {
7046 7239 ASSERT(fp_save_mech == FP_XSAVE);
7047 7240 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7048 7241
7049 7242 /* Enable OSXSAVE in CR4. */
7050 7243 setcr4(getcr4() | CR4_OSXSAVE);
7051 7244 /*
7052 7245 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7053 7246 * correct value.
7054 7247 */
7055 7248 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7056 7249 setup_xfem();
7057 7250 }
7058 7251
7059 7252 /*
7060 7253 * Starting with the Westmere processor the local
7061 7254 * APIC timer will continue running in all C-states,
7062 7255 * including the deepest C-states.
 *
 * Returns the CPUID_INTC_EAX_ARAT bit of cpuid leaf 6 %eax (non-zero when
 * set) on Intel CPUs that implement leaf 6, and 0 for every other case.
7063 7256 */
7064 7257 int
7065 7258 cpuid_arat_supported(void)
7066 7259 {
7067 7260 struct cpuid_info *cpi;
7068 7261 struct cpuid_regs regs;
7069 7262
7070 7263 ASSERT(cpuid_checkpass(CPU, 1));
7071 7264 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7072 7265
7073 7266 cpi = CPU->cpu_m.mcpu_cpi;
7074 7267
7075 7268 switch (cpi->cpi_vendor) {
7076 7269 case X86_VENDOR_Intel:
7077 7270 /*
7078 7271 * Always-running Local APIC Timer is
7079 7272 * indicated by CPUID.6.EAX[2].
7080 7273 */
7081 7274 if (cpi->cpi_maxeax >= 6) {
7082 7275 regs.cp_eax = 6;
/* Pass the address of the local register block to the cpuid helper. */
7083 7276 (void) cpuid_insn(NULL, &regs);
7084 7277 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7085 7278 } else {
7086 7279 return (0);
7087 7280 }
7088 7281 default:
7089 7282 return (0);
7090 7283 }
7091 7284 }
7092 7285
7093 7286 /*
7094 7287 * Check support for Intel ENERGY_PERF_BIAS feature
 *
 * Returns 0 when the cpuid or MSR features are absent from
 * x86_featureset, when `cp` is not an Intel CPU, or when cpuid leaf 6 is
 * not implemented. Otherwise returns the CPUID_INTC_ECX_PERFBIAS bit of
 * leaf 6 %ecx (non-zero when set).
7095 7288 */
7096 7289 int
7097 7290 cpuid_iepb_supported(struct cpu *cp)
7098 7291 {
7099 7292 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7100 7293 struct cpuid_regs regs;
7101 7294
7102 7295 ASSERT(cpuid_checkpass(cp, 1));
7103 7296
7104 7297 if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7105 7298 !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7106 7299 return (0);
7107 7300 }
7108 7301
7109 7302 /*
7110 7303 * Intel ENERGY_PERF_BIAS MSR is indicated by
7111 7304 * capability bit CPUID.6.ECX.3
7112 7305 */
7113 7306 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7114 7307 return (0);
7115 7308
7116 7309 regs.cp_eax = 0x6;
/* Pass the address of the local register block to the cpuid helper. */
7117 7310 (void) cpuid_insn(NULL, &regs);
7118 7311 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7119 7312 }
7120 7313
7121 7314 /*
7122 7315 * Check support for TSC deadline timer
7123 7316 *
7124 7317 * TSC deadline timer provides a superior software programming
7125 7318 * model over local APIC timer that eliminates "time drifts".
7126 7319 * Instead of specifying a relative time, software specifies an
7127 7320 * absolute time as the target at which the processor should
7128 7321 * generate a timer event.
 *
 * Returns the CPUID_DEADLINE_TSC bit of cpuid leaf 1 %ecx (non-zero when
 * set) on Intel CPUs that implement leaf 1, and 0 for every other case.
7129 7322 */
7130 7323 int
7131 7324 cpuid_deadline_tsc_supported(void)
7132 7325 {
7133 7326 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7134 7327 struct cpuid_regs regs;
7135 7328
7136 7329 ASSERT(cpuid_checkpass(CPU, 1));
7137 7330 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7138 7331
7139 7332 switch (cpi->cpi_vendor) {
7140 7333 case X86_VENDOR_Intel:
7141 7334 if (cpi->cpi_maxeax >= 1) {
7142 7335 regs.cp_eax = 1;
/* Pass the address of the local register block to the cpuid helper. */
7143 7336 (void) cpuid_insn(NULL, &regs);
7144 7337 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7145 7338 } else {
7146 7339 return (0);
7147 7340 }
7148 7341 default:
7149 7342 return (0);
7150 7343 }
7151 7344 }
7152 7345
7153 7346 #if defined(__amd64) && !defined(__xpv)
7154 7347 /*
7155 7348 * Patch in versions of bcopy for high performance Intel Nhm processors
7156 7349 * and later...
 *
 * When `vendor` is Intel and X86FSET_SSE4_2 is present in x86_featureset,
 * the bytes between bcopy_patch_start and bcopy_patch_end are copied, one
 * byte at a time, over the code at bcopy_ck_size. Otherwise nothing is
 * changed.
7157 7350 */
7158 7351 void
7159 7352 patch_memops(uint_t vendor)
7160 7353 {
7161 7354 size_t cnt, i;
7162 7355 caddr_t to, from;
7163 7356
7164 7357 if ((vendor == X86_VENDOR_Intel) &&
7165 7358 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7166 7359 cnt = &bcopy_patch_end - &bcopy_patch_start;
7167 7360 to = &bcopy_ck_size;
7168 7361 from = &bcopy_patch_start;
7169 7362 for (i = 0; i < cnt; i++) {
7170 7363 *to++ = *from++;
7171 7364 }
7172 7365 }
7173 7366 }
7174 7367 #endif /* __amd64 && !__xpv */
7175 7368
7176 7369 /*
7177 7370 * We're being asked to tell the system how many bits are required to represent
7178 7371 * the various core and strand IDs. While it's tempting to derive this based
7179 7372 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7180 7373 * correct. Instead, this needs to be based on the number of bits that the APIC
7181 7374 * allows for these different configurations. We only update these to a larger
7182 7375 * value if we find one.
 *
 * `core_nbits` and `strand_nbits` are in/out parameters: each is raised to
 * `cpu`'s cpi_ncore_bits / cpi_nthread_bits when that value is larger, and
 * left alone otherwise.
7183 7376 */
7184 7377 void
7185 7378 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7186 7379 {
7187 7380 struct cpuid_info *cpi;
7188 7381
7189 7382 VERIFY(cpuid_checkpass(CPU, 1));
7190 7383 cpi = cpu->cpu_m.mcpu_cpi;
7191 7384
7192 7385 if (cpi->cpi_ncore_bits > *core_nbits) {
7193 7386 *core_nbits = cpi->cpi_ncore_bits;
7194 7387 }
7195 7388
7196 7389 if (cpi->cpi_nthread_bits > *strand_nbits) {
7197 7390 *strand_nbits = cpi->cpi_nthread_bits;
7198 7391 }
7199 7392 }
7200 7393
/*
 * Re-read the cpuid leaves that carry security information for `cpu` and
 * pass `cpu` and `fset` on to cpuid_scan_security().
 *
 * Intel: standard leaf 7 (sub-leaf 0) is re-read into cpi_std[7]. If the
 * recorded cpi_maxeax is below 7, leaf 0 is re-read first to refresh it.
 *
 * AMD: extended leaf CPUID_LEAF_EXT_8 is re-read into cpi_extd[8] after
 * passing through platform_cpuid_mangle(). If the recorded cpi_xmaxeax is
 * too low, CPUID_LEAF_EXT_0 is re-read first to refresh it.
 *
 * The function returns without calling cpuid_scan_security() when the
 * required leaf is still unavailable, when an AMD part predates the
 * family/model test below, or when the vendor is neither Intel nor AMD.
 */
7201 7394 void
7202 7395 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7203 7396 {
7204 7397 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7205 7398 struct cpuid_regs cp;
7206 7399
7207 7400 /*
7208 7401 * Reread the CPUID portions that we need for various security
7209 7402 * information.
7210 7403 */
7211 7404 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7212 7405 /*
7213 7406 * Check if we now have leaf 7 available to us.
7214 7407 */
7215 7408 if (cpi->cpi_maxeax < 7) {
7216 7409 bzero(&cp, sizeof (cp));
7217 7410 cp.cp_eax = 0;
7218 7411 cpi->cpi_maxeax = __cpuid_insn(&cp);
7219 7412 if (cpi->cpi_maxeax < 7)
7220 7413 return;
7221 7414 }
7222 7415
7223 7416 bzero(&cp, sizeof (cp));
7224 7417 cp.cp_eax = 7;
7225 7418 cp.cp_ecx = 0;
7226 7419 (void) __cpuid_insn(&cp);
7227 7420 cpi->cpi_std[7] = cp;
7228 7421 } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7229 7422 /* No xcpuid support */
7230 7423 if (cpi->cpi_family < 5 ||
7231 7424 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7232 7425 return;
7233 7426
7234 7427 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7235 7428 bzero(&cp, sizeof (cp));
7236 7429 cp.cp_eax = CPUID_LEAF_EXT_0;
7237 7430 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7238 7431 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7239 7432 return;
7240 7433 }
7241 7434 }
7242 7435
7243 7436 bzero(&cp, sizeof (cp));
7244 7437 cp.cp_eax = CPUID_LEAF_EXT_8;
7245 7438 (void) __cpuid_insn(&cp);
7246 7439 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7247 7440 cpi->cpi_extd[8] = cp;
7248 7441 } else {
7249 7442 /*
7250 7443 * Nothing to do here. Return an empty set which has already
7251 7444 * been zeroed for us.
7252 7445 */
7253 7446 return;
7254 7447 }
7255 7448 cpuid_scan_security(cpu, fset);
7256 7449 }
7257 7450
/*
 * Cross-call handler used by cpuid_post_ucodeadm().
 *
 * `arg0` is the base address of an array of feature sets, one slot of
 * sizeof (x86_featureset) bytes per CPU id; `arg1` is a boolean saying
 * whether this is the first pass; `arg2` is unused.
 *
 * On the first pass only CPU 0 does any work; on the second pass every
 * CPU except CPU 0 does. The work is a call to cpuid_pass_ucode() with
 * the executing CPU and its own slot. Always returns 0.
 */
7258 7451 /* ARGSUSED */
7259 7452 static int
7260 7453 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7261 7454 {
7262 7455 uchar_t *fset;
7263 7456 boolean_t first_pass = (boolean_t)arg1;
7264 7457
/* Locate the executing CPU's slot in the caller's array. */
7265 7458 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7266 7459 if (first_pass && CPU->cpu_id != 0)
7267 7460 return (0);
7268 7461 if (!first_pass && CPU->cpu_id == 0)
7269 7462 return (0);
7270 7463 cpuid_pass_ucode(CPU, fset);
7271 7464
7272 7465 return (0);
7273 7466 }
7274 7467
7275 7468 /*
7276 7469 * After a microcode update where the version has changed, then we need to
7277 7470 * rescan CPUID. To do this we check every CPU to make sure that they have the
7278 7471 * same microcode. Then we perform a cross call to all such CPUs. It's the
7279 7472 * caller's job to make sure that no one else can end up doing an update while
7280 7473 * this is going on.
7281 7474 *
7282 7475 * We assume that the system is microcode capable if we're called.
 *
 * The function panics if any CPU reports a microcode revision different
 * from CPU 0's, or if any CPU's rescanned feature set differs from
 * CPU 0's. On success every feature present in CPU 0's rescanned set is
 * added to the global x86_featureset.
7283 7476 */
7284 7477 void
7285 7478 cpuid_post_ucodeadm(void)
7286 7479 {
7287 7480 uint32_t rev;
7288 7481 int i;
7289 7482 struct cpu *cpu;
7290 7483 cpuset_t cpuset;
7291 7484 void *argdata;
7292 7485 uchar_t *f0;
7293 7486
/* One zeroed feature-set slot per possible CPU id, indexed by cpu_id. */
7294 7487 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7295 7488
7296 7489 mutex_enter(&cpu_lock);
7297 7490 cpu = cpu_get(0);
7298 7491 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7299 7492 CPUSET_ONLY(cpuset, 0);
/* Build the set of present CPUs, insisting they all match CPU 0. */
7300 7493 for (i = 1; i < max_ncpus; i++) {
7301 7494 if ((cpu = cpu_get(i)) == NULL)
7302 7495 continue;
7303 7496
7304 7497 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7305 7498 panic("post microcode update CPU %d has differing "
7306 7499 "microcode revision (%u) from CPU 0 (%u)",
7307 7500 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7308 7501 }
7309 7502 CPUSET_ADD(cpuset, i);
7310 7503 }
7311 7504
7312 7505 /*
7313 7506 * We do the cross calls in two passes. The first pass is only for the
7314 7507 * boot CPU. The second pass is for all of the other CPUs. This allows
7315 7508 * the boot CPU to go through and change behavior related to patching or
7316 7509 * whether or not Enhanced IBRS needs to be enabled and then allow all
7317 7510 * other CPUs to follow suit.
7318 7511 */
7319 7512 kpreempt_disable();
7320 7513 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7321 7514 cpuid_post_ucodeadm_xc);
7322 7515 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7323 7516 cpuid_post_ucodeadm_xc);
7324 7517 kpreempt_enable();
7325 7518
7326 7519 /*
7327 7520 * OK, now look at each CPU and see if their feature sets are equal.
7328 7521 */
/* CPU 0's slot is the first one in the array. */
7329 7522 f0 = argdata;
7330 7523 for (i = 1; i < max_ncpus; i++) {
7331 7524 uchar_t *fset;
7332 7525 if (!CPU_IN_SET(cpuset, i))
7333 7526 continue;
7334 7527
7335 7528 fset = (uchar_t *)((uintptr_t)argdata +
7336 7529 sizeof (x86_featureset) * i);
7337 7530
7338 7531 if (!compare_x86_featureset(f0, fset)) {
7339 7532 panic("Post microcode update CPU %d has "
7340 7533 "differing security feature (%p) set from CPU 0 "
7341 7534 "(%p), not appending to feature set", i,
7342 7535 (void *)fset, (void *)f0);
7343 7536 }
7344 7537 }
7345 7538
7346 7539 mutex_exit(&cpu_lock);
7347 7540
/*
 * Note that the message below is emitted for every feature index,
 * whether or not CPU 0's rescanned set contains that feature.
 */
7348 7541 for (i = 0; i < NUM_X86_FEATURES; i++) {
7349 7542 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7350 7543 x86_feature_names[i]);
7351 7544 if (is_x86_feature(f0, i)) {
7352 7545 add_x86_feature(x86_featureset, i);
7353 7546 }
7354 7547 }
7355 7548 kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7356 7549 }
↓ open down ↓ |
4482 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX