11787 Kernel needs to be built with retpolines
11788 Kernel needs to generally use RSB stuffing
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: John Levon <john.levon@joyent.com>
--- old/usr/src/uts/i86pc/os/cpuid.c
+++ new/usr/src/uts/i86pc/os/cpuid.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 26 */
27 27 /*
28 28 * Copyright (c) 2010, Intel Corporation.
29 29 * All rights reserved.
30 30 */
31 31 /*
32 32 * Portions Copyright 2009 Advanced Micro Devices, Inc.
33 33 */
34 34 /*
35 35 * Copyright 2019 Joyent, Inc.
36 36 */
37 37
38 38 /*
39 39 * CPU Identification logic
40 40 *
41 41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 42 * with the identification of CPUs, their features, and their topologies. More
43 43 * specifically, this file helps drive the following:
44 44 *
45 45 * 1. Enumeration of features of the processor which are used by the kernel to
46 46 * determine what features to enable or disable. These may be instruction set
47 47 * enhancements or features that we use.
48 48 *
49 49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 50 * will be told about through the auxiliary vector.
51 51 *
52 52 * 3. Understanding the physical topology of the CPU such as the number of
53 53 * caches, how many cores it has, whether or not it supports symmetric
54 54 * multi-processing (SMT), etc.
55 55 *
56 56 * ------------------------
57 57 * CPUID History and Basics
58 58 * ------------------------
59 59 *
60 60 * The cpuid instruction was added by Intel roughly around the time that the
61 61 * original Pentium was introduced. The purpose of cpuid was to tell in a
62 62 * programmatic fashion information about the CPU that previously was guessed
63 63 * at. For example, an important part of cpuid is that we can know what
64 64 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 65 * #UD, so this method allows a program (whether a user program or the kernel)
66 66 * to determine what exists without crashing or getting a SIGILL. Of course,
67 67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 68 * name shows up first in cpuid for a reason.
69 69 *
70 70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71 71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 72 * its own meaning. The different leaves are broken down into different regions:
73 73 *
74 74 * [ 0, 7fffffff ] This region is called the 'basic'
75 75 * region. This region is generally defined
76 76 * by Intel, though some of the original
77 77 * portions have different meanings based
78 78 * on the manufacturer. These days, Intel
79 79 * adds most new features to this region.
80 80 * AMD adds non-Intel compatible
81 81 * information in the third, extended
82 82 * region. Intel uses this for everything
83 83 * including ISA extensions, CPU
84 84 * features, cache information, topology,
85 85 * and more.
86 86 *
87 87 * There is a hole carved out of this
88 88 * region which is reserved for
89 89 * hypervisors.
90 90 *
91 91 * [ 40000000, 4fffffff ] This region, which is found in the
92 92 * middle of the previous region, is
93 93 * explicitly promised to never be used by
94 94 * CPUs. Instead, it is used by hypervisors
95 95 * to communicate information about
96 96 * themselves to the operating system. The
97 97 * values and details are unique for each
98 98 * hypervisor.
99 99 *
100 100 * [ 80000000, ffffffff ] This region is called the 'extended'
101 101 * region. Some of the low leaves mirror
102 102 * parts of the basic leaves. This region
103 103 * has generally been used by AMD for
104 104 * various extensions. For example, AMD-
105 105 * specific information about caches,
106 106 * features, and topology are found in this
107 107 * region.
108 108 *
109 109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110 110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111 111 * the ranges, one of the primary things returned is the maximum valid leaf in
112 112 * that range. This allows for discovery of what range of CPUID is valid.
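
As a concrete illustration of this range-discovery convention, here is a minimal userland sketch (not taken from this change) that issues cpuid via GCC/Clang inline assembly and reads the maximum basic and extended leaves:

    #include <stdio.h>
    #include <stdint.h>

    /* Issue cpuid for the given leaf with the sub-leaf (%ecx) set to zero. */
    static void
    cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
        uint32_t *edx)
    {
            __asm__ __volatile__("cpuid"
                : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                : "a" (leaf), "c" (0));
    }

    int
    main(void)
    {
            uint32_t a, b, c, d;

            /* Leaf 0: %eax holds the maximum valid basic leaf. */
            cpuid(0x0, &a, &b, &c, &d);
            (void) printf("max basic leaf: 0x%x\n", a);

            /* Leaf 0x80000000: %eax holds the maximum valid extended leaf. */
            cpuid(0x80000000, &a, &b, &c, &d);
            (void) printf("max extended leaf: 0x%x\n", a);
            return (0);
    }
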
113 113 *
114 114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 120 * an invalid extended leaf will return the information for leaf 3.
121 121 *
122 122 * Some leaves are broken down into sub-leaves. This means that the value
123 123 * depends on both the leaf asked for in %eax and a secondary register. For
124 124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 125 * additional information. Or when getting topology information in leaf 0xb, the
126 126 * initial value in %ecx changes which level of the topology that you are
127 127 * getting information about.
128 128 *
129 129 * cpuid values are always kept to 32 bits regardless of whether or not the
130 130 * program is in 64-bit mode. When executing in 64-bit mode, the upper
131 131 * 32 bits of the register are always set to zero so that way the values are the
132 132 * same regardless of execution mode.
133 133 *
134 134 * ----------------------
135 135 * Identifying Processors
136 136 * ----------------------
137 137 *
138 138 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 139 * 0. Leaf 0 contains the processor's vendor information. This is done by
140 140 * putting a 12 character string in %ebx, %edx, and %ecx (in that order).
141 141 * On AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
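
For illustration, a minimal sketch (not from this change) of assembling the vendor string from leaf 0; note the %ebx, %edx, %ecx ordering:

    #include <stdint.h>
    #include <string.h>

    /* Build the 12 character vendor string from leaf 0: %ebx, %edx, %ecx. */
    static void
    vendor_string(char buf[13])
    {
            uint32_t a, b, c, d;

            __asm__ __volatile__("cpuid"
                : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                : "a" (0), "c" (0));
            (void) memcpy(buf + 0, &b, 4);
            (void) memcpy(buf + 4, &d, 4);
            (void) memcpy(buf + 8, &c, 4);
            buf[12] = '\0';         /* e.g. "GenuineIntel" or "AuthenticAMD" */
    }
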
142 142 *
143 143 * From there, a processor is identified by a combination of three different
144 144 * values:
145 145 *
146 146 * 1. Family
147 147 * 2. Model
148 148 * 3. Stepping
149 149 *
150 150 * Each vendor uses the family and model to uniquely identify a processor. The
151 151 * way that family and model are changed depends on the vendor. For example,
152 152 * Intel has been using family 0x6 for almost all of their processors since the
153 153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 154 * identify the exact processor. Different models are often used for the client
155 155 * (consumer) and server parts. Even though each processor often has major
156 156 * architectural differences, they still are considered the same family by
157 157 * Intel.
158 158 *
159 159 * On the other hand, each major AMD architecture generally has its own family.
160 160 * For example, the K8 is family 0xf, Bulldozer is 0x15, and Zen is 0x17.
161 161 * Within a family, the model number is used to help identify specific processors.
162 162 *
163 163 * The stepping is used to refer to a revision of a specific microprocessor. The
164 164 * term comes from equipment used to produce masks that are used to create
165 165 * integrated circuits.
166 166 *
167 167 * The information is present in leaf 1, %eax. In technical documentation you
168 168 * will see the terms extended model and extended family. The original family,
169 169 * model, and stepping fields were each 4 bits wide. If the base family field
170 170 * is 0xf, then one is to consult the extended family and extended model,
171 171 * which take previously reserved bits: the extended family is added to the
172 172 * base family and the extended model forms the upper bits of the model.
173 173 *
174 174 * When we process this information, we store the full family, model, and
175 175 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176 176 * cpi_step, respectively. Whenever you are performing comparisons with the
177 177 * family, model, and stepping, you should use these members and not the raw
178 178 * values from cpuid. If you must use the raw values from cpuid directly, you
179 179 * must make sure that you add the extended model and family to the base model
180 180 * and family.
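
To make the combination rule concrete, here is a hedged sketch of deriving the display family, model, and stepping from leaf 1 %eax; the kernel stores the results in cpi_family, cpi_model, and cpi_step, and the exact conditions for applying the extended model differ slightly between vendors:

    #include <stdint.h>

    /*
     * Decode leaf 1 %eax: stepping [3:0], model [7:4], family [11:8],
     * extended model [19:16], extended family [27:20].
     */
    static void
    decode_fms(uint32_t eax, uint32_t *family, uint32_t *model, uint32_t *step)
    {
            uint32_t fam = (eax >> 8) & 0xf;
            uint32_t mod = (eax >> 4) & 0xf;
            uint32_t extfam = (eax >> 20) & 0xff;
            uint32_t extmod = (eax >> 16) & 0xf;

            *step = eax & 0xf;

            if (fam == 0xf)
                    fam += extfam;          /* extended family is additive */

            /*
             * The extended model supplies the upper four bits of the model.
             * Intel applies it for families 0x6 and 0xf; AMD applies it
             * whenever the base family is 0xf.
             */
            if (fam >= 0xf || fam == 0x6)
                    mod |= extmod << 4;

            *family = fam;
            *model = mod;
    }
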
181 181 *
182 182 * In general, we do not use information about the family, model, and stepping
183 183 * to determine whether or not a feature is present; that is generally driven by
184 184 * specific leaves. However, when something we care about on the processor is
185 185 * not considered 'architectural' meaning that it is specific to a set of
186 186 * processors and not promised in the architecture model to be consistent from
187 187 * generation to generation, then we will fall back on this information. The
188 188 * most common cases where this comes up are when we have to work around
189 189 * errata in the processor, when we are dealing with processor-specific
190 190 * features such as CPU performance counters, or when we want to provide
191 191 * additional information for things such as fault management.
192 192 *
193 193 * While processors also do have a brand string, which is the name that people
194 194 * are familiar with when buying the processor, they are not meant for
195 195 * programmatic consumption. That is what the family, model, and stepping are
196 196 * for.
197 197 *
198 198 * ------------
199 199 * CPUID Passes
200 200 * ------------
201 201 *
202 202 * As part of performing feature detection, we break this into several different
203 203 * passes. The passes are as follows:
204 204 *
205 205 * Pass 0 This is a primordial pass done in locore.s to deal with
206 206 * Cyrix CPUs that don't support cpuid. The reality is that
207 207 * we likely don't run on them any more, but there is still
208 208 * logic for handling them.
209 209 *
210 210 * Pass 1 This is the primary pass and is responsible for doing a
211 211 * large number of different things:
212 212 *
213 213 * 1. Determining which vendor manufactured the CPU and
214 214 * determining the family, model, and stepping information.
215 215 *
216 216 * 2. Gathering a large number of feature flags to
217 217 * determine which features the CPU supports and which
218 218 * indicate things that we need to do other work in the OS
219 219 * to enable. Features detected this way are added to the
220 220 * x86_featureset which can be queried to
221 221 * determine what we should do. This includes processing
222 222 * all of the basic and extended CPU features that we care
223 223 * about.
224 224 *
225 225 * 3. Determining the CPU's topology. This includes
226 226 * information about how many cores and threads are present
227 227 * in the package. It also is responsible for figuring out
228 228 * which logical CPUs are potentially part of the same core
229 229 * and what other resources they might share. For more
230 230 * information see the 'Topology' section.
231 231 *
232 232 * 4. Determining the set of CPU security-specific features
233 233 * that we need to worry about and determine the
234 234 * appropriate set of workarounds.
235 235 *
236 236 * Pass 1 on the boot CPU occurs before KMDB is started.
237 237 *
238 238 * Pass 2 The second pass is done after startup(). Here, we check
239 239 * other miscellaneous features. Most of this is gathering
240 240 * additional basic and extended features that we'll use in
241 241 * later passes or for debugging support.
242 242 *
243 243 * Pass 3 The third pass occurs after the kernel memory allocator
244 244 * has been fully initialized. This gathers information
245 245 * where we might need dynamic memory available for our
246 246 * uses. This includes several varying width leaves that
247 247 * have cache information and the processor's brand string.
248 248 *
249 249 * Pass 4 The fourth and final normal pass is performed after the
250 250 * kernel has brought most everything online. This is
251 251 * invoked from post_startup(). In this pass, we go through
252 252 * the set of features that we have enabled and turn that
253 253 * into the hardware auxiliary vector features that
254 254 * userland receives. This is used by userland, primarily
255 255 * by the run-time link-editor (RTLD), though userland
256 256 * software could also refer to it directly.
257 257 *
258 258 * Microcode After a microcode update, we do a selective rescan of
259 259 * the cpuid leaves to determine what features have
260 260 * changed. Microcode updates can provide more details
261 261 * about security related features to deal with issues like
262 262 * Spectre and L1TF. On occasion, vendors have violated
263 263 * their contract and removed bits. However, we don't try
264 264 * to detect that because that puts us in a situation that
265 265 * we really can't deal with. As such, the only things we
266 266 * rescan today are security related features. See
267 267 * cpuid_pass_ucode().
268 268 *
269 269 * All of the passes (except pass 0) are run on all CPUs. However, for the most
270 270 * part we only care about what the boot CPU says about this information and use
271 271 * the other CPUs as a rough guide to sanity check that we have the same feature
272 272 * set.
273 273 *
274 274 * We do not support running multiple logical CPUs with different, let alone
275 275 * disjoint, feature sets.
276 276 *
277 277 * ------------------
278 278 * Processor Topology
279 279 * ------------------
280 280 *
281 281 * One of the important things that we need to do is to understand the topology
282 282 * of the underlying processor. When we say topology in this case, we're trying
283 283 * to understand the relationship between the logical CPUs that the operating
284 284 * system sees and the underlying physical layout. Different logical CPUs may
285 285 * share different resources which can have important consequences for the
286 286 * performance of the system. For example, they may share caches, execution
287 287 * units, and more.
288 288 *
289 289 * The topology of the processor changes from generation to generation and
290 290 * vendor to vendor. Along with that, different vendors use different
291 291 * terminology, and the operating system itself uses occasionally overlapping
292 292 * terminology. It's important to understand what this topology looks like so
293 293 * one can understand the different things that we try to calculate and
294 294 * determine.
295 295 *
296 296 * To get started, let's talk about a little bit of terminology that we've used
297 297 * so far, is used throughout this file, and is fairly generic across multiple
298 298 * vendors:
299 299 *
300 300 * CPU
301 301 * A central processing unit (CPU) refers to a logical and/or virtual
302 302 * entity that the operating system can execute instructions on. The
303 303 * underlying resources for this CPU may be shared between multiple
304 304 * entities; however, to the operating system it is a discrete unit.
305 305 *
306 306 * PROCESSOR and PACKAGE
307 307 *
308 308 * Generally, when we use the term 'processor' on its own, we are referring
309 309 * to the physical entity that one buys and plugs into a board. However,
310 310 * because processor has been overloaded and one might see it used to mean
311 311 * multiple different levels, we will instead use the term 'package' for
312 312 * the rest of this file. The term package comes from the electrical
313 313 * engineering side and refers to the physical entity that encloses the
314 314 * electronics inside. Strictly speaking the package can contain more than
315 315 * just the CPU, for example, on many processors it may also have what's
316 316 * called an 'integrated graphical processing unit (GPU)'. Because the
317 317 * package can encapsulate multiple units, it is the largest physical unit
318 318 * that we refer to.
319 319 *
320 320 * SOCKET
321 321 *
322 322 * A socket refers to a unit on a system board (generally the motherboard)
323 323 * that can receive a package. A single package, or processor, is plugged
324 324 * into a single socket. A system may have multiple sockets. Often times,
325 325 * the term socket is used interchangeably with package and refers to the
326 326 * electrical component that has been plugged in, and not the receptacle itself.
327 327 *
328 328 * CORE
329 329 *
330 330 * A core refers to the physical instantiation of a CPU, generally, with a
331 331 * full set of hardware resources available to it. A package may contain
332 332 * multiple cores inside of it or it may just have a single one. A
333 333 * processor with more than one core is often referred to as 'multi-core'.
334 334 * In illumos, we will use the feature X86FSET_CMP to refer to a system
335 335 * that has 'multi-core' processors.
336 336 *
337 337 * A core may expose a single logical CPU to the operating system, or it
338 338 * may expose multiple CPUs, which we call threads, defined below.
339 339 *
340 340 * Some resources may still be shared by cores in the same package. For
341 341 * example, many processors will share the level 3 cache between cores.
342 342 * Some AMD generations share hardware resources between cores. For more
343 343 * information on that see the section 'AMD Topology'.
344 344 *
345 345 * THREAD and STRAND
346 346 *
347 347 * In this file, generally a thread refers to a hardware resource and not
348 348 * the operating system's logical abstraction. A thread is always exposed
349 349 * as an independent logical CPU to the operating system. A thread belongs
350 350 * to a specific core. A core may have more than one thread. When that is
351 351 * the case, the threads that are part of the same core are often referred
352 352 * to as 'siblings'.
353 353 *
354 354 * When multiple threads exist, this is generally referred to as
355 355 * simultaneous multi-threading (SMT). When Intel introduced this in their
356 356 * processors they called it hyper-threading (HT). When multiple threads
357 357 * are active in a core, they split the resources of the core. For example,
358 358 * two threads may share the same set of hardware execution units.
359 359 *
360 360 * The operating system often uses the term 'strand' to refer to a thread.
361 361 * This helps disambiguate it from the software concept.
362 362 *
363 363 * CHIP
364 364 *
365 365 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
366 366 * base meaning, it is used to refer to a single integrated circuit, which
367 367 * may or may not be the only thing in the package. In illumos, when you
368 368 * see the term 'chip' it is almost always referring to the same thing as
369 369 * the 'package'. However, many vendors may use chip to refer to one of
370 370 * many integrated circuits that have been placed in the package. As an
371 371 * example, see the subsequent definition.
372 372 *
373 373 * To try and keep things consistent, we will only use chip when referring
374 374 * to the entire integrated circuit package, with the exception of the
375 375 * definition of multi-chip module (because it is in the name) and use the
376 376 * term 'die' when we want the more general, potential sub-component
377 377 * definition.
378 378 *
379 379 * DIE
380 380 *
381 381 * A die refers to an integrated circuit. Inside of the package there may
382 382 * be a single die or multiple dies. This is sometimes called a 'chip' in
383 383 * vendor's parlance, but in this file, we use the term die to refer to a
384 384 * subcomponent.
385 385 *
386 386 * MULTI-CHIP MODULE
387 387 *
388 388 * A multi-chip module (MCM) refers to putting multiple distinct chips that
389 389 * are connected together in the same package. When a multi-chip design is
390 390 * used, generally each chip is manufactured independently and then joined
391 391 * together in the package. For example, on AMD's Zen microarchitecture
392 392 * (family 0x17), the package contains several dies (the second meaning of
393 393 * chip from above) that are connected together.
394 394 *
395 395 * CACHE
396 396 *
397 397 * A cache is a part of the processor that maintains copies of recently
398 398 * accessed memory. Caches are split into levels and then into types.
399 399 * Commonly there are one to three levels, called level one, two, and
400 400 * three. The lower the level, the smaller it is, the closer it is to the
401 401 * execution units of the CPU, and the faster it is to access. The layout
402 402 * and design of the cache come in many different flavors, consult other
403 403 * resources for a discussion of those.
404 404 *
405 405 * Caches are generally split into two types, the instruction and data
406 406 * cache. The caches contain what their names suggest, the instruction
407 407 * cache has executable program text, while the data cache has all other
408 408 * memory that the processor accesses. As of this writing, data is kept
409 409 * coherent between all of the caches on x86, so if one modifies program
410 410 * text before it is executed, that will be in the data cache, and the
411 411 * instruction cache will be synchronized with that change when the
412 412 * processor actually executes those instructions. This coherency also
413 413 * covers the fact that data could show up in multiple caches.
414 414 *
415 415 * Generally, the lowest level caches are specific to a core. However, the
416 416 * last level cache is shared between some number of cores. The number of
417 417 * CPUs sharing this last level cache is important. This has implications
418 418 * for the choices that the scheduler makes, as accessing memory that might
419 419 * be in a remote cache after thread migration can be quite expensive.
420 420 *
421 421 * Sometimes, the word cache is abbreviated with a '$', because in US
422 422 * English the word cache is pronounced the same as cash. So L1D$ refers to
423 423 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
424 424 * in the rest of this theory statement for clarity.
425 425 *
426 426 * MEMORY CONTROLLER
427 427 *
428 428 * The memory controller is a component that provides access to DRAM. Each
429 429 * memory controller can access a set number of DRAM channels. Each channel
430 430 * can have a number of DIMMs (sticks of memory) associated with it. A
431 431 * given package may have more than one memory controller. The association
432 432 * of the memory controller to a group of cores is important as it is
433 433 * cheaper to access memory on the controller that you are associated with.
434 434 *
435 435 * NUMA
436 436 *
437 437 * NUMA or non-uniform memory access, describes a way that systems are
438 438 * built. On x86, any processor core can address all of the memory in the
439 439 * system. However, when using multiple sockets or possibly within a
440 440 * multi-chip module, some of that memory is physically closer and some of
441 441 * it is further. Memory that is further away is more expensive to access.
442 442 * Consider the following image of multiple sockets with memory:
443 443 *
444 444 * +--------+ +--------+
445 445 * | DIMM A | +----------+ +----------+ | DIMM D |
446 446 * +--------+-+ | | | | +-+------+-+
447 447 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
448 448 * +--------+-+ | | | | +-+------+-+
449 449 * | DIMM C | +----------+ +----------+ | DIMM F |
450 450 * +--------+ +--------+
451 451 *
452 452 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
453 453 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
454 454 * access DIMMs A-C and more expensive to access D-F as it has to go
455 455 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
456 456 * D-F are cheaper than A-C. While the socket form is the most common, when
457 457 * using multi-chip modules, this can also sometimes occur. For another
458 458 * example of this that's more involved, see the AMD topology section.
459 459 *
460 460 *
461 461 * Intel Topology
462 462 * --------------
463 463 *
464 464 * Most Intel processors since Nehalem (as of this writing the current gen
465 465 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
466 466 * the package is a single monolithic die. MCMs currently aren't used. Most
467 467 * parts have three levels of caches, with the L3 cache being shared between
468 468 * all of the cores on the package. The L1/L2 cache is generally specific to
469 469 * an individual core. The following image shows at a simplified level what
470 470 * this looks like. The memory controller is commonly part of something called
471 471 * the 'Uncore', which used to be a separate physical chip that was not a
472 472 * part of the package, but is now part of the same chip.
473 473 *
474 474 * +-----------------------------------------------------------------------+
475 475 * | Package |
476 476 * | +-------------------+ +-------------------+ +-------------------+ |
477 477 * | | Core | | Core | | Core | |
478 478 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
479 479 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
480 480 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
481 481 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
482 482 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
483 483 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
484 484 * | | +--------------+ | | +--------------+ | | +--------------+ | |
485 485 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
486 486 * | | +--------------+ | | +--------------+ | | +--------------+ | |
487 487 * | +-------------------+ +-------------------+ +-------------------+ |
488 488 * | +-------------------------------------------------------------------+ |
489 489 * | | Shared L3 Cache | |
490 490 * | +-------------------------------------------------------------------+ |
491 491 * | +-------------------------------------------------------------------+ |
492 492 * | | Memory Controller | |
493 493 * | +-------------------------------------------------------------------+ |
494 494 * +-----------------------------------------------------------------------+
495 495 *
496 496 * A side effect of this current architecture is that what we care about from a
497 497 * scheduling and topology perspective, is simplified. In general we care about
498 498 * understanding which logical CPUs are part of the same core and socket.
499 499 *
500 500 * To determine the relationship between threads and cores, Intel initially used
501 501 * the identifier in the advanced programmable interrupt controller (APIC). They
502 502 * also added cpuid leaf 4 to give additional information about the number of
503 503 * threads and CPUs in the processor. With the addition of x2apic (which
504 504 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
505 505 * additional cpuid topology leaf 0xB was added.
506 506 *
507 507 * AMD Topology
508 508 * ------------
509 509 *
510 510 * When discussing AMD topology, we want to break this into three distinct
511 511 * generations of topology. There's the basic topology that has been used in
512 512 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
513 513 * with family 0x15 (Bulldozer), and there's the topology that was introduced
514 514 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
515 515 * talking about.
516 516 *
517 517 * Until the introduction of family 0x17 (Zen), AMD did not implement something
518 518 * that they considered SMT. Whether or not the AMD processors have SMT
519 519 * influences many things including scheduling and reliability, availability,
520 520 * and serviceability (RAS) features.
521 521 *
522 522 * NODE
523 523 *
524 524 * AMD uses the term node to refer to a die that contains a number of cores
525 525 * and I/O resources. Depending on the processor family and model, more
526 526 * than one node can be present in the package. When there is more than one
527 527 * node this indicates a multi-chip module. Usually each node has its own
528 528 * access to memory and I/O devices. This is important and generally
529 529 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
530 530 * result, we track this relationship in the operating system.
531 531 *
532 532 * In processors with an L3 cache, the L3 cache is generally shared across
533 533 * the entire node, though the way this is carved up varies from generation
534 534 * to generation.
535 535 *
536 536 * BULLDOZER
537 537 *
538 538 * Starting with the Bulldozer family (0x15) and continuing until the
539 539 * introduction of the Zen microarchitecture, AMD introduced the idea of a
540 540 * compute unit. In a compute unit, two traditional cores share a number of
541 541 * hardware resources. Critically, they share the FPU, L1 instruction
542 542 * cache, and the L2 cache. Several compute units were then combined inside
543 543 * of a single node. Because the integer execution units, L1 data cache,
544 544 * and some other resources were not shared between the cores, AMD never
545 545 * considered this to be SMT.
546 546 *
547 547 * ZEN
548 548 *
549 549 * The Zen family (0x17) uses a multi-chip module (MCM) design; the module
550 550 * is called Zeppelin. These modules are similar to the idea of nodes used
551 551 * previously. Each of these nodes has two DRAM channels which all of the
552 552 * cores in the node can access uniformly. These nodes are linked together
553 553 * in the package, creating a NUMA environment.
554 554 *
555 555 * The Zeppelin die itself contains two different 'core complexes'. Each
556 556 * core complex consists of four cores which each have two threads, for a
557 557 * total of 8 logical CPUs per complex. Unlike other generations,
558 558 * where all the logical CPUs in a given node share the L3 cache, here each
559 559 * core complex has its own shared L3 cache.
560 560 *
561 561 * A further thing that we need to consider is that in some configurations,
562 562 * particularly with the Threadripper line of processors, not every die
563 563 * actually has its memory controllers wired up to actual memory channels.
564 564 * This means that some cores have memory attached to them and others
565 565 * don't.
566 566 *
567 567 * To put Zen in perspective, consider the following images:
568 568 *
569 569 * +--------------------------------------------------------+
570 570 * | Core Complex |
571 571 * | +-------------------+ +-------------------+ +---+ |
572 572 * | | Core +----+ | | Core +----+ | | | |
573 573 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
574 574 * | | | Thread | +----+ | | | Thread | +----+ | | | |
575 575 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
576 576 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
577 577 * | | +--------+ +--+ | | +--------+ +--+ | | | |
578 578 * | +-------------------+ +-------------------+ | C | |
579 579 * | +-------------------+ +-------------------+ | a | |
580 580 * | | Core +----+ | | Core +----+ | | c | |
581 581 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
582 582 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
583 583 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
584 584 * | | | Thread | |L1| | | | Thread | |L1| | | | |
585 585 * | | +--------+ +--+ | | +--------+ +--+ | | | |
586 586 * | +-------------------+ +-------------------+ +---+ |
587 587 * | |
588 588 * +--------------------------------------------------------+
589 589 *
590 590 * This first image represents a single Zen core complex that consists of four
591 591 * cores.
592 592 *
593 593 *
594 594 * +--------------------------------------------------------+
595 595 * | Zeppelin Die |
596 596 * | +--------------------------------------------------+ |
597 597 * | | I/O Units (PCIe, SATA, USB, etc.) | |
598 598 * | +--------------------------------------------------+ |
599 599 * | HH |
600 600 * | +-----------+ HH +-----------+ |
601 601 * | | | HH | | |
602 602 * | | Core |==========| Core | |
603 603 * | | Complex |==========| Complex | |
604 604 * | | | HH | | |
605 605 * | +-----------+ HH +-----------+ |
606 606 * | HH |
607 607 * | +--------------------------------------------------+ |
608 608 * | | Memory Controller | |
609 609 * | +--------------------------------------------------+ |
610 610 * | |
611 611 * +--------------------------------------------------------+
612 612 *
613 613 * This image represents a single Zeppelin Die. Note how both core complexes are
614 614 * connected to the same memory controller and I/O units. While each core
615 615 * complex has its own L3 cache as seen in the first image, they both have
616 616 * uniform access to memory.
617 617 *
618 618 *
619 619 * PP PP
620 620 * PP PP
621 621 * +----------PP---------------------PP---------+
622 622 * | PP PP |
623 623 * | +-----------+ +-----------+ |
624 624 * | | | | | |
625 625 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
626 626 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
627 627 * | | | | | |
628 628 * | +-----------+ooo ...+-----------+ |
629 629 * | HH ooo ... HH |
630 630 * | HH oo.. HH |
631 631 * | HH ..oo HH |
632 632 * | HH ... ooo HH |
633 633 * | +-----------+... ooo+-----------+ |
634 634 * | | | | | |
635 635 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
636 636 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
637 637 * | | | | | |
638 638 * | +-----------+ +-----------+ |
639 639 * | PP PP |
640 640 * +----------PP---------------------PP---------+
641 641 * PP PP
642 642 * PP PP
643 643 *
644 644 * This image represents a single Zen package. In this example, it has four
645 645 * Zeppelin dies, though some configurations only have a single one. In this
646 646 * example, each die is directly connected to the next. Also, each die is
647 647 * represented as being connected to memory by the 'M' character and connected
648 648 * to PCIe devices and other I/O by the 'P' character. Because each Zeppelin
649 649 * die is made up of two core complexes, we have multiple different NUMA
650 650 * domains that we care about for these systems.
651 651 *
652 652 * CPUID LEAVES
653 653 *
654 654 * There are a few different CPUID leaves that we can use to try and understand
655 655 * the actual state of the world. As part of the introduction of family 0xf, AMD
656 656 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
657 657 * processors that are in the system. Because families before Zen didn't have
658 658 * SMT, this was always the number of cores that were in the system. However, it
659 659 * should always be thought of as the number of logical threads to be consistent
660 660 * between generations. In addition we also get the size of the APIC ID that is
661 661 * used to represent the number of logical processors. This is important for
662 662 * deriving topology information.
663 663 *
664 664 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
665 665 * bit between Bulldozer and later families, but it is quite useful in
666 666 * determining the topology information. Because this information has changed
667 667 * across family generations, it's worth calling out what these mean
668 668 * explicitly. The registers have the following meanings:
669 669 *
670 670 * %eax The APIC ID. The entire register is defined to have a 32-bit
671 671 * APIC ID, even though on systems without x2apic support, it will
672 672 * be limited to 8 bits.
673 673 *
674 674 * %ebx On Bulldozer-era systems this contains information about the
675 675 * number of cores that are in a compute unit (cores that share
676 676 * resources). It also contains a per-package compute unit ID that
677 677 * identifies which compute unit the logical CPU is a part of.
678 678 *
679 679 * On Zen-era systems this instead contains the number of threads
680 680 * per core and the ID of the core that the logical CPU is a part
681 681 * of. Note that this ID is unique only to the package; it is not
682 682 * globally unique across the entire system.
683 683 *
684 684 * %ecx This contains the number of nodes that exist in the package. It
685 685 * also contains an ID that identifies which node the logical CPU
686 686 * is a part of.
687 687 *
688 688 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
689 689 * cache layout to determine which logical CPUs are sharing which caches.
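
As an illustration of the register layout described above, here is a hedged sketch of decoding leaf 0x8000001E on a Zen-era part. The bit positions follow my reading of AMD's documentation and should be checked against the relevant AMD PPR before being relied upon:

    #include <stdint.h>

    struct amd_topo {
            uint32_t apic_id;               /* %eax: full 32-bit APIC ID */
            uint32_t threads_per_core;      /* %ebx[15:8] + 1 on Zen parts */
            uint32_t core_id;               /* %ebx[7:0]; unique per package */
            uint32_t nodes_per_pkg;         /* %ecx[10:8] + 1 */
            uint32_t node_id;               /* %ecx[7:0] */
    };

    static void
    amd_topo_fill(struct amd_topo *t)
    {
            uint32_t a, b, c, d;

            __asm__ __volatile__("cpuid"
                : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                : "a" (0x8000001e), "c" (0));
            t->apic_id = a;
            t->threads_per_core = ((b >> 8) & 0xff) + 1;
            t->core_id = b & 0xff;
            t->nodes_per_pkg = ((c >> 8) & 0x7) + 1;
            t->node_id = c & 0xff;
    }
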
690 690 *
691 691 * illumos Topology
692 692 * ----------------
693 693 *
694 694 * Based on the above we synthesize the information into several different
695 695 * variables that we store in the 'struct cpuid_info'. We'll go into the details
696 696 * of what each member is supposed to represent and their uniqueness. In
697 697 * general, there are two levels of uniqueness that we care about. We care about
698 698 * an ID that is globally unique. That means that it will be unique across all
699 699 * entities in the system. For example, the default logical CPU ID is globally
700 700 * unique. On the other hand, there is some information that we only care about
701 701 * being unique within the context of a single package / socket. Here are the
702 702 * variables that we keep track of and their meaning.
703 703 *
704 704 * Several of the values that represent an identifier, with the exception
705 705 * of cpi_apicid, are allowed to be synthetic.
706 706 *
707 707 *
708 708 * cpi_apicid
709 709 *
710 710 * This is the value of the CPU's APIC id. This should be the full 32-bit
711 711 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
712 712 * APIC ID. This value is globally unique between all logical CPUs across
713 713 * all packages. This is usually required by the APIC.
714 714 *
715 715 * cpi_chipid
716 716 *
717 717 * This value indicates the ID of the package that the logical CPU is a
718 718 * part of. This value is allowed to be synthetic. It is usually derived by
719 719 * taking the CPU's APIC ID and determining how many bits are used to
720 720 * represent CPU cores in the package. All logical CPUs that are part of
721 721 * the same package must have the same value.
722 722 *
723 723 * cpi_coreid
724 724 *
725 725 * This represents the ID of a CPU core. Two logical CPUs should only have
726 726 * the same cpi_coreid value if they are part of the same core. These
727 727 * values may be synthetic. On systems that support SMT, this value is
728 728 * usually derived from the APIC ID, otherwise it is often synthetic and
729 729 * just set to the value of the cpu_id in the cpu_t.
730 730 *
731 731 * cpi_pkgcoreid
732 732 *
733 733 * This is similar to the cpi_coreid in that logical CPUs that are part of
734 734 * the same core should have the same ID. The main difference is that these
735 735 * values are only required to be unique to a given socket.
736 736 *
737 737 * cpi_clogid
738 738 *
739 739 * This represents the logical ID of a logical CPU. This value should be
740 740 * unique within a given socket for each logical CPU. This is allowed to be
741 741 * synthetic, though it is usually based off of the CPU's APIC ID. The
742 742 * broader system expects that logical CPUs that are part of the same core
743 743 * have contiguous numbers. For example, if there were two threads per
744 744 * core, then the logical IDs of the two siblings divided by two should be
745 745 * the same, and the first modulo two should be zero while the second
746 746 * should be one. That is, IDs 4 and 5 indicate two logical CPUs that are
747 747 * part of the same core, while IDs 5 and 6 are part of different cores.
748 748 *
749 749 * While it is common for the cpi_coreid and the cpi_clogid to be derived
750 750 * from the same source, strictly speaking, they don't have to be and the
751 751 * two values should be considered logically independent. One should not
752 752 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
753 753 * some kind of relationship. While this is tempting, we've seen cases on
754 754 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
755 755 *
756 756 * cpi_ncpu_per_chip
757 757 *
758 758 * This value indicates the total number of logical CPUs that exist in the
759 759 * physical package. Critically, this is not the number of logical CPUs
760 760 * that exist for just the single core.
761 761 *
762 762 * This value should be the same for all logical CPUs in the same package.
763 763 *
764 764 * cpi_ncore_per_chip
765 765 *
766 766 * This value indicates the total number of physical CPU cores that exist
767 767 * in the package. The system compares this value with cpi_ncpu_per_chip to
768 768 * determine if simultaneous multi-threading (SMT) is enabled. When
769 769 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
770 770 * the X86FSET_HTT feature is not set. If this value is greater than one,
771 771 * then we consider the processor to have the feature X86FSET_CMP, to
772 772 * indicate that there is support for more than one core.
773 773 *
774 774 * This value should be the same for all logical CPUs in the same package.
775 775 *
776 776 * cpi_procnodes_per_pkg
777 777 *
778 778 * This value indicates the number of 'nodes' that exist in the package.
779 779 * When the processor is actually a multi-chip module, this represents the
780 780 * number of such modules that exist in the package. Currently, on Intel
781 781 * based systems this member is always set to 1.
782 782 *
783 783 * This value should be the same for all logical CPUs in the same package.
784 784 *
785 785 * cpi_procnodeid
786 786 *
787 787 * This value indicates the ID of the node that the logical CPU is a part
788 788 * of. All logical CPUs that are in the same node must have the same value
789 789 * here. This value must be unique across all of the packages in the
790 790 * system. On Intel based systems, this is currently set to the value in
791 791 * cpi_chipid because there is only one node.
792 792 *
793 793 * cpi_cores_per_compunit
794 794 *
795 795 * This value indicates the number of cores that are part of a compute
796 796 * unit. See the AMD topology section for this. This member only has real
797 797 * meaning currently for AMD Bulldozer family processors. For all other
798 798 * processors, this should currently be set to 1.
799 799 *
800 800 * cpi_compunitid
801 801 *
802 802 * This indicates the compute unit that the logical CPU belongs to. For
803 803 * processors without AMD Bulldozer-style compute units this should be set
804 804 * to the value of cpi_coreid.
805 805 *
806 806 * cpi_ncpu_shr_last_cache
807 807 *
808 808 * This indicates the number of logical CPUs that are sharing the same last
809 809 * level cache. This value should be the same for all CPUs that are sharing
810 810 * that cache. The last cache refers to the cache that is closest to memory
811 811 * and furthest away from the CPU.
812 812 *
813 813 * cpi_last_lvl_cacheid
814 814 *
815 815 * This indicates the ID of the last cache that the logical CPU uses. This
816 816 * cache is often shared between multiple logical CPUs and is the cache
817 817 * that is closest to memory and furthest away from the CPU. This value
818 818 * should be the same for a group of logical CPUs only if they actually
819 819 * share the same last level cache. IDs should not overlap between
820 820 * packages.
821 821 *
822 822 * cpi_ncore_bits
823 823 *
824 824 * This indicates the number of bits that are required to represent all of
825 825 * the cores in the system. As core IDs are derived from APIC IDs,
826 826 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
827 827 * this value to be larger than the actual number of IDs that are present
828 828 * in the system. This is used to size tables by the CMI framework. It is
829 829 * only filled in for Intel and AMD CPUs.
830 830 *
831 831 * cpi_nthread_bits
832 832 *
833 833 * This indicates the number of bits required to represent all of the IDs
834 834 * that cover the logical CPUs that exist on a given core. It's OK for this
835 835 * value to be larger than the actual number of IDs that are present in the
836 836 * system. This is used to size tables by the CMI framework. It is
837 837 * only filled in for Intel and AMD CPUs.
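
Drawing on cpi_ncpu_per_chip and cpi_ncore_per_chip above, the SMT/multi-core distinction boils down to a comparison along these lines (a hedged sketch; the real pass 1 logic handles many more cases):

    /*
     * Sketch only: ncpu is the number of logical CPUs in the package and
     * ncore the number of physical cores (cpi_ncpu_per_chip and
     * cpi_ncore_per_chip respectively).
     */
    static void
    classify_package(unsigned int ncpu, unsigned int ncore,
        int *smt, int *multicore)
    {
            /* SMT (X86FSET_HTT-style) exists when threads outnumber cores. */
            *smt = (ncpu > ncore);
            /* More than one core (X86FSET_CMP-style) in the package. */
            *multicore = (ncore > 1);
    }
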
838 838 *
839 839 * -----------
840 840 * Hypervisors
841 841 * -----------
842 842 *
843 843 * If trying to manage the differences between vendors wasn't bad enough, it can
844 844 * get worse thanks to our friend hardware virtualization. Hypervisors are given
845 845 * the ability to interpose on all cpuid instructions and change them to suit
846 846 * their purposes. In general, this is necessary as the hypervisor wants to be
847 847 * able to present a more uniform set of features or not necessarily give the
848 848 * guest operating system kernel knowledge of all features so it can be
849 849 * more easily migrated between systems.
850 850 *
851 851 * When it comes to trying to determine topology information, this can be a
852 852 * double edged sword. When a hypervisor doesn't actually implement a cpuid
853 853 * leaf, it'll often return all zeros. Because of that, you'll often see various
854 854 * checks scattered about that verify fields are non-zero before we assume we can use
855 855 * them.
856 856 *
857 857 * When it comes to topology information, the hypervisor is often incentivized
858 858 * to lie to you about topology. This is because it doesn't always actually
859 859 * guarantee that topology at all. The topology path we take in the system
860 860 * depends on how the CPU advertises itself. If it advertises itself as an Intel
861 861 * or AMD CPU, then we basically do our normal path. However, when they don't
862 862 * advertise an actual vendor, that usually turns into us enumerating multiple
863 863 * one-core CPUs that are often on different sockets.
864 864 * depends greatly on what the hypervisor actually exposes to us.
865 865 *
866 866 * --------------------
867 867 * Exposing Information
868 868 * --------------------
869 869 *
870 870 * We expose CPUID information in three different forms in the system.
871 871 *
872 872 * The first is through the x86_featureset variable. This is used in conjunction
873 873 * with the is_x86_feature() function. This is queried by x86-specific functions
874 874 * to determine which features are or aren't present in the system and to make
875 875 * decisions based upon them. For example, users of this include everything from
876 876 * parts of the system dedicated to reliability, availability, and
877 877 * serviceability (RAS), to making decisions about how to handle security
878 878 * mitigations, to various x86-specific drivers. General purpose or
879 879 * architecture independent drivers should never be calling this function.
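
As a usage illustration, x86-specific kernel code typically gates a code path on a feature roughly as in the following hedged sketch (X86FSET_AVX is just an example feature constant):

    #include <sys/x86_archext.h>

    /*
     * Hedged sketch of how an x86-specific kernel component consults the
     * feature set; the feature tested here is only an example.
     */
    void
    example_enable_fast_path(void)
    {
            if (is_x86_feature(x86_featureset, X86FSET_AVX)) {
                    /* Use the AVX-accelerated implementation. */
            } else {
                    /* Fall back to the baseline implementation. */
            }
    }
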
880 880 *
881 881 * The second means is through the auxiliary vector. The auxiliary vector is a
882 882 * series of tagged data that the kernel passes down to a user program when it
883 883 * begins executing. This information is used to indicate to programs what
884 884 * instruction set extensions are present. For example, information about the
885 885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 886 * since user programs cannot make use of it. However, things like the AVX
887 887 * instruction sets are. Programs use this information to make run-time
888 888 * decisions about what features they should use. As an example, the run-time
889 889 * link-editor (rtld) can relocate different functions depending on the hardware
890 890 * support available.
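
From the userland side, a program can consult these hardware capability bits through getisax(3C) rather than walking the auxiliary vector by hand; a minimal hedged sketch:

    #include <sys/auxv.h>
    #include <sys/auxv_386.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t hwcap[2] = { 0, 0 };

            /* getisax() fills in the AT_SUN_HWCAP/AT_SUN_HWCAP2 words. */
            (void) getisax(hwcap, 2);

            if (hwcap[0] & AV_386_AVX)
                    (void) printf("AVX is available to userland\n");
            return (0);
    }
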
891 891 *
892 892 * The final form is through a series of accessor functions that all have the
893 893 * form cpuid_get*. This is used by a number of different subsystems in the
894 894 * kernel to determine more detailed information about what we're running on,
895 895 * topology information, etc. Some of these subsystems include processor groups
896 896 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 897 * microcode, and performance monitoring. These functions all ASSERT that the
898 898 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 899 * are rearranged, then this needs to be adjusted.
900 + *
901 + * -----------------------------------------------
902 + * Speculative Execution CPU Side Channel Security
903 + * -----------------------------------------------
904 + *
905 + * With the advent of the Spectre and Meltdown attacks which exploit speculative
906 + * execution in the CPU to create side channels there have been a number of
907 + * different attacks and corresponding issues that the operating system needs to
908 + * mitigate against. The following is a common, but not exhaustive, list of
909 + * issues that we know about and for which we have done some work, or still
910 + * need to do more work, in the system to mitigate against:
911 + *
912 + * - Spectre v1
913 + * - Spectre v2
914 + * - Meltdown (Spectre v3)
915 + * - Rogue Register Read (Spectre v3a)
916 + * - Speculative Store Bypass (Spectre v4)
917 + * - ret2spec, SpectreRSB
918 + * - L1 Terminal Fault (L1TF)
919 + * - Microarchitectural Data Sampling (MDS)
920 + *
921 + * Each of these requires different sets of mitigations and has different attack
922 + * surfaces. For the most part, this discussion is about protecting the kernel
923 + * from non-kernel executing environments such as user processes and hardware
924 + * virtual machines. Unfortunately, there are a number of user vs. user
925 + * scenarios that exist with these. The rest of this section will describe the
926 + * overall approach that the system has taken to address these as well as their
927 + * shortcomings. Unfortunately, not all of the above have been handled today.
928 + *
929 + * SPECTRE FAMILY (Spectre v2, ret2spec, SpectreRSB)
930 + *
931 + * The second variant of the spectre attack focuses on performing branch target
932 + * injection. This generally impacts indirect call instructions in the system.
933 + * There are three different ways to mitigate this issue that are commonly
934 + * described today:
935 + *
936 + * 1. Using Indirect Branch Restricted Speculation (IBRS).
937 + * 2. Using Retpolines and RSB Stuffing
938 + * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
939 + *
940 + * IBRS uses a feature added to microcode to restrict speculation, among other
941 + * things. This form of mitigation has not been used as it has been generally
942 + * seen as too expensive and requires reactivation upon various transitions in
943 + * the system.
944 + *
945 + * As a less impactful alternative to IBRS, retpolines were developed by
946 + * Google. These basically require one to replace indirect calls with a specific
947 + * trampoline that will cause speculation to fail and break the attack.
948 + * Retpolines require compiler support. We always build with retpolines in the
949 + * external thunk mode. This means that a traditional indirect call is replaced
950 + * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
951 + * of this is that all indirect function calls are performed through a register.
952 + *
953 + * We have to use a common external location of the thunk and not inline it into
954 + * the callsite so that way we can have a single place to patch these functions.
955 + * As it turns out, we actually have three different forms of retpolines that
956 + * exist in the system:
957 + *
958 + * 1. A full retpoline
959 + * 2. An AMD-specific optimized retpoline
960 + * 3. A no-op version
961 + *
962 + * The first one is used in the general case. The second one is used if we can
963 + * determine that we're on an AMD system and we can successfully toggle the
964 + * lfence serializing MSR that exists on the platform. Basically with this
965 + * present, an lfence is sufficient and we don't need to do anywhere near as
966 + * complicated a dance to successfully use retpolines.
967 + *
968 + * The third form described above is the most curious. It turns out that the way
969 + * that retpolines are implemented is that they rely on how speculation is
970 + * performed on a 'ret' instruction. Intel has continued to optimize this
971 + * process (which is partly why we need to have return stack buffer stuffing,
972 + * but more on that in a bit) and in processors starting with Cascade Lake
973 + * on the server side, it's dangerous to rely on retpolines. Instead, a new
974 + * mechanism has been introduced called Enhanced IBRS (EIBRS).
975 + *
976 + * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
977 + * physical core. However, if this is the case, we don't want to use retpolines
978 + * any more. Therefore if EIBRS is present, we end up turning each retpoline
979 + * function (called a thunk) into a jmp instruction. This means that we're still
980 + * paying the cost of an extra jump to the external thunk, but it gives us
981 + * flexibility and the ability to have a single kernel image that works across a
982 + * wide variety of systems and hardware features.
983 + *
984 + * Unfortunately, this alone is insufficient. First, Skylake systems have
985 + * additional speculation for the Return Stack Buffer (RSB), which is used to
986 + * return from call instructions and which retpolines take advantage of. However,
987 + * this problem is not just limited to Skylake and is actually more pernicious.
988 + * The SpectreRSB paper introduces several more problems that can arise with
989 + * dealing with this. The RSB can be poisoned just like the indirect branch
990 + * predictor. This means that one needs to clear the RSB when transitioning
991 + * between two different privilege domains. Some examples include:
992 + *
993 + * - Switching between two different user processes
994 + * - Going between user land and the kernel
995 + * - Returning to the kernel from a hardware virtual machine
996 + *
997 + * Mitigating this involves combining a couple of different things. The first is
998 + * SMEP (supervisor mode execution protection) which was introduced in Ivy
999 + * Bridge. When an RSB entry refers to a user address and we're executing in the
1000 + * kernel, speculation through it will be stopped when SMEP is enabled. This
1001 + * protects against a number of the different cases that we would normally be
1002 + * worried about such as when we enter the kernel from user land.
1003 + *
1004 + * To protect against additional manipulation of the RSB from other contexts,
1005 + * such as a non-root VMX context attacking the kernel, we first look to enhanced
1006 + * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1007 + * need to do to protect the kernel at this time.
1008 + *
1009 + * On CPUs without EIBRS we need to manually overwrite the contents of the
1010 + * return stack buffer. We do this through the x86_rsb_stuff() function.
1011 + * Currently this is employed on context switch. The x86_rsb_stuff() function is
1012 + * disabled when enhanced IBRS is present because Intel claims on such systems
1013 + * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1014 + * to user attacks via the RSB.
1015 + *
1016 + * If SMEP is not present, then we would have to stuff the RSB every time we
1017 + * transitioned from user mode to the kernel, which isn't very practical right
1018 + * now.
1019 + *
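Since x86_rsb_stuff() is assembly that does not appear in this file, here is an illustrative sketch of the generic stuffing technique (the symbol name and the depth of 32 are assumptions, not taken from the real routine): issue enough calls to overwrite every RSB entry with a harmless landing pad, then discard the return addresses those calls pushed.

	__asm__(
	    ".globl example_rsb_stuff\n"
	    "example_rsb_stuff:\n"
	    "	movl	$32, %ecx\n"
	    "1:	call	3f\n"			/* each call adds one RSB entry */
	    "2:	pause\n"			/* benign speculative landing pad */
	    "	lfence\n"
	    "	jmp	2b\n"
	    "3:	decl	%ecx\n"
	    "	jnz	1b\n"
	    "	addq	$256, %rsp\n"		/* 32 slots * 8 bytes each */
	    "	ret\n");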
1020 + * To fully protect user to user and vmx to vmx attacks from these classes of
1021 + * issues, we would also need to allow them to opt into performing an Indirect
1022 + * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1023 + *
1024 + * By default, the system will enable RSB stuffing and the required variant of
1025 + * retpolines and store that information in the x86_spectrev2_mitigation value.
1026 + * This will be evaluated after a microcode update as well, though it is
1027 + * expected that microcode updates will not take away features. This may mean
1028 + * that a late loaded microcode may not end up in the optimal configuration
1029 + * (though this should be rare).
1030 + *
1031 + * Currently we do not build kmdb with retpolines or perform any additional side
1032 + * channel security mitigations for it. One complication with kmdb is that it
1033 + * requires its own retpoline thunks and it would need to adjust itself based on
1034 + * what the kernel does. The threat model of kmdb is more limited and therefore
1035 + * it may make more sense to investigate using prediction barriers as the whole
1036 + * system is only executing a single instruction at a time while in kmdb.
1037 + *
1038 + * SPECTRE FAMILY (v1, v4)
1039 + *
1040 + * The v1 and v4 variants of spectre are not currently mitigated in the
1041 + * system and require other classes of changes to occur in the code.
1042 + *
1043 + * MELTDOWN
1044 + *
1045 + * Meltdown, or spectre v3, allowed a user process to read any data in their
1046 + * address space regardless of whether or not the page tables in question
1047 + * allowed the user to have the ability to read them. The solution to meltdown
1048 + * is kernel page table isolation. In this world, there are two page tables that
1049 + * are used for a process, one in user land and one in the kernel. To implement
1050 + * this we use per-CPU page tables and switch between the user and kernel
1051 + * variants when entering and exiting the kernel. For more information about
1052 + * this process and how the trampolines work, please see the big theory
1053 + * statements and additional comments in:
1054 + *
1055 + * - uts/i86pc/ml/kpti_trampolines.s
1056 + * - uts/i86pc/vm/hat_i86.c
1057 + *
1058 + * While Meltdown only impacted Intel systems and there are also Intel systems
1059 + * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1060 + * kernel page table isolation enabled. While this may at first seem weird, an
1061 + * important thing to remember is that you can't speculatively read an address
1062 + * if it's never in your page table at all. Having user processes without kernel
1063 + * pages present provides us with an important layer of defense in the kernel
1064 + * against any other side channel attacks that exist and have yet to be
1065 + * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1066 + * default, no matter the x86 system.
1067 + *
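As a conceptual sketch only (the field names here are invented; the real machinery is in the files referenced above), KPTI amounts to each CPU tracking two top-level page tables while the entry and exit trampolines switch %cr3 between them:

	struct example_kpti_frame {
		uint64_t	kf_user_cr3;	/* most of the kernel unmapped */
		uint64_t	kf_kernel_cr3;	/* full kernel mappings */
	};

	/* kernel entry trampoline: reload %cr3 from kf_kernel_cr3 */
	/* return to user land:     reload %cr3 from kf_user_cr3  */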
1068 + * L1 TERMINAL FAULT
1069 + *
1070 + * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1071 + * execution uses page table entries. Effectively, it is two different problems.
1072 + * The first is that it ignores the not present bit in the page table entries
1073 + * when performing speculative execution. This means that something can
1074 + * speculatively read the listed physical address if it's present in the L1
1075 + * cache under certain conditions (see Intel's documentation for the full set of
1076 + * conditions). Secondly, this can be used to bypass hardware virtualization
1077 + * extended page tables (EPT) that are part of Intel's hardware virtual machine
1078 + * instructions.
1079 + *
1080 + * For the non-hardware virtualized case, this is relatively easy to deal with.
1081 + * We must make sure that all unmapped pages have an address of zero. This means
1082 + * that they could read the first 4k of physical memory; however, we never use
1083 + * that first page in the operating system and always skip putting it in our
1084 + * memory map, even if firmware tells us we can use it in our memory map. While
1085 + * other systems try to put extra metadata in the address and reserved bits,
1086 + * which led to this being problematic in those cases, we do not.
1087 + *
1088 + * For hardware virtual machines things are more complicated. Because they can
1089 + * construct their own page tables, it isn't hard for them to perform this
1090 + * attack against any physical address. The one wrinkle is that this physical
1091 + * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1092 + * to flush the L1 data cache. We wrap this up in the function
1093 + * spec_uarch_flush(). This function is also used in the mitigation of
1094 + * microarchitectural data sampling (MDS) discussed later on. Kernel based
1095 + * hypervisors such as KVM or bhyve are responsible for performing this before
1096 + * entering the guest.
1097 + *
1098 + * Because this attack takes place in the L1 cache, there's another wrinkle
1099 + * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1100 + * designs. This means that when a thread enters a hardware virtualized context
1101 + * and flushes the L1 data cache, the other thread on the processor may then go
1102 + * ahead and put new data in it that can be potentially attacked. While one
1103 + * solution is to disable SMT on the system, another option that is available is
1104 + * to use a feature for hardware virtualization called 'SMT exclusion'. This
1105 + * goes through and makes sure that if a HVM is being scheduled on one thread,
1106 + * then the thing on the other thread is from the same hardware virtual machine.
1107 + * If an interrupt comes in or the guest exits to the broader system, then the
1108 + * other SMT thread will be kicked out.
1109 + *
1110 + * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1111 + * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1112 + * perform L1TF related mitigations.
1113 + *
1114 + * MICROARCHITECTURAL DATA SAMPLING
1115 + *
1116 + * Microarchitectural data sampling (MDS) is a combination of four discrete
1117 + * vulnerabilities that are similar issues affecting various parts of the CPU's
1118 + * microarchitectural implementation around load, store, and fill buffers.
1119 + * Specifically it is made up of the following subcomponents:
1120 + *
1121 + * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1122 + * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1123 + * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1124 + * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1125 + *
1126 + * To begin addressing these, Intel has introduced another feature in microcode
1127 + * called MD_CLEAR. This changes the verw instruction to operate in a different
1128 + * way. This allows us to execute the verw instruction in a particular way to
1129 + * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1130 + * updated when this microcode is present to flush this state.
1131 + *
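x86_md_clear() itself is assembly outside of this file; as an illustrative sketch, a verw based flush on MD_CLEAR-capable microcode needs little more than a verw whose memory operand names any valid selector (the symbol name below is an assumption):

	__asm__(
	    ".globl example_md_clear\n"
	    "example_md_clear:\n"
	    "	subq	$8, %rsp\n"
	    "	movw	%ds, (%rsp)\n"		/* any valid selector will do */
	    "	verw	(%rsp)\n"		/* microcode flushes the buffers */
	    "	addq	$8, %rsp\n"
	    "	ret\n");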
1132 + * Primarily we need to flush this state whenever we transition from the kernel
1133 + * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1134 + * little bit different. Here the structures are statically sized when a logical
1135 + * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1136 + * flush the microarchitectural state before the CPU goes idle by calling hlt,
1137 + * mwait, or another ACPI method. To perform these flushes, we call
1138 + * x86_md_clear() at all of these transition points.
1139 + *
1140 + * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1141 + * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1142 + * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1143 + * a no-op.
1144 + *
1145 + * Unfortunately, with this issue hyperthreading rears its ugly head. In
1146 + * particular, everything we've discussed above is only valid for a single
1147 + * thread executing on a core. In the case where you have hyper-threading
1148 + * present, this attack can be performed between threads. The theoretical fix
1149 + * for this is to ensure that both threads are always in the same security
1150 + * domain. This means that they are executing in the same ring and mutually
1151 + * trust each other. Practically speaking, this would mean that a system call
1152 + * would have to issue an inter-processor interrupt (IPI) to the other thread.
1153 + * Rather than implement this, we recommend that one disables hyper-threading
1154 + * through the use of psradm -aS.
1155 + *
1156 + * SUMMARY
1157 + *
1158 + * The following table attempts to summarize the mitigations for various issues
1159 + * and what's done in various places:
1160 + *
1161 + * - Spectre v1: Not currently mitigated
1162 + * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1163 + * - Meltdown: Kernel Page Table Isolation
1164 + * - Spectre v3a: Updated CPU microcode
1165 + * - Spectre v4: Not currently mitigated
1166 + * - SpectreRSB: SMEP and RSB Stuffing
1167 + * - L1TF: spec_uarch_flush, smt exclusion, requires microcode
1168 + * - MDS: x86_md_clear, requires microcode, disabling hyper threading
1169 + *
1170 + * The following table indicates the x86 feature set bits that indicate that a
1171 + * given problem has been solved or a notable feature is present:
1172 + *
1173 + * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1174 + * - MDS_NO: All forms of MDS
900 1175 */
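As an illustration of how the RDCL_NO/MDS_NO style bits above reach software: they live in the IA32_ARCH_CAPABILITIES MSR (0x10a), whose presence is enumerated by CPUID.(EAX=7,ECX=0):EDX bit 29. The helper below is hypothetical and its names are not from this patch; only the MSR number and bit positions are architectural.

	#define	EX_MSR_ARCH_CAPS	0x10a
	#define	EX_ARCH_CAPS_RDCL_NO	(1ULL << 0)	/* Meltdown/L1TF fixed */
	#define	EX_ARCH_CAPS_IBRS_ALL	(1ULL << 1)	/* Enhanced IBRS */
	#define	EX_ARCH_CAPS_MDS_NO	(1ULL << 5)	/* MDS fixed */

	static void
	example_read_arch_caps(struct cpuid_info *cpi, uchar_t *featureset)
	{
		uint64_t caps;

		if (cpi->cpi_maxeax < 7 ||
		    (CPI_FEATURES_7_0_EDX(cpi) & (1U << 29)) == 0)
			return;

		caps = rdmsr(EX_MSR_ARCH_CAPS);
		if (caps & EX_ARCH_CAPS_RDCL_NO)
			add_x86_feature(featureset, X86FSET_RDCL_NO);
		if (caps & EX_ARCH_CAPS_IBRS_ALL)
			add_x86_feature(featureset, X86FSET_IBRS_ALL);
		if (caps & EX_ARCH_CAPS_MDS_NO)
			add_x86_feature(featureset, X86FSET_MDS_NO);
	}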
901 1176
902 1177 #include <sys/types.h>
903 1178 #include <sys/archsystm.h>
904 1179 #include <sys/x86_archext.h>
905 1180 #include <sys/kmem.h>
906 1181 #include <sys/systm.h>
907 1182 #include <sys/cmn_err.h>
908 1183 #include <sys/sunddi.h>
909 1184 #include <sys/sunndi.h>
910 1185 #include <sys/cpuvar.h>
911 1186 #include <sys/processor.h>
912 1187 #include <sys/sysmacros.h>
913 1188 #include <sys/pg.h>
914 1189 #include <sys/fp.h>
915 1190 #include <sys/controlregs.h>
916 1191 #include <sys/bitmap.h>
917 1192 #include <sys/auxv_386.h>
918 1193 #include <sys/memnode.h>
919 1194 #include <sys/pci_cfgspace.h>
920 1195 #include <sys/comm_page.h>
921 1196 #include <sys/mach_mmu.h>
922 1197 #include <sys/ucode.h>
923 1198 #include <sys/tsc.h>
1199 +#include <sys/kobj.h>
1200 +#include <sys/asm_misc.h>
924 1201
925 1202 #ifdef __xpv
926 1203 #include <sys/hypervisor.h>
927 1204 #else
928 1205 #include <sys/ontrap.h>
929 1206 #endif
930 1207
931 1208 uint_t x86_vendor = X86_VENDOR_IntelClone;
932 1209 uint_t x86_type = X86_TYPE_OTHER;
933 1210 uint_t x86_clflush_size = 0;
934 1211
935 1212 #if defined(__xpv)
936 1213 int x86_use_pcid = 0;
937 1214 int x86_use_invpcid = 0;
938 1215 #else
939 1216 int x86_use_pcid = -1;
940 1217 int x86_use_invpcid = -1;
941 1218 #endif
942 1219
1220 +typedef enum {
1221 + X86_SPECTREV2_RETPOLINE,
1222 + X86_SPECTREV2_RETPOLINE_AMD,
1223 + X86_SPECTREV2_ENHANCED_IBRS,
1224 + X86_SPECTREV2_DISABLED
1225 +} x86_spectrev2_mitigation_t;
1226 +
1227 +uint_t x86_disable_spectrev2 = 0;
1228 +static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1229 + X86_SPECTREV2_RETPOLINE;
1230 +
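The code that consumes this value is not in this excerpt; purely as an illustrative sketch (the helper name and the exact ordering of checks are assumptions, not the reviewed logic), the selection has roughly this shape:

	static x86_spectrev2_mitigation_t
	example_select_spectrev2(uint_t vendor, uchar_t *featureset)
	{
		if (x86_disable_spectrev2 != 0)
			return (X86_SPECTREV2_DISABLED);

		/* Enhanced IBRS: retpoline thunks become plain jmps. */
		if (is_x86_feature(featureset, X86FSET_IBRS_ALL))
			return (X86_SPECTREV2_ENHANCED_IBRS);

		/* AMD can use the cheaper lfence-based thunk. */
		if (vendor == X86_VENDOR_AMD)
			return (X86_SPECTREV2_RETPOLINE_AMD);

		return (X86_SPECTREV2_RETPOLINE);
	}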
943 1231 uint_t pentiumpro_bug4046376;
944 1232
945 1233 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
946 1234
947 1235 static char *x86_feature_names[NUM_X86_FEATURES] = {
948 1236 "lgpg",
949 1237 "tsc",
950 1238 "msr",
951 1239 "mtrr",
952 1240 "pge",
953 1241 "de",
954 1242 "cmov",
955 1243 "mmx",
956 1244 "mca",
957 1245 "pae",
958 1246 "cv8",
959 1247 "pat",
960 1248 "sep",
961 1249 "sse",
962 1250 "sse2",
963 1251 "htt",
964 1252 "asysc",
965 1253 "nx",
966 1254 "sse3",
967 1255 "cx16",
968 1256 "cmp",
969 1257 "tscp",
970 1258 "mwait",
971 1259 "sse4a",
972 1260 "cpuid",
973 1261 "ssse3",
974 1262 "sse4_1",
975 1263 "sse4_2",
976 1264 "1gpg",
977 1265 "clfsh",
978 1266 "64",
979 1267 "aes",
980 1268 "pclmulqdq",
981 1269 "xsave",
982 1270 "avx",
983 1271 "vmx",
984 1272 "svm",
985 1273 "topoext",
986 1274 "f16c",
987 1275 "rdrand",
988 1276 "x2apic",
989 1277 "avx2",
990 1278 "bmi1",
991 1279 "bmi2",
992 1280 "fma",
993 1281 "smep",
994 1282 "smap",
995 1283 "adx",
996 1284 "rdseed",
997 1285 "mpx",
998 1286 "avx512f",
999 1287 "avx512dq",
1000 1288 "avx512pf",
1001 1289 "avx512er",
1002 1290 "avx512cd",
1003 1291 "avx512bw",
1004 1292 "avx512vl",
1005 1293 "avx512fma",
1006 1294 "avx512vbmi",
1007 1295 "avx512_vpopcntdq",
1008 1296 "avx512_4vnniw",
1009 1297 "avx512_4fmaps",
1010 1298 "xsaveopt",
1011 1299 "xsavec",
1012 1300 "xsaves",
1013 1301 "sha",
1014 1302 "umip",
1015 1303 "pku",
1016 1304 "ospke",
1017 1305 "pcid",
1018 1306 "invpcid",
1019 1307 "ibrs",
1020 1308 "ibpb",
1021 1309 "stibp",
1022 1310 "ssbd",
1023 1311 "ssbd_virt",
1024 1312 "rdcl_no",
1025 1313 "ibrs_all",
1026 1314 "rsba",
1027 1315 "ssb_no",
1028 1316 "stibp_all",
1029 1317 "flush_cmd",
1030 1318 "l1d_vmentry_no",
1031 1319 "fsgsbase",
1032 1320 "clflushopt",
1033 1321 "clwb",
1034 1322 "monitorx",
1035 1323 "clzero",
1036 1324 "xop",
1037 1325 "fma4",
1038 1326 "tbm",
1039 1327 "avx512_vnni",
1040 1328 "amd_pcec",
1041 1329 "md_clear",
1042 1330 "mds_no",
1043 1331 "core_thermal",
1044 1332 "pkg_thermal"
1045 1333 };
1046 1334
1047 1335 boolean_t
1048 1336 is_x86_feature(void *featureset, uint_t feature)
1049 1337 {
1050 1338 ASSERT(feature < NUM_X86_FEATURES);
1051 1339 return (BT_TEST((ulong_t *)featureset, feature));
1052 1340 }
1053 1341
1054 1342 void
1055 1343 add_x86_feature(void *featureset, uint_t feature)
1056 1344 {
1057 1345 ASSERT(feature < NUM_X86_FEATURES);
1058 1346 BT_SET((ulong_t *)featureset, feature);
1059 1347 }
1060 1348
1061 1349 void
1062 1350 remove_x86_feature(void *featureset, uint_t feature)
1063 1351 {
1064 1352 ASSERT(feature < NUM_X86_FEATURES);
1065 1353 BT_CLEAR((ulong_t *)featureset, feature);
1066 1354 }
1067 1355
1068 1356 boolean_t
1069 1357 compare_x86_featureset(void *setA, void *setB)
1070 1358 {
1071 1359 /*
1072 1360 * We assume that the unused bits of the bitmap are always zero.
1073 1361 */
1074 1362 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1075 1363 return (B_TRUE);
1076 1364 } else {
1077 1365 return (B_FALSE);
1078 1366 }
1079 1367 }
1080 1368
1081 1369 void
1082 1370 print_x86_featureset(void *featureset)
1083 1371 {
1084 1372 uint_t i;
1085 1373
1086 1374 for (i = 0; i < NUM_X86_FEATURES; i++) {
1087 1375 if (is_x86_feature(featureset, i)) {
1088 1376 cmn_err(CE_CONT, "?x86_feature: %s\n",
1089 1377 x86_feature_names[i]);
1090 1378 }
1091 1379 }
1092 1380 }
1093 1381
1094 1382 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1095 1383 static size_t xsave_state_size = 0;
1096 1384 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1097 1385 boolean_t xsave_force_disable = B_FALSE;
1098 1386 extern int disable_smap;
1099 1387
1100 1388 /*
1101 1389 * This is set to the platform type we are running on.
1102 1390 */
1103 1391 static int platform_type = -1;
1104 1392
1105 1393 #if !defined(__xpv)
1106 1394 /*
1107 1395 * Variable to patch if hypervisor platform detection needs to be
1108 1396 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1109 1397 */
1110 1398 int enable_platform_detection = 1;
1111 1399 #endif
1112 1400
1113 1401 /*
1114 1402 * monitor/mwait info.
1115 1403 *
1116 1404 * size_actual and buf_actual are the real address and size allocated to get
1117 1405 * proper mwait_buf alignment. buf_actual and size_actual should be passed
1118 1406 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use
1119 1407 * processor cache-line alignment, but this is not guaranteed in the future.
1120 1408 */
1121 1409 struct mwait_info {
1122 1410 size_t mon_min; /* min size to avoid missed wakeups */
1123 1411 size_t mon_max; /* size to avoid false wakeups */
1124 1412 size_t size_actual; /* size actually allocated */
1125 1413 void *buf_actual; /* memory actually allocated */
1126 1414 uint32_t support; /* processor support of monitor/mwait */
1127 1415 };
1128 1416
1129 1417 /*
1130 1418 * xsave/xrestor info.
1131 1419 *
1132 1420 * This structure contains HW feature bits and the size of the xsave save area.
1133 1421 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1134 1422 * (xsave_state) to describe the xsave layout. However, at runtime the
1135 1423 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1136 1424 * xsave_state structure simply represents the legacy layout of the beginning
1137 1425 * of the xsave area.
1138 1426 */
1139 1427 struct xsave_info {
1140 1428 uint32_t xsav_hw_features_low; /* Supported HW features */
1141 1429 uint32_t xsav_hw_features_high; /* Supported HW features */
1142 1430 size_t xsav_max_size; /* max size save area for HW features */
1143 1431 size_t ymm_size; /* AVX: size of ymm save area */
1144 1432 size_t ymm_offset; /* AVX: offset for ymm save area */
1145 1433 size_t bndregs_size; /* MPX: size of bndregs save area */
1146 1434 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1147 1435 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1148 1436 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1149 1437 size_t opmask_size; /* AVX512: size of opmask save */
1150 1438 size_t opmask_offset; /* AVX512: offset for opmask save */
1151 1439 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1152 1440 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1153 1441 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1154 1442 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1155 1443 };
1156 1444
1157 1445
1158 1446 /*
1159 1447 * These constants determine how many of the elements of the
1160 1448 * cpuid we cache in the cpuid_info data structure; the
1161 1449 * remaining elements are accessible via the cpuid instruction.
1162 1450 */
1163 1451
1164 1452 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1165 1453 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */
1166 1454
1167 1455 /*
1168 1456 * See the big theory statement for a more detailed explanation of what some of
1169 1457 * these members mean.
1170 1458 */
1171 1459 struct cpuid_info {
1172 1460 uint_t cpi_pass; /* last pass completed */
1173 1461 /*
1174 1462 * standard function information
1175 1463 */
1176 1464 uint_t cpi_maxeax; /* fn 0: %eax */
1177 1465 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1178 1466 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1179 1467
1180 1468 uint_t cpi_family; /* fn 1: extended family */
1181 1469 uint_t cpi_model; /* fn 1: extended model */
1182 1470 uint_t cpi_step; /* fn 1: stepping */
1183 1471 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1184 1472 /* AMD: package/socket # */
1185 1473 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1186 1474 int cpi_clogid; /* fn 1: %ebx: thread # */
1187 1475 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1188 1476 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1189 1477 uint_t cpi_ncache; /* fn 2: number of elements */
1190 1478 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1191 1479 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1192 1480 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1193 1481 /* Intel fn: 4, AMD fn: 8000001d */
1194 1482 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
1195 1483 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1196 1484 /*
1197 1485 * extended function information
1198 1486 */
1199 1487 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1200 1488 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1201 1489 uint8_t cpi_pabits; /* fn 0x80000008: %eax */
1202 1490 uint8_t cpi_vabits; /* fn 0x80000008: %eax */
1203 1491 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1204 1492 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1205 1493
1206 1494 id_t cpi_coreid; /* same coreid => strands share core */
1207 1495 int cpi_pkgcoreid; /* core number within single package */
1208 1496 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1209 1497 /* Intel: fn 4: %eax[31-26] */
1210 1498
1211 1499 /*
1212 1500 * These values represent the number of bits that are required to store
1213 1501 * information about the number of cores and threads.
1214 1502 */
1215 1503 uint_t cpi_ncore_bits;
1216 1504 uint_t cpi_nthread_bits;
1217 1505 /*
1218 1506 * supported feature information
1219 1507 */
1220 1508 uint32_t cpi_support[6];
1221 1509 #define STD_EDX_FEATURES 0
1222 1510 #define AMD_EDX_FEATURES 1
1223 1511 #define TM_EDX_FEATURES 2
1224 1512 #define STD_ECX_FEATURES 3
1225 1513 #define AMD_ECX_FEATURES 4
1226 1514 #define STD_EBX_FEATURES 5
1227 1515 /*
1228 1516 * Synthesized information, where known.
1229 1517 */
1230 1518 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1231 1519 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1232 1520 uint32_t cpi_socket; /* Chip package/socket type */
1233 1521
1234 1522 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1235 1523 uint32_t cpi_apicid;
1236 1524 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1237 1525 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1238 1526 /* Intel: 1 */
1239 1527 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1240 1528 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1241 1529
1242 1530 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */
1243 1531 };
1244 1532
1245 1533
1246 1534 static struct cpuid_info cpuid_info0;
1247 1535
1248 1536 /*
1249 1537 * These bit fields are defined by the Intel Application Note AP-485
1250 1538 * "Intel Processor Identification and the CPUID Instruction"
1251 1539 */
1252 1540 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1253 1541 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1254 1542 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1255 1543 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1256 1544 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1257 1545 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1258 1546
1259 1547 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1260 1548 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1261 1549 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1262 1550 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1263 1551 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1264 1552 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1265 1553 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1266 1554
1267 1555 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1268 1556 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1269 1557 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1270 1558 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1271 1559
1272 1560 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1273 1561 #define CPI_XMAXEAX_MAX 0x80000100
1274 1562 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1275 1563 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1276 1564
1277 1565 /*
1278 1566 * Function 4 (Deterministic Cache Parameters) macros
1279 1567 * Defined by Intel Application Note AP-485
1280 1568 */
1281 1569 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1282 1570 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1283 1571 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1284 1572 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1285 1573 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1286 1574 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1287 1575 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1288 1576
1289 1577 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1290 1578 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1291 1579 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1292 1580
1293 1581 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1294 1582
1295 1583 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
1296 1584
1297 1585
1298 1586 /*
1299 1587 * A couple of shorthand macros to identify "later" P6-family chips
1300 1588 * like the Pentium M and Core. First, the "older" P6-based stuff
1301 1589 * (loosely defined as "pre-Pentium-4"):
1302 1590 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1303 1591 */
1304 1592 #define IS_LEGACY_P6(cpi) ( \
1305 1593 cpi->cpi_family == 6 && \
1306 1594 (cpi->cpi_model == 1 || \
1307 1595 cpi->cpi_model == 3 || \
1308 1596 cpi->cpi_model == 5 || \
1309 1597 cpi->cpi_model == 6 || \
1310 1598 cpi->cpi_model == 7 || \
1311 1599 cpi->cpi_model == 8 || \
1312 1600 cpi->cpi_model == 0xA || \
1313 1601 cpi->cpi_model == 0xB) \
1314 1602 )
1315 1603
1316 1604 /* A "new F6" is everything with family 6 that's not the above */
1317 1605 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1318 1606
1319 1607 /* Extended family/model support */
1320 1608 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1321 1609 cpi->cpi_family >= 0xf)
1322 1610
1323 1611 /*
1324 1612 * Info for monitor/mwait idle loop.
1325 1613 *
1326 1614 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1327 1615 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1328 1616 * 2006.
1329 1617 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1330 1618 * Documentation Updates" #33633, Rev 2.05, December 2006.
1331 1619 */
1332 1620 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
1333 1621 #define MWAIT_EXTENSIONS (0x00000002) /* extension supported */
1334 1622 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
1335 1623 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1336 1624 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
1337 1625 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
1338 1626 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1339 1627 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1340 1628 /*
1341 1629 * Number of sub-cstates for a given c-state.
1342 1630 */
1343 1631 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
1344 1632 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1345 1633
1346 1634 /*
1347 1635 * XSAVE leaf 0xD enumeration
1348 1636 */
1349 1637 #define CPUID_LEAFD_2_YMM_OFFSET 576
1350 1638 #define CPUID_LEAFD_2_YMM_SIZE 256
1351 1639
1352 1640 /*
1353 1641 * Common extended leaf names to cut down on typos.
1354 1642 */
1355 1643 #define CPUID_LEAF_EXT_0 0x80000000
1356 1644 #define CPUID_LEAF_EXT_8 0x80000008
1357 1645 #define CPUID_LEAF_EXT_1d 0x8000001d
1358 1646 #define CPUID_LEAF_EXT_1e 0x8000001e
1359 1647
1360 1648 /*
1361 1649 * Functions we consume from cpuid_subr.c; don't publish these in a header
1362 1650 * file to try and keep people using the expected cpuid_* interfaces.
1363 1651 */
1364 1652 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1365 1653 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1366 1654 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1367 1655 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1368 1656 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1369 1657
1370 1658 /*
1371 1659 * Apply various platform-dependent restrictions where the
1372 1660 * underlying platform restrictions mean the CPU can be marked
1373 1661 * as less capable than its cpuid instruction would imply.
1374 1662 */
1375 1663 #if defined(__xpv)
1376 1664 static void
1377 1665 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1378 1666 {
1379 1667 switch (eax) {
1380 1668 case 1: {
1381 1669 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1382 1670 0 : CPUID_INTC_EDX_MCA;
1383 1671 cp->cp_edx &=
1384 1672 ~(mcamask |
1385 1673 CPUID_INTC_EDX_PSE |
1386 1674 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1387 1675 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1388 1676 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1389 1677 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1390 1678 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1391 1679 break;
1392 1680 }
1393 1681
1394 1682 case 0x80000001:
1395 1683 cp->cp_edx &=
1396 1684 ~(CPUID_AMD_EDX_PSE |
1397 1685 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1398 1686 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1399 1687 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1400 1688 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1401 1689 CPUID_AMD_EDX_TSCP);
1402 1690 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1403 1691 break;
1404 1692 default:
1405 1693 break;
1406 1694 }
1407 1695
1408 1696 switch (vendor) {
1409 1697 case X86_VENDOR_Intel:
1410 1698 switch (eax) {
1411 1699 case 4:
1412 1700 /*
1413 1701 * Zero out the (ncores-per-chip - 1) field
1414 1702 */
1415 1703 cp->cp_eax &= 0x03fffffff;
1416 1704 break;
1417 1705 default:
1418 1706 break;
1419 1707 }
1420 1708 break;
1421 1709 case X86_VENDOR_AMD:
1422 1710 switch (eax) {
1423 1711
1424 1712 case 0x80000001:
1425 1713 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1426 1714 break;
1427 1715
1428 1716 case CPUID_LEAF_EXT_8:
1429 1717 /*
1430 1718 * Zero out the (ncores-per-chip - 1) field
1431 1719 */
1432 1720 cp->cp_ecx &= 0xffffff00;
1433 1721 break;
1434 1722 default:
1435 1723 break;
1436 1724 }
1437 1725 break;
1438 1726 default:
1439 1727 break;
1440 1728 }
1441 1729 }
1442 1730 #else
1443 1731 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
1444 1732 #endif
1445 1733
1446 1734 /*
1447 1735 * Some undocumented ways of patching the results of the cpuid
1448 1736 * instruction to permit running Solaris 10 on future cpus that
1449 1737 * we don't currently support. Could be set to non-zero values
1450 1738 * via settings in eeprom.
1451 1739 */
1452 1740
1453 1741 uint32_t cpuid_feature_ecx_include;
1454 1742 uint32_t cpuid_feature_ecx_exclude;
1455 1743 uint32_t cpuid_feature_edx_include;
1456 1744 uint32_t cpuid_feature_edx_exclude;
1457 1745
1458 1746 /*
1459 1747 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1460 1748 */
1461 1749 void
1462 1750 cpuid_alloc_space(cpu_t *cpu)
1463 1751 {
1464 1752 /*
1465 1753 * By convention, cpu0 is the boot cpu, which is set up
1466 1754 * before memory allocation is available. All other cpus get
1467 1755 * their cpuid_info struct allocated here.
1468 1756 */
1469 1757 ASSERT(cpu->cpu_id != 0);
1470 1758 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1471 1759 cpu->cpu_m.mcpu_cpi =
1472 1760 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1473 1761 }
1474 1762
1475 1763 void
1476 1764 cpuid_free_space(cpu_t *cpu)
1477 1765 {
1478 1766 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1479 1767 int i;
1480 1768
1481 1769 ASSERT(cpi != NULL);
1482 1770 ASSERT(cpi != &cpuid_info0);
1483 1771
1484 1772 /*
1485 1773 * Free up any cache leaf related dynamic storage. The first entry was
1486 1774 * cached from the standard cpuid storage, so we should not free it.
1487 1775 */
1488 1776 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1489 1777 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1490 1778 if (cpi->cpi_cache_leaf_size > 0)
1491 1779 kmem_free(cpi->cpi_cache_leaves,
1492 1780 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1493 1781
1494 1782 kmem_free(cpi, sizeof (*cpi));
1495 1783 cpu->cpu_m.mcpu_cpi = NULL;
1496 1784 }
1497 1785
1498 1786 #if !defined(__xpv)
1499 1787 /*
1500 1788 * Determine the type of the underlying platform. This is used to customize
1501 1789 * initialization of various subsystems (e.g. TSC). determine_platform() must
1502 1790 * only ever be called once to prevent two processors from seeing different
1503 1791 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1504 1792 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1505 1793 */
1506 1794 void
1507 1795 determine_platform(void)
1508 1796 {
1509 1797 struct cpuid_regs cp;
1510 1798 uint32_t base;
1511 1799 uint32_t regs[4];
1512 1800 char *hvstr = (char *)regs;
1513 1801
1514 1802 ASSERT(platform_type == -1);
1515 1803
1516 1804 platform_type = HW_NATIVE;
1517 1805
1518 1806 if (!enable_platform_detection)
1519 1807 return;
1520 1808
1521 1809 /*
1522 1810 * If Hypervisor CPUID bit is set, try to determine hypervisor
1523 1811 * vendor signature, and set platform type accordingly.
1524 1812 *
1525 1813 * References:
1526 1814 * http://lkml.org/lkml/2008/10/1/246
1527 1815 * http://kb.vmware.com/kb/1009458
1528 1816 */
1529 1817 cp.cp_eax = 0x1;
1530 1818 (void) __cpuid_insn(&cp);
1531 1819 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1532 1820 cp.cp_eax = 0x40000000;
1533 1821 (void) __cpuid_insn(&cp);
1534 1822 regs[0] = cp.cp_ebx;
1535 1823 regs[1] = cp.cp_ecx;
1536 1824 regs[2] = cp.cp_edx;
1537 1825 regs[3] = 0;
1538 1826 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1539 1827 platform_type = HW_XEN_HVM;
1540 1828 return;
1541 1829 }
1542 1830 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1543 1831 platform_type = HW_VMWARE;
1544 1832 return;
1545 1833 }
1546 1834 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1547 1835 platform_type = HW_KVM;
1548 1836 return;
1549 1837 }
1550 1838 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1551 1839 platform_type = HW_BHYVE;
1552 1840 return;
1553 1841 }
1554 1842 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1555 1843 platform_type = HW_MICROSOFT;
1556 1844 } else {
1557 1845 /*
1558 1846 * Check older VMware hardware versions. VMware hypervisor is
1559 1847 * detected by performing an IN operation to VMware hypervisor
1560 1848 * port and checking that the value returned in %ebx is the VMware
1561 1849 * hypervisor magic value.
1562 1850 *
1563 1851 * References: http://kb.vmware.com/kb/1009458
1564 1852 */
1565 1853 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1566 1854 if (regs[1] == VMWARE_HVMAGIC) {
1567 1855 platform_type = HW_VMWARE;
1568 1856 return;
1569 1857 }
1570 1858 }
1571 1859
1572 1860 /*
1573 1861 * Check Xen hypervisor. In a fully virtualized domain,
1574 1862 * Xen's pseudo-cpuid function returns a string representing the
1575 1863 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1576 1864 * supported cpuid function. We need at least a (base + 2) leaf value
1577 1865 * to do what we want to do. Try different base values, since the
1578 1866 * hypervisor might use a different one depending on whether Hyper-V
1579 1867 * emulation is switched on by default or not.
1580 1868 */
1581 1869 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1582 1870 cp.cp_eax = base;
1583 1871 (void) __cpuid_insn(&cp);
1584 1872 regs[0] = cp.cp_ebx;
1585 1873 regs[1] = cp.cp_ecx;
1586 1874 regs[2] = cp.cp_edx;
1587 1875 regs[3] = 0;
1588 1876 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1589 1877 cp.cp_eax >= (base + 2)) {
1590 1878 platform_type &= ~HW_NATIVE;
1591 1879 platform_type |= HW_XEN_HVM;
1592 1880 return;
1593 1881 }
1594 1882 }
1595 1883 }
1596 1884
1597 1885 int
1598 1886 get_hwenv(void)
1599 1887 {
1600 1888 ASSERT(platform_type != -1);
1601 1889 return (platform_type);
1602 1890 }
1603 1891
1604 1892 int
1605 1893 is_controldom(void)
1606 1894 {
1607 1895 return (0);
1608 1896 }
1609 1897
1610 1898 #else
1611 1899
1612 1900 int
1613 1901 get_hwenv(void)
1614 1902 {
1615 1903 return (HW_XEN_PV);
1616 1904 }
1617 1905
1618 1906 int
1619 1907 is_controldom(void)
1620 1908 {
1621 1909 return (DOMAIN_IS_INITDOMAIN(xen_info));
1622 1910 }
1623 1911
1624 1912 #endif /* __xpv */
1625 1913
1626 1914 /*
1627 1915 * Make sure that we have gathered all of the CPUID leaves that we might need to
1628 1916 * determine topology. We assume that the standard leaf 1 has already been done
1629 1917 * and that xmaxeax has already been calculated.
1630 1918 */
1631 1919 static void
1632 1920 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1633 1921 {
1634 1922 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1635 1923
1636 1924 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1637 1925 struct cpuid_regs *cp;
1638 1926
1639 1927 cp = &cpi->cpi_extd[8];
1640 1928 cp->cp_eax = CPUID_LEAF_EXT_8;
1641 1929 (void) __cpuid_insn(cp);
1642 1930 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1643 1931 }
1644 1932
1645 1933 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1646 1934 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1647 1935 struct cpuid_regs *cp;
1648 1936
1649 1937 cp = &cpi->cpi_extd[0x1e];
1650 1938 cp->cp_eax = CPUID_LEAF_EXT_1e;
1651 1939 (void) __cpuid_insn(cp);
1652 1940 }
1653 1941 }
1654 1942
1655 1943 /*
1656 1944 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1657 1945 * it to everything else. If not, and we're on an AMD system where 8000001e is
1658 1946 * valid, then we use that. Otherwise, we fall back to the default value for the
1659 1947 * APIC ID in leaf 1.
1660 1948 */
1661 1949 static uint32_t
1662 1950 cpuid_gather_apicid(struct cpuid_info *cpi)
1663 1951 {
1664 1952 /*
1665 1953 * Leaf B changes based on the arguments to it. Because we don't cache
1666 1954 * it, we need to gather it again.
1667 1955 */
1668 1956 if (cpi->cpi_maxeax >= 0xB) {
1669 1957 struct cpuid_regs regs;
1670 1958 struct cpuid_regs *cp;
1671 1959
1672 1960 cp = &regs;
1673 1961 cp->cp_eax = 0xB;
1674 1962 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1675 1963 (void) __cpuid_insn(cp);
1676 1964
1677 1965 if (cp->cp_ebx != 0) {
1678 1966 return (cp->cp_edx);
1679 1967 }
1680 1968 }
1681 1969
1682 1970 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
1683 1971 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1684 1972 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1685 1973 return (cpi->cpi_extd[0x1e].cp_eax);
1686 1974 }
1687 1975
1688 1976 return (CPI_APIC_ID(cpi));
1689 1977 }
1690 1978
1691 1979 /*
1692 1980 * For AMD processors, attempt to calculate the number of chips and cores that
1693 1981 * exist. The way that we do this varies based on the generation, because the
1694 1982 * generations themselves have changed dramatically.
1695 1983 *
1696 1984 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
1697 1985 * However, with the advent of family 17h (Zen) it actually tells us the number
1698 1986 * of threads, so we need to look at leaf 0x8000001e if available to determine
1699 1987 * its value. Otherwise, for all prior families, the number of enabled cores is
1700 1988 * the same as threads.
1701 1989 *
1702 1990 * If we do not have leaf 0x80000008, then we assume that this processor does
1703 1991 * not have anything. AMD's older CPUID specification says there's no reason to
1704 1992 * fall back to leaf 1.
1705 1993 *
1706 1994 * In some virtualization cases we will not have leaf 8000001e or it will be
1707 1995 * zero. When that happens we assume the number of threads is one.
1708 1996 */
1709 1997 static void
1710 1998 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
1711 1999 {
1712 2000 uint_t nthreads, nthread_per_core;
1713 2001
1714 2002 nthreads = nthread_per_core = 1;
1715 2003
1716 2004 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1717 2005 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
1718 2006 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
1719 2007 nthreads = CPI_CPU_COUNT(cpi);
1720 2008 }
1721 2009
1722 2010 /*
1723 2011 * For us to have threads, and know about it, we have to be at least at
1724 2012 * family 17h and have the cpuid bit that says we have extended
1725 2013 * topology.
1726 2014 */
1727 2015 if (cpi->cpi_family >= 0x17 &&
1728 2016 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1729 2017 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1730 2018 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
1731 2019 }
1732 2020
1733 2021 *ncpus = nthreads;
1734 2022 *ncores = nthreads / nthread_per_core;
1735 2023 }
1736 2024
1737 2025 /*
1738 2026 * Seed the initial values for the cores and threads for an Intel based
1739 2027 * processor. These values will be overwritten if we detect that the processor
1740 2028 * supports CPUID leaf 0xb.
1741 2029 */
1742 2030 static void
1743 2031 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
1744 2032 {
1745 2033 /*
1746 2034 * Only seed the number of physical cores from the first level leaf 4
1747 2035 * information. The number of threads there indicates how many share the
1748 2036 * L1 cache, which may or may not have anything to do with the number of
1749 2037 * logical CPUs per core.
1750 2038 */
1751 2039 if (cpi->cpi_maxeax >= 4) {
1752 2040 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
1753 2041 } else {
1754 2042 *ncores = 1;
1755 2043 }
1756 2044
1757 2045 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
1758 2046 *ncpus = CPI_CPU_COUNT(cpi);
1759 2047 } else {
1760 2048 *ncpus = *ncores;
1761 2049 }
1762 2050 }
1763 2051
1764 2052 static boolean_t
1765 2053 cpuid_leafB_getids(cpu_t *cpu)
1766 2054 {
1767 2055 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1768 2056 struct cpuid_regs regs;
1769 2057 struct cpuid_regs *cp;
1770 2058
1771 2059 if (cpi->cpi_maxeax < 0xB)
1772 2060 return (B_FALSE);
1773 2061
1774 2062 cp = &regs;
1775 2063 cp->cp_eax = 0xB;
1776 2064 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1777 2065
1778 2066 (void) __cpuid_insn(cp);
1779 2067
1780 2068 /*
1781 2069 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
1782 2070 * indicates that the extended topology enumeration leaf is
1783 2071 * available.
1784 2072 */
1785 2073 if (cp->cp_ebx != 0) {
1786 2074 uint32_t x2apic_id = 0;
1787 2075 uint_t coreid_shift = 0;
1788 2076 uint_t ncpu_per_core = 1;
1789 2077 uint_t chipid_shift = 0;
1790 2078 uint_t ncpu_per_chip = 1;
1791 2079 uint_t i;
1792 2080 uint_t level;
1793 2081
1794 2082 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
1795 2083 cp->cp_eax = 0xB;
1796 2084 cp->cp_ecx = i;
1797 2085
1798 2086 (void) __cpuid_insn(cp);
1799 2087 level = CPI_CPU_LEVEL_TYPE(cp);
1800 2088
1801 2089 if (level == 1) {
1802 2090 x2apic_id = cp->cp_edx;
1803 2091 coreid_shift = BITX(cp->cp_eax, 4, 0);
1804 2092 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
1805 2093 } else if (level == 2) {
1806 2094 x2apic_id = cp->cp_edx;
1807 2095 chipid_shift = BITX(cp->cp_eax, 4, 0);
1808 2096 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
1809 2097 }
1810 2098 }
1811 2099
1812 2100 /*
1813 2101 * cpi_apicid is taken care of in cpuid_gather_apicid.
1814 2102 */
1815 2103 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
1816 2104 cpi->cpi_ncore_per_chip = ncpu_per_chip /
1817 2105 ncpu_per_core;
1818 2106 cpi->cpi_chipid = x2apic_id >> chipid_shift;
1819 2107 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
1820 2108 cpi->cpi_coreid = x2apic_id >> coreid_shift;
1821 2109 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
1822 2110 cpi->cpi_procnodeid = cpi->cpi_chipid;
1823 2111 cpi->cpi_compunitid = cpi->cpi_coreid;
1824 2112
1825 2113 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
1826 2114 cpi->cpi_nthread_bits = coreid_shift;
1827 2115 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
1828 2116 }
1829 2117
1830 2118 return (B_TRUE);
1831 2119 } else {
1832 2120 return (B_FALSE);
1833 2121 }
1834 2122 }
1835 2123
1836 2124 static void
1837 2125 cpuid_intel_getids(cpu_t *cpu, void *feature)
1838 2126 {
1839 2127 uint_t i;
1840 2128 uint_t chipid_shift = 0;
1841 2129 uint_t coreid_shift = 0;
1842 2130 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1843 2131
1844 2132 /*
1845 2133 * There are no compute units or processor nodes currently on Intel.
1846 2134 * Always set these to one.
1847 2135 */
1848 2136 cpi->cpi_procnodes_per_pkg = 1;
1849 2137 cpi->cpi_cores_per_compunit = 1;
1850 2138
1851 2139 /*
1852 2140 * If cpuid Leaf B is present, use that to try and get this information.
1853 2141 * It will be the most accurate for Intel CPUs.
1854 2142 */
1855 2143 if (cpuid_leafB_getids(cpu))
1856 2144 return;
1857 2145
1858 2146 /*
1859 2147 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
1860 2148 * and ncore_per_chip. These represent the largest power of two values
1861 2149 * that we need to cover all of the IDs in the system. Therefore, we use
1862 2150 * those values to seed the number of bits needed to cover information
1863 2151 * in the case when leaf B is not available. These values will probably
1864 2152 * be larger than required, but that's OK.
1865 2153 */
1866 2154 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
1867 2155 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
1868 2156
1869 2157 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
1870 2158 chipid_shift++;
1871 2159
1872 2160 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
1873 2161 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
1874 2162
1875 2163 if (is_x86_feature(feature, X86FSET_CMP)) {
1876 2164 /*
1877 2165 * Multi-core (and possibly multi-threaded)
1878 2166 * processors.
1879 2167 */
1880 2168 uint_t ncpu_per_core;
1881 2169 if (cpi->cpi_ncore_per_chip == 1)
1882 2170 ncpu_per_core = cpi->cpi_ncpu_per_chip;
1883 2171 else if (cpi->cpi_ncore_per_chip > 1)
1884 2172 ncpu_per_core = cpi->cpi_ncpu_per_chip /
1885 2173 cpi->cpi_ncore_per_chip;
1886 2174 /*
1887 2175 * 8bit APIC IDs on dual core Pentiums
1888 2176 * look like this:
1889 2177 *
1890 2178 * +-----------------------+------+------+
1891 2179 * | Physical Package ID | MC | HT |
1892 2180 * +-----------------------+------+------+
1893 2181 * <------- chipid -------->
1894 2182 * <------- coreid --------------->
1895 2183 * <--- clogid -->
1896 2184 * <------>
1897 2185 * pkgcoreid
1898 2186 *
1899 2187 * Where the number of bits necessary to
1900 2188 * represent MC and HT fields together equals
1901 2189 * the minimum number of bits necessary to
1902 2190 * store the value of cpi->cpi_ncpu_per_chip.
1903 2191 * Of those bits, the MC part uses the number
1904 2192 * of bits necessary to store the value of
1905 2193 * cpi->cpi_ncore_per_chip.
1906 2194 */
1907 2195 for (i = 1; i < ncpu_per_core; i <<= 1)
1908 2196 coreid_shift++;
1909 2197 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
1910 2198 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
1911 2199 } else if (is_x86_feature(feature, X86FSET_HTT)) {
1912 2200 /*
1913 2201 * Single-core multi-threaded processors.
1914 2202 */
1915 2203 cpi->cpi_coreid = cpi->cpi_chipid;
1916 2204 cpi->cpi_pkgcoreid = 0;
1917 2205 } else {
1918 2206 /*
1919 2207 * Single-core single-thread processors.
1920 2208 */
1921 2209 cpi->cpi_coreid = cpu->cpu_id;
1922 2210 cpi->cpi_pkgcoreid = 0;
1923 2211 }
1924 2212 cpi->cpi_procnodeid = cpi->cpi_chipid;
1925 2213 cpi->cpi_compunitid = cpi->cpi_coreid;
1926 2214 }
1927 2215
1928 2216 /*
1929 2217 * Historically, AMD has had CMP chips with only a single thread per core.
1930 2218 * However, starting in family 17h (Zen), this has changed and they now have
1931 2219 * multiple threads. Our internal core id needs to be a unique value.
1932 2220 *
1933 2221 * To determine the core id of an AMD system, if we're from a family before 17h,
1934 2222 * then we just use the cpu id, as that gives us a good value that will be
1935 2223 * unique for each core. If instead, we're on family 17h or later, then we need
1936 2224 * to do something more complicated. CPUID leaf 0x8000001e can tell us
1937 2225 * how many threads are in the system. Based on that, we'll shift the APIC ID.
1938 2226 * We can't use the normal core id in that leaf as it's only unique within the
1939 2227 * socket, which is perfect for cpi_pkgcoreid, but not us.
1940 2228 */
1941 2229 static id_t
1942 2230 cpuid_amd_get_coreid(cpu_t *cpu)
1943 2231 {
1944 2232 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1945 2233
1946 2234 if (cpi->cpi_family >= 0x17 &&
1947 2235 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1948 2236 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1949 2237 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
1950 2238 if (nthreads > 1) {
1951 2239 VERIFY3U(nthreads, ==, 2);
1952 2240 return (cpi->cpi_apicid >> 1);
1953 2241 }
1954 2242 }
1955 2243
1956 2244 return (cpu->cpu_id);
1957 2245 }
1958 2246
1959 2247 /*
1960 2248 * Determining IDs on AMD is a more challenging task. This is notable because of the
1961 2249 * following two facts:
1962 2250 *
1963 2251 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
1964 2252 * also no way to get an actual unique core id from the system. As such, we
1965 2253 * synthesize this case by using cpu->cpu_id. This scheme does not,
1966 2254 * however, guarantee that sibling cores of a chip will have sequential
1967 2255 * coreids starting at a multiple of the number of cores per chip - that is
1968 2256 * usually the case, but if the ACPI MADT table is presented in a different
1969 2257 * order then we need to perform a few more gymnastics for the pkgcoreid.
1970 2258 *
1971 2259 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
1972 2260 * called compute units. These compute units share the L1I cache, L2 cache,
1973 2261 * and the FPU. To deal with this, a new topology leaf was added in
1974 2262 * 0x8000001e. However, parts of this leaf have different meanings
1975 2263 * once we get to family 0x17.
1976 2264 */
1977 2265
1978 2266 static void
1979 2267 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
1980 2268 {
1981 2269 int i, first_half, coreidsz;
1982 2270 uint32_t nb_caps_reg;
1983 2271 uint_t node2_1;
1984 2272 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1985 2273 struct cpuid_regs *cp;
1986 2274
1987 2275 /*
1988 2276 * Calculate the core id (this comes from hardware in family 0x17 if it
1989 2277 * hasn't been stripped by virtualization). We always set the compute
1990 2278 * unit id to the same value. Also, initialize the default number of
1991 2279 * cores per compute unit and nodes per package. This will be
1992 2280 * overwritten when we know information about a particular family.
1993 2281 */
1994 2282 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
1995 2283 cpi->cpi_compunitid = cpi->cpi_coreid;
1996 2284 cpi->cpi_cores_per_compunit = 1;
1997 2285 cpi->cpi_procnodes_per_pkg = 1;
1998 2286
1999 2287 /*
2000 2288 * To construct the logical ID, we need to determine how many APIC IDs
2001 2289 * are dedicated to the cores and threads. This is provided for us in
2002 2290 * 0x80000008. However, if it's not present (say due to virtualization),
2003 2291 * then we assume it's one. This should be present on all 64-bit AMD
2004 2292 * processors. It was added in family 0xf (Hammer).
2005 2293 */
2006 2294 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2007 2295 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2008 2296
2009 2297 /*
2010 2298 * In AMD parlance chip is really a node while illumos
2011 2299 * uses chip as equivalent to socket/package.
2012 2300 */
2013 2301 if (coreidsz == 0) {
2014 2302 /* Use legacy method */
2015 2303 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2016 2304 coreidsz++;
2017 2305 if (coreidsz == 0)
2018 2306 coreidsz = 1;
2019 2307 }
2020 2308 } else {
2021 2309 /* Assume single-core part */
2022 2310 coreidsz = 1;
2023 2311 }
2024 2312 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2025 2313
2026 2314 /*
2027 2315 * The package core ID varies depending on the family. While it may be
2028 2316 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2029 2317 * this value is the core id in the given node. For non-virtualized
2030 2318 * family 17h, we need to take the logical core id and shift off the
2031 2319 * threads like we do when getting the core id. Otherwise, we can use
2032 2320 * the clogid as is. When family 17h is virtualized, the clogid should
2033 2321 * be sufficient as if we don't have valid data in the leaf, then we
2034 2322 * won't think we have SMT, in which case the cpi_clogid should be
2035 2323 * sufficient.
2036 2324 */
2037 2325 if (cpi->cpi_family >= 0x17 &&
2038 2326 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2039 2327 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2040 2328 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2041 2329 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2042 2330 if (nthreads > 1) {
2043 2331 VERIFY3U(nthreads, ==, 2);
2044 2332 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2045 2333 } else {
2046 2334 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2047 2335 }
2048 2336 } else {
2049 2337 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2050 2338 }
2051 2339
2052 2340 /*
2053 2341 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2054 2342 * (bulldozer) or newer, then we can derive all of this from leaf
2055 2343 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2056 2344 */
2057 2345 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2058 2346 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2059 2347 cp = &cpi->cpi_extd[0x1e];
2060 2348
2061 2349 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2062 2350 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2063 2351
2064 2352 /*
2065 2353 * For Bulldozer-era CPUs, recalculate the compute unit
2066 2354 * information.
2067 2355 */
2068 2356 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2069 2357 cpi->cpi_cores_per_compunit =
2070 2358 BITX(cp->cp_ebx, 15, 8) + 1;
2071 2359 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2072 2360 (cpi->cpi_ncore_per_chip /
2073 2361 cpi->cpi_cores_per_compunit) *
2074 2362 (cpi->cpi_procnodeid /
2075 2363 cpi->cpi_procnodes_per_pkg);
2076 2364 }
2077 2365 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2078 2366 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2079 2367 } else if (cpi->cpi_family == 0x10) {
2080 2368 /*
2081 2369 * See if we are a multi-node processor.
2082 2370 * All processors in the system have the same number of nodes
2083 2371 */
2084 2372 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2085 2373 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2086 2374 /* Single-node */
2087 2375 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2088 2376 coreidsz);
2089 2377 } else {
2090 2378
2091 2379 /*
2092 2380 * Multi-node revision D (2 nodes per package
2093 2381 * are supported)
2094 2382 */
2095 2383 cpi->cpi_procnodes_per_pkg = 2;
2096 2384
2097 2385 first_half = (cpi->cpi_pkgcoreid <=
2098 2386 (cpi->cpi_ncore_per_chip/2 - 1));
2099 2387
2100 2388 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2101 2389 /* We are BSP */
2102 2390 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2103 2391 } else {
2104 2392
2105 2393 /* We are AP */
2106 2394 /* NodeId[2:1] bits to use for reading F3xe8 */
2107 2395 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2108 2396
2109 2397 nb_caps_reg =
2110 2398 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2111 2399
2112 2400 /*
2113 2401 * Check IntNodeNum bit (31:30, but bit 31 is
2114 2402 * always 0 on dual-node processors)
2115 2403 */
2116 2404 if (BITX(nb_caps_reg, 30, 30) == 0)
2117 2405 cpi->cpi_procnodeid = node2_1 +
2118 2406 !first_half;
2119 2407 else
2120 2408 cpi->cpi_procnodeid = node2_1 +
2121 2409 first_half;
2122 2410 }
2123 2411 }
2124 2412 } else {
2125 2413 cpi->cpi_procnodeid = 0;
2126 2414 }
2127 2415
2128 2416 cpi->cpi_chipid =
2129 2417 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2130 2418
2131 2419 cpi->cpi_ncore_bits = coreidsz;
2132 2420 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2133 2421 cpi->cpi_ncore_per_chip);
2134 2422 }
2135 2423
2136 2424 static void
2137 2425 spec_uarch_flush_noop(void)
2138 2426 {
2139 2427 }
2140 2428
2141 2429 /*
2142 2430 * When microcode is present that mitigates MDS, this wrmsr will also flush the
2143 2431 * MDS-related micro-architectural state that would normally happen by calling
2144 2432 * x86_md_clear().
2145 2433 */
2146 2434 static void
2147 2435 spec_uarch_flush_msr(void)
2148 2436 {
2149 2437 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2150 2438 }
2151 2439
2152 2440 /*
2153 2441 * This function points to a function that will flush certain
2154 2442 * micro-architectural state on the processor. This flush is used to mitigate
2155 2443 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2156 2444 * function can point to one of three functions:
2157 2445 *
2158 2446 	 * - A noop, used either because we are vulnerable but do not have
2159 2447 	 *   microcode available to help deal with a fix, or because we aren't
2160 2448 	 *   vulnerable at all.
2161 2449 *
2162 2450 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2163 2451 * mitigate MDS is present, also perform the equivalent of the MDS flush;
2164 2452 * however, it only flushes the MDS related micro-architectural state on the
2165 2453 * current hyperthread, it does not do anything for the twin.
2166 2454 *
2167 2455 * - x86_md_clear which will flush the MDS related state. This is done when we
2168 2456 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2169 2457 * (RDCL_NO is set).
2170 2458 */
2171 2459 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
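/*
 * Illustrative sketch, not part of this change: code that is about to drop
 * to a lesser privilege level or halt the CPU invokes the hook through the
 * pointer, so whichever flush routine cpuid_update_l1d_flush() selects below
 * is one indirect call away. The call-site name here is hypothetical.
 */
static void
example_return_to_lesser_privilege(void)
{
	(*spec_uarch_flush)();	/* flush u-arch state per the current policy */
}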
2172 2460
2173 -void (*x86_md_clear)(void) = x86_md_clear_noop;
2174 -
2175 2461 static void
2176 2462 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2177 2463 {
2178 2464 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2179 2465
2180 2466 /*
2181 2467 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2182 2468 * has been fixed in hardware, it doesn't cover everything related to
2183 2469 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2184 2470 * need to mitigate this.
2185 2471 */
2186 2472 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2187 2473 is_x86_feature(featureset, X86FSET_MDS_NO)) {
2188 - x86_md_clear = x86_md_clear_noop;
2189 - membar_producer();
2190 2474 return;
2191 2475 }
2192 2476
2193 2477 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2194 - x86_md_clear = x86_md_clear_verw;
2478 + const uint8_t nop = NOP_INSTR;
2479 + uint8_t *md = (uint8_t *)x86_md_clear;
2480 +
2481 + *md = nop;
2195 2482 }
2196 2483
2197 2484 membar_producer();
2198 2485 }
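/*
 * A note on the patch above (this describes the paired assembly routine,
 * which is not visible in this file, so treat it as an assumption):
 * x86_md_clear is expected to begin with a one-byte return placeholder.
 * Overwriting that byte with NOP_INSTR is what makes the verw-based flush
 * behind it reachable; on CPUs that are not Intel, have MDS_NO, or lack
 * MD_CLEAR, the routine is left alone and stays an immediate return.
 */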
2199 2486
2200 2487 static void
2201 2488 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2202 2489 {
2203 2490 boolean_t need_l1d, need_mds;
2204 2491 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2205 2492
2206 2493 /*
2207 2494 * If we're not on Intel or we've mitigated both RDCL and MDS in
2208 2495 * hardware, then there's nothing left for us to do for enabling the
2209 2496 * flush. We can also go ahead and say that SMT exclusion is
2210 2497 * unnecessary.
2211 2498 */
2212 2499 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2213 2500 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2214 2501 is_x86_feature(featureset, X86FSET_MDS_NO))) {
2215 2502 extern int smt_exclusion;
2216 2503 smt_exclusion = 0;
2217 2504 spec_uarch_flush = spec_uarch_flush_noop;
2218 2505 membar_producer();
2219 2506 return;
2220 2507 }
2221 2508
2222 2509 /*
2223 2510 	 * The locations where we need to perform an L1D flush are the same ones
2224 2511 	 * required for mitigating both L1TF and MDS. When verw support is present in
2225 2512 * microcode, then the L1D flush will take care of doing that as well.
2226 2513 * However, if we have a system where RDCL_NO is present, but we don't
2227 2514 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2228 2515 * L1D flush.
2229 2516 */
2230 2517 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2231 2518 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2232 2519 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2233 2520 need_l1d = B_TRUE;
2234 2521 } else {
2235 2522 need_l1d = B_FALSE;
2236 2523 }
2237 2524
2238 2525 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2239 2526 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2240 2527 need_mds = B_TRUE;
2241 2528 } else {
2242 2529 need_mds = B_FALSE;
2243 2530 }
2244 2531
2245 2532 if (need_l1d) {
2246 2533 spec_uarch_flush = spec_uarch_flush_msr;
2247 2534 } else if (need_mds) {
2248 2535 spec_uarch_flush = x86_md_clear;
2249 2536 } else {
2250 2537 /*
2251 2538 * We have no hardware mitigations available to us.
2252 2539 */
2253 2540 spec_uarch_flush = spec_uarch_flush_noop;
2254 2541 }
2255 2542 membar_producer();
2256 2543 }
2257 2544
2545 +/*
2546 + * We default to enabling RSB mitigations.
2547 + */
2258 2548 static void
2549 +cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2550 +{
2551 + const uint8_t ret = RET_INSTR;
2552 + uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2553 +
2554 + switch (mit) {
2555 + case X86_SPECTREV2_ENHANCED_IBRS:
2556 + case X86_SPECTREV2_DISABLED:
2557 + *stuff = ret;
2558 + break;
2559 + default:
2560 + break;
2561 + }
2562 +}
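/*
 * This is the inverse of the MD_CLEAR patch: x86_rsb_stuff is assumed to be
 * active by default, and writing RET_INSTR over its first byte turns it into
 * an immediate return, i.e. RSB stuffing is switched off when enhanced IBRS
 * is in use or the mitigation has been disabled outright.
 */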
2563 +
2564 +static void
2565 +cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2566 +{
2567 + const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2568 + "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2569 + "_r14", "_r15" };
2570 + const uint_t nthunks = ARRAY_SIZE(thunks);
2571 + const char *type;
2572 + uint_t i;
2573 +
2574 + if (mit == x86_spectrev2_mitigation)
2575 + return;
2576 +
2577 + switch (mit) {
2578 + case X86_SPECTREV2_RETPOLINE:
2579 + type = "gen";
2580 + break;
2581 + case X86_SPECTREV2_RETPOLINE_AMD:
2582 + type = "amd";
2583 + break;
2584 + case X86_SPECTREV2_ENHANCED_IBRS:
2585 + case X86_SPECTREV2_DISABLED:
2586 + type = "jmp";
2587 + break;
2588 + default:
2589 +		panic("asked to update retpoline state with unknown state!");
2590 + }
2591 +
2592 + for (i = 0; i < nthunks; i++) {
2593 + uintptr_t source, dest;
2594 + int ssize, dsize;
2595 + char sourcebuf[64], destbuf[64];
2596 + size_t len;
2597 +
2598 + (void) snprintf(destbuf, sizeof (destbuf),
2599 + "__x86_indirect_thunk%s", thunks[i]);
2600 + (void) snprintf(sourcebuf, sizeof (sourcebuf),
2601 + "__x86_indirect_thunk_%s%s", type, thunks[i]);
2602 +
2603 + source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2604 + dest = kobj_getelfsym(destbuf, NULL, &dsize);
2605 + VERIFY3U(source, !=, 0);
2606 + VERIFY3U(dest, !=, 0);
2607 + VERIFY3S(dsize, >=, ssize);
2608 + bcopy((void *)source, (void *)dest, ssize);
2609 + }
2610 +}
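/*
 * Worked example of the loop above for the generic retpoline case and the
 * %rax thunk: sourcebuf becomes "__x86_indirect_thunk_gen_rax" and destbuf
 * becomes "__x86_indirect_thunk_rax", so the bytes of the generic thunk are
 * copied over the default thunk that compiled code actually jumps through.
 */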
2611 +
2612 +static void
2613 +cpuid_enable_enhanced_ibrs(void)
2614 +{
2615 + uint64_t val;
2616 +
2617 + val = rdmsr(MSR_IA32_SPEC_CTRL);
2618 + val |= IA32_SPEC_CTRL_IBRS;
2619 + wrmsr(MSR_IA32_SPEC_CTRL, val);
2620 +}
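/*
 * With enhanced IBRS the bit is set once and simply left on; unlike basic
 * IBRS there is no need to toggle MSR_IA32_SPEC_CTRL around every kernel
 * entry and exit, which is why the enable is a plain read-modify-write here.
 */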
2621 +
2622 +#ifndef __xpv
2623 +/*
2624 + * Determine whether or not we can use the AMD optimized retpoline
2625 + * functionality. We use this when we know we're on an AMD system and we can
2626 + * successfully verify that lfence is dispatch serializing.
2627 + */
2628 +static boolean_t
2629 +cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2630 +{
2631 + uint64_t val;
2632 + on_trap_data_t otd;
2633 +
2634 + if (cpi->cpi_vendor != X86_VENDOR_AMD)
2635 + return (B_FALSE);
2636 +
2637 + /*
2638 + * We need to determine whether or not lfence is serializing. It always
2639 + * is on families 0xf and 0x11. On others, it's controlled by
2640 + * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2641 + * crazy old family, don't try and do anything.
2642 + */
2643 + if (cpi->cpi_family < 0xf)
2644 + return (B_FALSE);
2645 + if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2646 + return (B_TRUE);
2647 +
2648 + /*
2649 + * While it may be tempting to use get_hwenv(), there are no promises
2650 + * that a hypervisor will actually declare themselves to be so in a
2651 + * friendly way. As such, try to read and set the MSR. If we can then
2652 + * read back the value we set (it wasn't just set to zero), then we go
2653 + * for it.
2654 + */
2655 + if (!on_trap(&otd, OT_DATA_ACCESS)) {
2656 + val = rdmsr(MSR_AMD_DECODE_CONFIG);
2657 + val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2658 + wrmsr(MSR_AMD_DECODE_CONFIG, val);
2659 + val = rdmsr(MSR_AMD_DECODE_CONFIG);
2660 + } else {
2661 + val = 0;
2662 + }
2663 + no_trap();
2664 +
2665 + if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2666 + return (B_TRUE);
2667 + return (B_FALSE);
2668 +}
2669 +#endif /* !__xpv */
2670 +
2671 +static void
2259 2672 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2260 2673 {
2261 2674 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2675 + x86_spectrev2_mitigation_t v2mit;
2262 2676
2263 2677 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2264 2678 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2265 2679 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2266 2680 add_x86_feature(featureset, X86FSET_IBPB);
2267 2681 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2268 2682 add_x86_feature(featureset, X86FSET_IBRS);
2269 2683 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2270 2684 add_x86_feature(featureset, X86FSET_STIBP);
2271 - if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)
2272 - add_x86_feature(featureset, X86FSET_IBRS_ALL);
2273 2685 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2274 2686 add_x86_feature(featureset, X86FSET_STIBP_ALL);
2275 - if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS)
2276 - add_x86_feature(featureset, X86FSET_RSBA);
2277 2687 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2278 2688 add_x86_feature(featureset, X86FSET_SSBD);
2279 2689 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2280 2690 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2281 2691 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2282 2692 add_x86_feature(featureset, X86FSET_SSB_NO);
2693 + /*
2694 + * Don't enable enhanced IBRS unless we're told that we should
2695 + * prefer it and it has the same semantics as Intel. This is
2696 + * split into two bits rather than a single one.
2697 + */
2698 + if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2699 + (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2700 + add_x86_feature(featureset, X86FSET_IBRS_ALL);
2701 + }
2702 +
2283 2703 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2284 2704 cpi->cpi_maxeax >= 7) {
2285 2705 struct cpuid_regs *ecp;
2286 2706 ecp = &cpi->cpi_std[7];
2287 2707
2288 2708 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2289 2709 add_x86_feature(featureset, X86FSET_MD_CLEAR);
2290 2710 }
2291 2711
2292 2712 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2293 2713 add_x86_feature(featureset, X86FSET_IBRS);
2294 2714 add_x86_feature(featureset, X86FSET_IBPB);
2295 2715 }
2296 2716
2297 2717 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2298 2718 add_x86_feature(featureset, X86FSET_STIBP);
2299 2719 }
2300 2720
2301 2721 /*
2302 2722 * Don't read the arch caps MSR on xpv where we lack the
2303 2723 * on_trap().
2304 2724 */
2305 2725 #ifndef __xpv
2306 2726 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2307 2727 on_trap_data_t otd;
2308 2728
2309 2729 /*
2310 2730 * Be paranoid and assume we'll get a #GP.
2311 2731 */
2312 2732 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2313 2733 uint64_t reg;
2314 2734
2315 2735 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2316 2736 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2317 2737 add_x86_feature(featureset,
2318 2738 X86FSET_RDCL_NO);
2319 2739 }
2320 2740 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2321 2741 add_x86_feature(featureset,
2322 2742 X86FSET_IBRS_ALL);
2323 2743 }
2324 2744 if (reg & IA32_ARCH_CAP_RSBA) {
2325 2745 add_x86_feature(featureset,
2326 2746 X86FSET_RSBA);
2327 2747 }
2328 2748 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2329 2749 add_x86_feature(featureset,
2330 2750 X86FSET_L1D_VM_NO);
2331 2751 }
2332 2752 if (reg & IA32_ARCH_CAP_SSB_NO) {
2333 2753 add_x86_feature(featureset,
2334 2754 X86FSET_SSB_NO);
2335 2755 }
2336 2756 if (reg & IA32_ARCH_CAP_MDS_NO) {
2337 2757 add_x86_feature(featureset,
2338 2758 X86FSET_MDS_NO);
2339 2759 }
2340 2760 }
2341 2761 no_trap();
2342 2762 }
2343 2763 #endif /* !__xpv */
2344 2764
2345 2765 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2346 2766 add_x86_feature(featureset, X86FSET_SSBD);
2347 2767
2348 2768 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2349 2769 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2350 2770 }
2351 2771
2352 - if (cpu->cpu_id != 0)
2772 + if (cpu->cpu_id != 0) {
2773 + if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2774 + cpuid_enable_enhanced_ibrs();
2775 + }
2353 2776 return;
2777 + }
2354 2778
2355 2779 /*
2780 + * Go through and initialize various security mechanisms that we should
2781 + * only do on a single CPU. This includes Spectre V2, L1TF, and MDS.
2782 + */
2783 +
2784 + /*
2785 + * By default we've come in with retpolines enabled. Check whether we
2786 + * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2787 + * by default, but disabled if we are using enhanced IBRS.
2788 + */
2789 + if (x86_disable_spectrev2 != 0) {
2790 + v2mit = X86_SPECTREV2_DISABLED;
2791 + } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2792 + cpuid_enable_enhanced_ibrs();
2793 + v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2794 +#ifndef __xpv
2795 + } else if (cpuid_use_amd_retpoline(cpi)) {
2796 + v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2797 +#endif /* !__xpv */
2798 + } else {
2799 + v2mit = X86_SPECTREV2_RETPOLINE;
2800 + }
2801 +
2802 + cpuid_patch_retpolines(v2mit);
2803 + cpuid_patch_rsb(v2mit);
2804 + x86_spectrev2_mitigation = v2mit;
2805 + membar_producer();
2806 +
2807 + /*
2356 2808 * We need to determine what changes are required for mitigating L1TF
2357 2809 * and MDS. If the CPU suffers from either of them, then SMT exclusion
2358 2810 * is required.
2359 2811 *
2360 2812 * If any of these are present, then we need to flush u-arch state at
2361 2813 * various points. For MDS, we need to do so whenever we change to a
2362 2814 * lesser privilege level or we are halting the CPU. For L1TF we need to
2363 2815 * flush the L1D cache at VM entry. When we have microcode that handles
2364 2816 * MDS, the L1D flush also clears the other u-arch state that the
2365 2817 * md_clear does.
2366 2818 */
2367 2819
2368 2820 /*
2369 2821 * Update whether or not we need to be taking explicit action against
2370 2822 * MDS.
2371 2823 */
2372 2824 cpuid_update_md_clear(cpu, featureset);
2373 2825
2374 2826 /*
2375 2827 * Determine whether SMT exclusion is required and whether or not we
2376 2828 * need to perform an l1d flush.
2377 2829 */
2378 2830 cpuid_update_l1d_flush(cpu, featureset);
2379 2831 }
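/*
 * Usage note (an assumption, not part of this diff): x86_disable_spectrev2
 * is an ordinary kernel global consulted above, so the DISABLED path can be
 * requested at boot through the usual tunable mechanism, for example in
 * /etc/system:
 *
 *	set x86_disable_spectrev2 = 1
 */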
2380 2832
2381 2833 /*
2382 2834 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
2383 2835 */
2384 2836 void
2385 2837 setup_xfem(void)
2386 2838 {
2387 2839 uint64_t flags = XFEATURE_LEGACY_FP;
2388 2840
2389 2841 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2390 2842
2391 2843 if (is_x86_feature(x86_featureset, X86FSET_SSE))
2392 2844 flags |= XFEATURE_SSE;
2393 2845
2394 2846 if (is_x86_feature(x86_featureset, X86FSET_AVX))
2395 2847 flags |= XFEATURE_AVX;
2396 2848
2397 2849 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2398 2850 flags |= XFEATURE_AVX512;
2399 2851
2400 2852 set_xcr(XFEATURE_ENABLED_MASK, flags);
2401 2853
2402 2854 xsave_bv_all = flags;
2403 2855 }
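/*
 * Concrete example of the mask built above: on a CPU with SSE, AVX and
 * AVX-512 support, flags ends up as
 * XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX | XFEATURE_AVX512,
 * which is the value handed to set_xcr() and recorded in xsave_bv_all.
 */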
2404 2856
2405 2857 static void
2406 2858 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2407 2859 {
2408 2860 struct cpuid_info *cpi;
2409 2861
2410 2862 cpi = cpu->cpu_m.mcpu_cpi;
2411 2863
2412 2864 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2413 2865 cpuid_gather_amd_topology_leaves(cpu);
2414 2866 }
2415 2867
2416 2868 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2417 2869
2418 2870 /*
2419 2871 * Before we can calculate the IDs that we should assign to this
2420 2872 * processor, we need to understand how many cores and threads it has.
2421 2873 */
2422 2874 switch (cpi->cpi_vendor) {
2423 2875 case X86_VENDOR_Intel:
2424 2876 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2425 2877 &cpi->cpi_ncore_per_chip);
2426 2878 break;
2427 2879 case X86_VENDOR_AMD:
2428 2880 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2429 2881 &cpi->cpi_ncore_per_chip);
2430 2882 break;
2431 2883 default:
2432 2884 /*
2433 2885 * If we have some other x86 compatible chip, it's not clear how
2434 2886 * they would behave. The most common case is virtualization
2435 2887 * today, though there are also 64-bit VIA chips. Assume that
2436 2888 * all we can get is the basic Leaf 1 HTT information.
2437 2889 */
2438 2890 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2439 2891 cpi->cpi_ncore_per_chip = 1;
2440 2892 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2441 2893 }
2442 2894 break;
2443 2895 }
2444 2896
2445 2897 /*
2446 2898 * Based on the calculated number of threads and cores, potentially
2447 2899 * assign the HTT and CMT features.
2448 2900 */
2449 2901 if (cpi->cpi_ncore_per_chip > 1) {
2450 2902 add_x86_feature(featureset, X86FSET_CMP);
2451 2903 }
2452 2904
2453 2905 if (cpi->cpi_ncpu_per_chip > 1 &&
2454 2906 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2455 2907 add_x86_feature(featureset, X86FSET_HTT);
2456 2908 }
2457 2909
2458 2910 /*
2459 2911 * Now that has been set up, we need to go through and calculate all of
2460 2912 * the rest of the parameters that exist. If we think the CPU doesn't
2461 2913 * have either SMT (HTT) or CMP, then we basically go through and fake
2462 2914 * up information in some way. The most likely case for this is
2463 2915 * virtualization where we have a lot of partial topology information.
2464 2916 */
2465 2917 if (!is_x86_feature(featureset, X86FSET_HTT) &&
2466 2918 !is_x86_feature(featureset, X86FSET_CMP)) {
2467 2919 /*
2468 2920 * This is a single core, single-threaded processor.
2469 2921 */
2470 2922 cpi->cpi_procnodes_per_pkg = 1;
2471 2923 cpi->cpi_cores_per_compunit = 1;
2472 2924 cpi->cpi_compunitid = 0;
2473 2925 cpi->cpi_chipid = -1;
2474 2926 cpi->cpi_clogid = 0;
2475 2927 cpi->cpi_coreid = cpu->cpu_id;
2476 2928 cpi->cpi_pkgcoreid = 0;
2477 2929 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2478 2930 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2479 2931 } else {
2480 2932 cpi->cpi_procnodeid = cpi->cpi_chipid;
2481 2933 }
2482 2934 } else {
2483 2935 switch (cpi->cpi_vendor) {
2484 2936 case X86_VENDOR_Intel:
2485 2937 cpuid_intel_getids(cpu, featureset);
2486 2938 break;
2487 2939 case X86_VENDOR_AMD:
2488 2940 cpuid_amd_getids(cpu, featureset);
2489 2941 break;
2490 2942 default:
2491 2943 /*
2492 2944 * In this case, it's hard to say what we should do.
2493 2945 * We're going to model them to the OS as single core
2494 2946 * threads. We don't have a good identifier for them, so
2495 2947 * we're just going to use the cpu id all on a single
2496 2948 * chip.
2497 2949 *
2498 2950 * This case has historically been different from the
2499 2951 * case above where we don't have HTT or CMP. While they
2500 2952 * could be combined, we've opted to keep it separate to
2501 2953 * minimize the risk of topology changes in weird cases.
2502 2954 */
2503 2955 cpi->cpi_procnodes_per_pkg = 1;
2504 2956 cpi->cpi_cores_per_compunit = 1;
2505 2957 cpi->cpi_chipid = 0;
2506 2958 cpi->cpi_coreid = cpu->cpu_id;
2507 2959 cpi->cpi_clogid = cpu->cpu_id;
2508 2960 cpi->cpi_pkgcoreid = cpu->cpu_id;
2509 2961 cpi->cpi_procnodeid = cpi->cpi_chipid;
2510 2962 cpi->cpi_compunitid = cpi->cpi_coreid;
2511 2963 break;
2512 2964 }
2513 2965 }
2514 2966 }
2515 2967
2516 2968 /*
2517 2969 * Gather relevant CPU features from leaf 6 which covers thermal information. We
2518 2970 * always gather leaf 6 if it's supported; however, we only look for features on
2519 2971 * Intel systems as AMD does not currently define any of the features we look
2520 2972 * for below.
2521 2973 */
2522 2974 static void
2523 2975 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
2524 2976 {
2525 2977 struct cpuid_regs *cp;
2526 2978 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2527 2979
2528 2980 if (cpi->cpi_maxeax < 6) {
2529 2981 return;
2530 2982 }
2531 2983
2532 2984 cp = &cpi->cpi_std[6];
2533 2985 cp->cp_eax = 6;
2534 2986 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
2535 2987 (void) __cpuid_insn(cp);
2536 2988 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
2537 2989
2538 2990 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2539 2991 return;
2540 2992 }
2541 2993
2542 2994 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
2543 2995 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
2544 2996 }
2545 2997
2546 2998 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
2547 2999 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
2548 3000 }
2549 3001 }
2550 3002
2551 3003 void
2552 3004 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
2553 3005 {
2554 3006 uint32_t mask_ecx, mask_edx;
2555 3007 struct cpuid_info *cpi;
2556 3008 struct cpuid_regs *cp;
2557 3009 int xcpuid;
2558 3010 #if !defined(__xpv)
2559 3011 extern int idle_cpu_prefer_mwait;
2560 3012 #endif
2561 3013
2562 3014 /*
2563 3015 * Space statically allocated for BSP, ensure pointer is set
2564 3016 */
2565 3017 if (cpu->cpu_id == 0) {
2566 3018 if (cpu->cpu_m.mcpu_cpi == NULL)
2567 3019 cpu->cpu_m.mcpu_cpi = &cpuid_info0;
2568 3020 }
2569 3021
2570 3022 add_x86_feature(featureset, X86FSET_CPUID);
2571 3023
2572 3024 cpi = cpu->cpu_m.mcpu_cpi;
2573 3025 ASSERT(cpi != NULL);
2574 3026 cp = &cpi->cpi_std[0];
2575 3027 cp->cp_eax = 0;
2576 3028 cpi->cpi_maxeax = __cpuid_insn(cp);
2577 3029 {
2578 3030 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
2579 3031 *iptr++ = cp->cp_ebx;
2580 3032 *iptr++ = cp->cp_edx;
2581 3033 *iptr++ = cp->cp_ecx;
2582 3034 *(char *)&cpi->cpi_vendorstr[12] = '\0';
2583 3035 }
2584 3036
2585 3037 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
2586 3038 x86_vendor = cpi->cpi_vendor; /* for compatibility */
2587 3039
2588 3040 /*
2589 3041 * Limit the range in case of weird hardware
2590 3042 */
2591 3043 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
2592 3044 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
2593 3045 if (cpi->cpi_maxeax < 1)
2594 3046 goto pass1_done;
2595 3047
2596 3048 cp = &cpi->cpi_std[1];
2597 3049 cp->cp_eax = 1;
2598 3050 (void) __cpuid_insn(cp);
2599 3051
2600 3052 /*
2601 3053 * Extract identifying constants for easy access.
2602 3054 */
2603 3055 cpi->cpi_model = CPI_MODEL(cpi);
2604 3056 cpi->cpi_family = CPI_FAMILY(cpi);
2605 3057
2606 3058 if (cpi->cpi_family == 0xf)
2607 3059 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
2608 3060
2609 3061 /*
2610 3062 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
2611 3063 * Intel, and presumably everyone else, uses model == 0xf, as
2612 3064 * one would expect (max value means possible overflow). Sigh.
2613 3065 */
2614 3066
2615 3067 switch (cpi->cpi_vendor) {
2616 3068 case X86_VENDOR_Intel:
2617 3069 if (IS_EXTENDED_MODEL_INTEL(cpi))
2618 3070 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
2619 3071 break;
2620 3072 case X86_VENDOR_AMD:
2621 3073 if (CPI_FAMILY(cpi) == 0xf)
2622 3074 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
2623 3075 break;
2624 3076 default:
2625 3077 if (cpi->cpi_model == 0xf)
2626 3078 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
2627 3079 break;
2628 3080 }
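/*
 * Worked example of the composition above: an Intel part reporting base
 * family 0x6, base model 0xe and extended model 0x3 ends up with
 * cpi_model 0x3e, while a part reporting base family 0xf and extended
 * family 0x2 ends up with cpi_family 0x11.
 */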
2629 3081
2630 3082 cpi->cpi_step = CPI_STEP(cpi);
2631 3083 cpi->cpi_brandid = CPI_BRANDID(cpi);
2632 3084
2633 3085 /*
2634 3086 * *default* assumptions:
2635 3087 * - believe %edx feature word
2636 3088 * - ignore %ecx feature word
2637 3089 * - 32-bit virtual and physical addressing
2638 3090 */
2639 3091 mask_edx = 0xffffffff;
2640 3092 mask_ecx = 0;
2641 3093
2642 3094 cpi->cpi_pabits = cpi->cpi_vabits = 32;
2643 3095
2644 3096 switch (cpi->cpi_vendor) {
2645 3097 case X86_VENDOR_Intel:
2646 3098 if (cpi->cpi_family == 5)
2647 3099 x86_type = X86_TYPE_P5;
2648 3100 else if (IS_LEGACY_P6(cpi)) {
2649 3101 x86_type = X86_TYPE_P6;
2650 3102 pentiumpro_bug4046376 = 1;
2651 3103 /*
2652 3104 * Clear the SEP bit when it was set erroneously
2653 3105 */
2654 3106 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
2655 3107 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
2656 3108 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
2657 3109 x86_type = X86_TYPE_P4;
2658 3110 /*
2659 3111 * We don't currently depend on any of the %ecx
2660 3112 * features until Prescott, so we'll only check
2661 3113 * this from P4 onwards. We might want to revisit
2662 3114 * that idea later.
2663 3115 */
2664 3116 mask_ecx = 0xffffffff;
2665 3117 } else if (cpi->cpi_family > 0xf)
2666 3118 mask_ecx = 0xffffffff;
2667 3119 /*
2668 3120 * We don't support MONITOR/MWAIT if leaf 5 is not available
2669 3121 * to obtain the monitor linesize.
2670 3122 */
2671 3123 if (cpi->cpi_maxeax < 5)
2672 3124 mask_ecx &= ~CPUID_INTC_ECX_MON;
2673 3125 break;
2674 3126 case X86_VENDOR_IntelClone:
2675 3127 default:
2676 3128 break;
2677 3129 case X86_VENDOR_AMD:
2678 3130 #if defined(OPTERON_ERRATUM_108)
2679 3131 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
2680 3132 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
2681 3133 cpi->cpi_model = 0xc;
2682 3134 } else
2683 3135 #endif
2684 3136 if (cpi->cpi_family == 5) {
2685 3137 /*
2686 3138 * AMD K5 and K6
2687 3139 *
2688 3140 * These CPUs have an incomplete implementation
2689 3141 * of MCA/MCE which we mask away.
2690 3142 */
2691 3143 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
2692 3144
2693 3145 /*
2694 3146 * Model 0 uses the wrong (APIC) bit
2695 3147 * to indicate PGE. Fix it here.
2696 3148 */
2697 3149 if (cpi->cpi_model == 0) {
2698 3150 if (cp->cp_edx & 0x200) {
2699 3151 cp->cp_edx &= ~0x200;
2700 3152 cp->cp_edx |= CPUID_INTC_EDX_PGE;
2701 3153 }
2702 3154 }
2703 3155
2704 3156 /*
2705 3157 * Early models had problems w/ MMX; disable.
2706 3158 */
2707 3159 if (cpi->cpi_model < 6)
2708 3160 mask_edx &= ~CPUID_INTC_EDX_MMX;
2709 3161 }
2710 3162
2711 3163 /*
2712 3164 * For newer families, SSE3 and CX16, at least, are valid;
2713 3165 * enable all
2714 3166 */
2715 3167 if (cpi->cpi_family >= 0xf)
2716 3168 mask_ecx = 0xffffffff;
2717 3169 /*
2718 3170 * We don't support MONITOR/MWAIT if leaf 5 is not available
2719 3171 * to obtain the monitor linesize.
2720 3172 */
2721 3173 if (cpi->cpi_maxeax < 5)
2722 3174 mask_ecx &= ~CPUID_INTC_ECX_MON;
2723 3175
2724 3176 #if !defined(__xpv)
2725 3177 /*
2726 3178 * AMD has not historically used MWAIT in the CPU's idle loop.
2727 3179 * Pre-family-10h Opterons do not have the MWAIT instruction. We
2728 3180 * know for certain that in at least family 17h, per AMD, mwait
2729 3181 * is preferred. Families in-between are less certain.
2730 3182 */
2731 3183 if (cpi->cpi_family < 0x17) {
2732 3184 idle_cpu_prefer_mwait = 0;
2733 3185 }
2734 3186 #endif
2735 3187
2736 3188 break;
2737 3189 case X86_VENDOR_TM:
2738 3190 /*
2739 3191 * workaround the NT workaround in CMS 4.1
2740 3192 */
2741 3193 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
2742 3194 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
2743 3195 cp->cp_edx |= CPUID_INTC_EDX_CX8;
2744 3196 break;
2745 3197 case X86_VENDOR_Centaur:
2746 3198 /*
2747 3199 * workaround the NT workarounds again
2748 3200 */
2749 3201 if (cpi->cpi_family == 6)
2750 3202 cp->cp_edx |= CPUID_INTC_EDX_CX8;
2751 3203 break;
2752 3204 case X86_VENDOR_Cyrix:
2753 3205 /*
2754 3206 * We rely heavily on the probing in locore
2755 3207 * to actually figure out what parts, if any,
2756 3208 * of the Cyrix cpuid instruction to believe.
2757 3209 */
2758 3210 switch (x86_type) {
2759 3211 case X86_TYPE_CYRIX_486:
2760 3212 mask_edx = 0;
2761 3213 break;
2762 3214 case X86_TYPE_CYRIX_6x86:
2763 3215 mask_edx = 0;
2764 3216 break;
2765 3217 case X86_TYPE_CYRIX_6x86L:
2766 3218 mask_edx =
2767 3219 CPUID_INTC_EDX_DE |
2768 3220 CPUID_INTC_EDX_CX8;
2769 3221 break;
2770 3222 case X86_TYPE_CYRIX_6x86MX:
2771 3223 mask_edx =
2772 3224 CPUID_INTC_EDX_DE |
2773 3225 CPUID_INTC_EDX_MSR |
2774 3226 CPUID_INTC_EDX_CX8 |
2775 3227 CPUID_INTC_EDX_PGE |
2776 3228 CPUID_INTC_EDX_CMOV |
2777 3229 CPUID_INTC_EDX_MMX;
2778 3230 break;
2779 3231 case X86_TYPE_CYRIX_GXm:
2780 3232 mask_edx =
2781 3233 CPUID_INTC_EDX_MSR |
2782 3234 CPUID_INTC_EDX_CX8 |
2783 3235 CPUID_INTC_EDX_CMOV |
2784 3236 CPUID_INTC_EDX_MMX;
2785 3237 break;
2786 3238 case X86_TYPE_CYRIX_MediaGX:
2787 3239 break;
2788 3240 case X86_TYPE_CYRIX_MII:
2789 3241 case X86_TYPE_VIA_CYRIX_III:
2790 3242 mask_edx =
2791 3243 CPUID_INTC_EDX_DE |
2792 3244 CPUID_INTC_EDX_TSC |
2793 3245 CPUID_INTC_EDX_MSR |
2794 3246 CPUID_INTC_EDX_CX8 |
2795 3247 CPUID_INTC_EDX_PGE |
2796 3248 CPUID_INTC_EDX_CMOV |
2797 3249 CPUID_INTC_EDX_MMX;
2798 3250 break;
2799 3251 default:
2800 3252 break;
2801 3253 }
2802 3254 break;
2803 3255 }
2804 3256
2805 3257 #if defined(__xpv)
2806 3258 /*
2807 3259 * Do not support MONITOR/MWAIT under a hypervisor
2808 3260 */
2809 3261 mask_ecx &= ~CPUID_INTC_ECX_MON;
2810 3262 /*
2811 3263 * Do not support XSAVE under a hypervisor for now
2812 3264 */
2813 3265 xsave_force_disable = B_TRUE;
2814 3266
2815 3267 #endif /* __xpv */
2816 3268
2817 3269 if (xsave_force_disable) {
2818 3270 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
2819 3271 mask_ecx &= ~CPUID_INTC_ECX_AVX;
2820 3272 mask_ecx &= ~CPUID_INTC_ECX_F16C;
2821 3273 mask_ecx &= ~CPUID_INTC_ECX_FMA;
2822 3274 }
2823 3275
2824 3276 /*
2825 3277 * Now we've figured out the masks that determine
2826 3278 * which bits we choose to believe, apply the masks
2827 3279 * to the feature words, then map the kernel's view
2828 3280 * of these feature words into its feature word.
2829 3281 */
2830 3282 cp->cp_edx &= mask_edx;
2831 3283 cp->cp_ecx &= mask_ecx;
2832 3284
2833 3285 /*
2834 3286 * apply any platform restrictions (we don't call this
2835 3287 * immediately after __cpuid_insn here, because we need the
2836 3288 * workarounds applied above first)
2837 3289 */
2838 3290 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
2839 3291
2840 3292 /*
2841 3293 * In addition to ecx and edx, Intel and AMD are storing a bunch of
2842 3294 * instruction set extensions in leaf 7's ebx, ecx, and edx.
2843 3295 */
2844 3296 if (cpi->cpi_maxeax >= 7) {
2845 3297 struct cpuid_regs *ecp;
2846 3298 ecp = &cpi->cpi_std[7];
2847 3299 ecp->cp_eax = 7;
2848 3300 ecp->cp_ecx = 0;
2849 3301 (void) __cpuid_insn(ecp);
2850 3302
2851 3303 /*
2852 3304 * If XSAVE has been disabled, just ignore all of the
2853 3305 * extended-save-area dependent flags here.
2854 3306 */
2855 3307 if (xsave_force_disable) {
2856 3308 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
2857 3309 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
2858 3310 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
2859 3311 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
2860 3312 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
2861 3313 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
2862 3314 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
2863 3315 }
2864 3316
2865 3317 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
2866 3318 add_x86_feature(featureset, X86FSET_SMEP);
2867 3319
2868 3320 /*
2869 3321 * We check disable_smap here in addition to in startup_smap()
2870 3322 * to ensure CPUs that aren't the boot CPU don't accidentally
2871 3323 * include it in the feature set and thus generate a mismatched
2872 3324 * x86 feature set across CPUs.
2873 3325 */
2874 3326 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
2875 3327 disable_smap == 0)
2876 3328 add_x86_feature(featureset, X86FSET_SMAP);
2877 3329
2878 3330 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
2879 3331 add_x86_feature(featureset, X86FSET_RDSEED);
2880 3332
2881 3333 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
2882 3334 add_x86_feature(featureset, X86FSET_ADX);
2883 3335
2884 3336 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
2885 3337 add_x86_feature(featureset, X86FSET_FSGSBASE);
2886 3338
2887 3339 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
2888 3340 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
2889 3341
2890 3342 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
2891 3343 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
2892 3344 add_x86_feature(featureset, X86FSET_INVPCID);
2893 3345
2894 3346 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
2895 3347 add_x86_feature(featureset, X86FSET_MPX);
2896 3348
2897 3349 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
2898 3350 add_x86_feature(featureset, X86FSET_CLWB);
2899 3351 }
2900 3352 }
2901 3353
2902 3354 /*
2903 3355 * fold in overrides from the "eeprom" mechanism
2904 3356 */
2905 3357 cp->cp_edx |= cpuid_feature_edx_include;
2906 3358 cp->cp_edx &= ~cpuid_feature_edx_exclude;
2907 3359
2908 3360 cp->cp_ecx |= cpuid_feature_ecx_include;
2909 3361 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
2910 3362
2911 3363 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
2912 3364 add_x86_feature(featureset, X86FSET_LARGEPAGE);
2913 3365 }
2914 3366 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
2915 3367 add_x86_feature(featureset, X86FSET_TSC);
2916 3368 }
2917 3369 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
2918 3370 add_x86_feature(featureset, X86FSET_MSR);
2919 3371 }
2920 3372 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
2921 3373 add_x86_feature(featureset, X86FSET_MTRR);
2922 3374 }
2923 3375 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
2924 3376 add_x86_feature(featureset, X86FSET_PGE);
2925 3377 }
2926 3378 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
2927 3379 add_x86_feature(featureset, X86FSET_CMOV);
2928 3380 }
2929 3381 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
2930 3382 add_x86_feature(featureset, X86FSET_MMX);
2931 3383 }
2932 3384 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
2933 3385 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
2934 3386 add_x86_feature(featureset, X86FSET_MCA);
2935 3387 }
2936 3388 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
2937 3389 add_x86_feature(featureset, X86FSET_PAE);
2938 3390 }
2939 3391 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
2940 3392 add_x86_feature(featureset, X86FSET_CX8);
2941 3393 }
2942 3394 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
2943 3395 add_x86_feature(featureset, X86FSET_CX16);
2944 3396 }
2945 3397 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
2946 3398 add_x86_feature(featureset, X86FSET_PAT);
2947 3399 }
2948 3400 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
2949 3401 add_x86_feature(featureset, X86FSET_SEP);
2950 3402 }
2951 3403 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
2952 3404 /*
2953 3405 * In our implementation, fxsave/fxrstor
2954 3406 * are prerequisites before we'll even
2955 3407 * try and do SSE things.
2956 3408 */
2957 3409 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
2958 3410 add_x86_feature(featureset, X86FSET_SSE);
2959 3411 }
2960 3412 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
2961 3413 add_x86_feature(featureset, X86FSET_SSE2);
2962 3414 }
2963 3415 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
2964 3416 add_x86_feature(featureset, X86FSET_SSE3);
2965 3417 }
2966 3418 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
2967 3419 add_x86_feature(featureset, X86FSET_SSSE3);
2968 3420 }
2969 3421 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
2970 3422 add_x86_feature(featureset, X86FSET_SSE4_1);
2971 3423 }
2972 3424 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
2973 3425 add_x86_feature(featureset, X86FSET_SSE4_2);
2974 3426 }
2975 3427 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
2976 3428 add_x86_feature(featureset, X86FSET_AES);
2977 3429 }
2978 3430 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
2979 3431 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
2980 3432 }
2981 3433
2982 3434 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
2983 3435 add_x86_feature(featureset, X86FSET_SHA);
2984 3436
2985 3437 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
2986 3438 add_x86_feature(featureset, X86FSET_UMIP);
2987 3439 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
2988 3440 add_x86_feature(featureset, X86FSET_PKU);
2989 3441 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
2990 3442 add_x86_feature(featureset, X86FSET_OSPKE);
2991 3443
2992 3444 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
2993 3445 add_x86_feature(featureset, X86FSET_XSAVE);
2994 3446
2995 3447 /* We only test AVX & AVX512 when there is XSAVE */
2996 3448
2997 3449 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
2998 3450 add_x86_feature(featureset,
2999 3451 X86FSET_AVX);
3000 3452
3001 3453 /*
3002 3454 * Intel says we can't check these without also
3003 3455 * checking AVX.
3004 3456 */
3005 3457 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3006 3458 add_x86_feature(featureset,
3007 3459 X86FSET_F16C);
3008 3460
3009 3461 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3010 3462 add_x86_feature(featureset,
3011 3463 X86FSET_FMA);
3012 3464
3013 3465 if (cpi->cpi_std[7].cp_ebx &
3014 3466 CPUID_INTC_EBX_7_0_BMI1)
3015 3467 add_x86_feature(featureset,
3016 3468 X86FSET_BMI1);
3017 3469
3018 3470 if (cpi->cpi_std[7].cp_ebx &
3019 3471 CPUID_INTC_EBX_7_0_BMI2)
3020 3472 add_x86_feature(featureset,
3021 3473 X86FSET_BMI2);
3022 3474
3023 3475 if (cpi->cpi_std[7].cp_ebx &
3024 3476 CPUID_INTC_EBX_7_0_AVX2)
3025 3477 add_x86_feature(featureset,
3026 3478 X86FSET_AVX2);
3027 3479 }
3028 3480
3029 3481 if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3030 3482 (cpi->cpi_std[7].cp_ebx &
3031 3483 CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3032 3484 add_x86_feature(featureset, X86FSET_AVX512F);
3033 3485
3034 3486 if (cpi->cpi_std[7].cp_ebx &
3035 3487 CPUID_INTC_EBX_7_0_AVX512DQ)
3036 3488 add_x86_feature(featureset,
3037 3489 X86FSET_AVX512DQ);
3038 3490 if (cpi->cpi_std[7].cp_ebx &
3039 3491 CPUID_INTC_EBX_7_0_AVX512IFMA)
3040 3492 add_x86_feature(featureset,
3041 3493 X86FSET_AVX512FMA);
3042 3494 if (cpi->cpi_std[7].cp_ebx &
3043 3495 CPUID_INTC_EBX_7_0_AVX512PF)
3044 3496 add_x86_feature(featureset,
3045 3497 X86FSET_AVX512PF);
3046 3498 if (cpi->cpi_std[7].cp_ebx &
3047 3499 CPUID_INTC_EBX_7_0_AVX512ER)
3048 3500 add_x86_feature(featureset,
3049 3501 X86FSET_AVX512ER);
3050 3502 if (cpi->cpi_std[7].cp_ebx &
3051 3503 CPUID_INTC_EBX_7_0_AVX512CD)
3052 3504 add_x86_feature(featureset,
3053 3505 X86FSET_AVX512CD);
3054 3506 if (cpi->cpi_std[7].cp_ebx &
3055 3507 CPUID_INTC_EBX_7_0_AVX512BW)
3056 3508 add_x86_feature(featureset,
3057 3509 X86FSET_AVX512BW);
3058 3510 if (cpi->cpi_std[7].cp_ebx &
3059 3511 CPUID_INTC_EBX_7_0_AVX512VL)
3060 3512 add_x86_feature(featureset,
3061 3513 X86FSET_AVX512VL);
3062 3514
3063 3515 if (cpi->cpi_std[7].cp_ecx &
3064 3516 CPUID_INTC_ECX_7_0_AVX512VBMI)
3065 3517 add_x86_feature(featureset,
3066 3518 X86FSET_AVX512VBMI);
3067 3519 if (cpi->cpi_std[7].cp_ecx &
3068 3520 CPUID_INTC_ECX_7_0_AVX512VNNI)
3069 3521 add_x86_feature(featureset,
3070 3522 X86FSET_AVX512VNNI);
3071 3523 if (cpi->cpi_std[7].cp_ecx &
3072 3524 CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3073 3525 add_x86_feature(featureset,
3074 3526 X86FSET_AVX512VPOPCDQ);
3075 3527
3076 3528 if (cpi->cpi_std[7].cp_edx &
3077 3529 CPUID_INTC_EDX_7_0_AVX5124NNIW)
3078 3530 add_x86_feature(featureset,
3079 3531 X86FSET_AVX512NNIW);
3080 3532 if (cpi->cpi_std[7].cp_edx &
3081 3533 CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3082 3534 add_x86_feature(featureset,
3083 3535 X86FSET_AVX512FMAPS);
3084 3536 }
3085 3537 }
3086 3538 }
3087 3539
3088 3540 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3089 3541 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3090 3542 add_x86_feature(featureset, X86FSET_PCID);
3091 3543 }
3092 3544 }
3093 3545
3094 3546 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3095 3547 add_x86_feature(featureset, X86FSET_X2APIC);
3096 3548 }
3097 3549 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3098 3550 add_x86_feature(featureset, X86FSET_DE);
3099 3551 }
3100 3552 #if !defined(__xpv)
3101 3553 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3102 3554
3103 3555 /*
3104 3556 * We require the CLFLUSH instruction for erratum workaround
3105 3557 * to use MONITOR/MWAIT.
3106 3558 */
3107 3559 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3108 3560 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3109 3561 add_x86_feature(featureset, X86FSET_MWAIT);
3110 3562 } else {
3111 3563 extern int idle_cpu_assert_cflush_monitor;
3112 3564
3113 3565 /*
3114 3566 * All processors we are aware of which have
3115 3567 * MONITOR/MWAIT also have CLFLUSH.
3116 3568 */
3117 3569 if (idle_cpu_assert_cflush_monitor) {
3118 3570 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3119 3571 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3120 3572 }
3121 3573 }
3122 3574 }
3123 3575 #endif /* __xpv */
3124 3576
3125 3577 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3126 3578 add_x86_feature(featureset, X86FSET_VMX);
3127 3579 }
3128 3580
3129 3581 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3130 3582 add_x86_feature(featureset, X86FSET_RDRAND);
3131 3583
3132 3584 /*
3133 3585 	 * Only needed the first time; the rest of the CPUs follow suit.
3134 3586 	 * We only capture this for the boot CPU.
3135 3587 */
3136 3588 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3137 3589 add_x86_feature(featureset, X86FSET_CLFSH);
3138 3590 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3139 3591 }
3140 3592 if (is_x86_feature(featureset, X86FSET_PAE))
3141 3593 cpi->cpi_pabits = 36;
3142 3594
3143 3595 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3144 3596 struct cpuid_regs r, *ecp;
3145 3597
3146 3598 ecp = &r;
3147 3599 ecp->cp_eax = 0xD;
3148 3600 ecp->cp_ecx = 1;
3149 3601 ecp->cp_edx = ecp->cp_ebx = 0;
3150 3602 (void) __cpuid_insn(ecp);
3151 3603
3152 3604 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3153 3605 add_x86_feature(featureset, X86FSET_XSAVEOPT);
3154 3606 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3155 3607 add_x86_feature(featureset, X86FSET_XSAVEC);
3156 3608 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3157 3609 add_x86_feature(featureset, X86FSET_XSAVES);
3158 3610 }
3159 3611
3160 3612 /*
3161 3613 * Work on the "extended" feature information, doing
3162 3614 * some basic initialization for cpuid_pass2()
3163 3615 */
3164 3616 xcpuid = 0;
3165 3617 switch (cpi->cpi_vendor) {
3166 3618 case X86_VENDOR_Intel:
3167 3619 /*
3168 3620 * On KVM we know we will have proper support for extended
3169 3621 * cpuid.
3170 3622 */
3171 3623 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3172 3624 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3173 3625 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3174 3626 xcpuid++;
3175 3627 break;
3176 3628 case X86_VENDOR_AMD:
3177 3629 if (cpi->cpi_family > 5 ||
3178 3630 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3179 3631 xcpuid++;
3180 3632 break;
3181 3633 case X86_VENDOR_Cyrix:
3182 3634 /*
3183 3635 * Only these Cyrix CPUs are -known- to support
3184 3636 * extended cpuid operations.
3185 3637 */
3186 3638 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3187 3639 x86_type == X86_TYPE_CYRIX_GXm)
3188 3640 xcpuid++;
3189 3641 break;
3190 3642 case X86_VENDOR_Centaur:
3191 3643 case X86_VENDOR_TM:
3192 3644 default:
3193 3645 xcpuid++;
3194 3646 break;
3195 3647 }
3196 3648
3197 3649 if (xcpuid) {
3198 3650 cp = &cpi->cpi_extd[0];
3199 3651 cp->cp_eax = CPUID_LEAF_EXT_0;
3200 3652 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3201 3653 }
3202 3654
3203 3655 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3204 3656
3205 3657 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3206 3658 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3207 3659
3208 3660 switch (cpi->cpi_vendor) {
3209 3661 case X86_VENDOR_Intel:
3210 3662 case X86_VENDOR_AMD:
3211 3663 if (cpi->cpi_xmaxeax < 0x80000001)
3212 3664 break;
3213 3665 cp = &cpi->cpi_extd[1];
3214 3666 cp->cp_eax = 0x80000001;
3215 3667 (void) __cpuid_insn(cp);
3216 3668
3217 3669 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3218 3670 cpi->cpi_family == 5 &&
3219 3671 cpi->cpi_model == 6 &&
3220 3672 cpi->cpi_step == 6) {
3221 3673 /*
3222 3674 * K6 model 6 uses bit 10 to indicate SYSC
3223 3675 * Later models use bit 11. Fix it here.
3224 3676 */
3225 3677 if (cp->cp_edx & 0x400) {
3226 3678 cp->cp_edx &= ~0x400;
3227 3679 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3228 3680 }
3229 3681 }
3230 3682
3231 3683 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3232 3684
3233 3685 /*
3234 3686 * Compute the additions to the kernel's feature word.
3235 3687 */
3236 3688 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3237 3689 add_x86_feature(featureset, X86FSET_NX);
3238 3690 }
3239 3691
3240 3692 /*
3241 3693 * Regardless whether or not we boot 64-bit,
3242 3694 * we should have a way to identify whether
3243 3695 * the CPU is capable of running 64-bit.
3244 3696 */
3245 3697 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3246 3698 add_x86_feature(featureset, X86FSET_64);
3247 3699 }
3248 3700
3249 3701 /* 1 GB large page - enable only for 64 bit kernel */
3250 3702 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3251 3703 add_x86_feature(featureset, X86FSET_1GPG);
3252 3704 }
3253 3705
3254 3706 if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3255 3707 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3256 3708 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3257 3709 add_x86_feature(featureset, X86FSET_SSE4A);
3258 3710 }
3259 3711
3260 3712 /*
3261 3713 * It's really tricky to support syscall/sysret in
3262 3714 * the i386 kernel; we rely on sysenter/sysexit
3263 3715 * instead. In the amd64 kernel, things are -way-
3264 3716 * better.
3265 3717 */
3266 3718 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3267 3719 add_x86_feature(featureset, X86FSET_ASYSC);
3268 3720 }
3269 3721
3270 3722 /*
3271 3723 * While we're thinking about system calls, note
3272 3724 * that AMD processors don't support sysenter
3273 3725 * in long mode at all, so don't try to program them.
3274 3726 */
3275 3727 if (x86_vendor == X86_VENDOR_AMD) {
3276 3728 remove_x86_feature(featureset, X86FSET_SEP);
3277 3729 }
3278 3730
3279 3731 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3280 3732 add_x86_feature(featureset, X86FSET_TSCP);
3281 3733 }
3282 3734
3283 3735 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3284 3736 add_x86_feature(featureset, X86FSET_SVM);
3285 3737 }
3286 3738
3287 3739 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3288 3740 add_x86_feature(featureset, X86FSET_TOPOEXT);
3289 3741 }
3290 3742
3291 3743 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3292 3744 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3293 3745 }
3294 3746
3295 3747 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3296 3748 add_x86_feature(featureset, X86FSET_XOP);
3297 3749 }
3298 3750
3299 3751 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3300 3752 add_x86_feature(featureset, X86FSET_FMA4);
3301 3753 }
3302 3754
3303 3755 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3304 3756 add_x86_feature(featureset, X86FSET_TBM);
3305 3757 }
3306 3758
3307 3759 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3308 3760 add_x86_feature(featureset, X86FSET_MONITORX);
3309 3761 }
3310 3762 break;
3311 3763 default:
3312 3764 break;
3313 3765 }
3314 3766
3315 3767 /*
3316 3768 * Get CPUID data about processor cores and hyperthreads.
3317 3769 */
3318 3770 switch (cpi->cpi_vendor) {
3319 3771 case X86_VENDOR_Intel:
3320 3772 if (cpi->cpi_maxeax >= 4) {
3321 3773 cp = &cpi->cpi_std[4];
3322 3774 cp->cp_eax = 4;
3323 3775 cp->cp_ecx = 0;
3324 3776 (void) __cpuid_insn(cp);
3325 3777 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3326 3778 }
3327 3779 /*FALLTHROUGH*/
3328 3780 case X86_VENDOR_AMD:
3329 3781 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3330 3782 break;
3331 3783 cp = &cpi->cpi_extd[8];
3332 3784 cp->cp_eax = CPUID_LEAF_EXT_8;
3333 3785 (void) __cpuid_insn(cp);
3334 3786 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3335 3787 cp);
3336 3788
3337 3789 /*
3338 3790 * AMD uses ebx for some extended functions.
3339 3791 */
3340 3792 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3341 3793 /*
3342 3794 * While we're here, check for the AMD "Error
3343 3795 * Pointer Zero/Restore" feature. This can be
3344 3796 * used to setup the FP save handlers
3345 3797 * appropriately.
3346 3798 */
3347 3799 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3348 3800 cpi->cpi_fp_amd_save = 0;
3349 3801 } else {
3350 3802 cpi->cpi_fp_amd_save = 1;
3351 3803 }
3352 3804
3353 3805 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3354 3806 add_x86_feature(featureset,
3355 3807 X86FSET_CLZERO);
3356 3808 }
3357 3809 }
3358 3810
3359 3811 /*
3360 3812 * Virtual and physical address limits from
3361 3813 * cpuid override previously guessed values.
3362 3814 */
3363 3815 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3364 3816 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3365 3817 break;
3366 3818 default:
3367 3819 break;
3368 3820 }
3369 3821
3370 3822 /*
3371 3823 * Get CPUID data about TSC Invariance in Deep C-State.
3372 3824 */
3373 3825 switch (cpi->cpi_vendor) {
3374 3826 case X86_VENDOR_Intel:
3375 3827 case X86_VENDOR_AMD:
3376 3828 if (cpi->cpi_maxeax >= 7) {
3377 3829 cp = &cpi->cpi_extd[7];
3378 3830 cp->cp_eax = 0x80000007;
3379 3831 cp->cp_ecx = 0;
3380 3832 (void) __cpuid_insn(cp);
3381 3833 }
3382 3834 break;
3383 3835 default:
3384 3836 break;
3385 3837 }
3386 3838 }
3387 3839
3388 3840 cpuid_pass1_topology(cpu, featureset);
3389 3841 cpuid_pass1_thermal(cpu, featureset);
3390 3842
3391 3843 /*
3392 3844 * Synthesize chip "revision" and socket type
3393 3845 */
3394 3846 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3395 3847 cpi->cpi_model, cpi->cpi_step);
3396 3848 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3397 3849 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3398 3850 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3399 3851 cpi->cpi_model, cpi->cpi_step);
3400 3852
3401 3853 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3402 3854 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3403 3855 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3404 3856 /* Special handling for AMD FP not necessary. */
3405 3857 cpi->cpi_fp_amd_save = 0;
3406 3858 } else {
3407 3859 cpi->cpi_fp_amd_save = 1;
3408 3860 }
3409 3861 }
3410 3862
3411 3863 /*
3412 3864 * Check the processor leaves that are used for security features.
3413 3865 */
3414 3866 cpuid_scan_security(cpu, featureset);
3415 3867
3416 3868 pass1_done:
3417 3869 cpi->cpi_pass = 1;
3418 3870 }
3419 3871
3420 3872 /*
3421 3873 * Make copies of the cpuid table entries we depend on, in
3422 3874 * part for ease of parsing now, in part so that we have only
3423 3875 * one place to correct any of it, in part for ease of
3424 3876 * later export to userland, and in part so we can look at
3425 3877 * this stuff in a crash dump.
3426 3878 */
3427 3879
3428 3880 /*ARGSUSED*/
3429 3881 void
3430 3882 cpuid_pass2(cpu_t *cpu)
3431 3883 {
3432 3884 uint_t n, nmax;
3433 3885 int i;
3434 3886 struct cpuid_regs *cp;
3435 3887 uint8_t *dp;
3436 3888 uint32_t *iptr;
3437 3889 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3438 3890
3439 3891 ASSERT(cpi->cpi_pass == 1);
3440 3892
3441 3893 if (cpi->cpi_maxeax < 1)
3442 3894 goto pass2_done;
3443 3895
3444 3896 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3445 3897 nmax = NMAX_CPI_STD;
3446 3898 /*
3447 3899 * (We already handled n == 0 and n == 1 in pass 1)
3448 3900 */
3449 3901 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3450 3902 /*
3451 3903 * leaves 6 and 7 were handled in pass 1
3452 3904 */
3453 3905 if (n == 6 || n == 7)
3454 3906 continue;
3455 3907
3456 3908 cp->cp_eax = n;
3457 3909
3458 3910 /*
3459 3911 * CPUID function 4 expects %ecx to be initialized
3460 3912 * with an index which indicates which cache to return
3461 3913 * information about. The OS is expected to call function 4
3462 3914 * with %ecx set to 0, 1, 2, ... until it returns with
3463 3915 * EAX[4:0] set to 0, which indicates there are no more
3464 3916 * caches.
3465 3917 *
3466 3918 * Here, populate cpi_std[4] with the information returned by
3467 3919 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3468 3920 * when dynamic memory allocation becomes available.
3469 3921 *
3470 3922 * Note: we need to explicitly initialize %ecx here, since
3471 3923 * function 4 may have been previously invoked.
3472 3924 */
3473 3925 if (n == 4)
3474 3926 cp->cp_ecx = 0;
3475 3927
3476 3928 (void) __cpuid_insn(cp);
3477 3929 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3478 3930 switch (n) {
3479 3931 case 2:
3480 3932 /*
3481 3933 * "the lower 8 bits of the %eax register
3482 3934 * contain a value that identifies the number
3483 3935 * of times the cpuid [instruction] has to be
3484 3936 * executed to obtain a complete image of the
3485 3937 * processor's caching systems."
3486 3938 *
3487 3939 * How *do* they make this stuff up?
3488 3940 */
3489 3941 cpi->cpi_ncache = sizeof (*cp) *
3490 3942 BITX(cp->cp_eax, 7, 0);
3491 3943 if (cpi->cpi_ncache == 0)
3492 3944 break;
3493 3945 cpi->cpi_ncache--; /* skip count byte */
3494 3946
3495 3947 /*
3496 3948 * Well, for now, rather than attempt to implement
3497 3949 * this slightly dubious algorithm, we just look
3498 3950 * at the first 15 ..
3499 3951 */
3500 3952 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3501 3953 cpi->cpi_ncache = sizeof (*cp) - 1;
3502 3954
3503 3955 dp = cpi->cpi_cacheinfo;
3504 3956 if (BITX(cp->cp_eax, 31, 31) == 0) {
3505 3957 uint8_t *p = (void *)&cp->cp_eax;
3506 3958 for (i = 1; i < 4; i++)
3507 3959 if (p[i] != 0)
3508 3960 *dp++ = p[i];
3509 3961 }
3510 3962 if (BITX(cp->cp_ebx, 31, 31) == 0) {
3511 3963 uint8_t *p = (void *)&cp->cp_ebx;
3512 3964 for (i = 0; i < 4; i++)
3513 3965 if (p[i] != 0)
3514 3966 *dp++ = p[i];
3515 3967 }
3516 3968 if (BITX(cp->cp_ecx, 31, 31) == 0) {
3517 3969 uint8_t *p = (void *)&cp->cp_ecx;
3518 3970 for (i = 0; i < 4; i++)
3519 3971 if (p[i] != 0)
3520 3972 *dp++ = p[i];
3521 3973 }
3522 3974 if (BITX(cp->cp_edx, 31, 31) == 0) {
3523 3975 uint8_t *p = (void *)&cp->cp_edx;
3524 3976 for (i = 0; i < 4; i++)
3525 3977 if (p[i] != 0)
3526 3978 *dp++ = p[i];
3527 3979 }
3528 3980 break;
3529 3981
3530 3982 case 3: /* Processor serial number, if PSN supported */
3531 3983 break;
3532 3984
3533 3985 case 4: /* Deterministic cache parameters */
3534 3986 break;
3535 3987
3536 3988 case 5: /* Monitor/Mwait parameters */
3537 3989 {
3538 3990 size_t mwait_size;
3539 3991
3540 3992 /*
3541 3993 * check cpi_mwait.support which was set in cpuid_pass1
3542 3994 */
3543 3995 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
3544 3996 break;
3545 3997
3546 3998 /*
3547 3999 			 * Protect ourselves from an insane mwait line size.
3548 4000 * Workaround for incomplete hardware emulator(s).
3549 4001 */
3550 4002 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
3551 4003 if (mwait_size < sizeof (uint32_t) ||
3552 4004 !ISP2(mwait_size)) {
3553 4005 #if DEBUG
3554 4006 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
3555 4007 "size %ld", cpu->cpu_id, (long)mwait_size);
3556 4008 #endif
3557 4009 break;
3558 4010 }
3559 4011
3560 4012 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
3561 4013 cpi->cpi_mwait.mon_max = mwait_size;
3562 4014 if (MWAIT_EXTENSION(cpi)) {
3563 4015 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
3564 4016 if (MWAIT_INT_ENABLE(cpi))
3565 4017 cpi->cpi_mwait.support |=
3566 4018 MWAIT_ECX_INT_ENABLE;
3567 4019 }
3568 4020 break;
3569 4021 }
3570 4022 default:
3571 4023 break;
3572 4024 }
3573 4025 }
3574 4026
3575 4027 /*
3576 4028 * XSAVE enumeration
3577 4029 */
3578 4030 if (cpi->cpi_maxeax >= 0xD) {
3579 4031 struct cpuid_regs regs;
3580 4032 boolean_t cpuid_d_valid = B_TRUE;
3581 4033
3582 4034 		cp = &regs;
3583 4035 cp->cp_eax = 0xD;
3584 4036 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
3585 4037
3586 4038 (void) __cpuid_insn(cp);
3587 4039
3588 4040 /*
3589 4041 * Sanity checks for debug
3590 4042 */
3591 4043 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
3592 4044 (cp->cp_eax & XFEATURE_SSE) == 0) {
3593 4045 cpuid_d_valid = B_FALSE;
3594 4046 }
3595 4047
3596 4048 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
3597 4049 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
3598 4050 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
3599 4051
3600 4052 /*
3601 4053 * If the hw supports AVX, get the size and offset in the save
3602 4054 * area for the ymm state.
3603 4055 */
3604 4056 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
3605 4057 cp->cp_eax = 0xD;
3606 4058 cp->cp_ecx = 2;
3607 4059 cp->cp_edx = cp->cp_ebx = 0;
3608 4060
3609 4061 (void) __cpuid_insn(cp);
3610 4062
3611 4063 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
3612 4064 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
3613 4065 cpuid_d_valid = B_FALSE;
3614 4066 }
3615 4067
3616 4068 cpi->cpi_xsave.ymm_size = cp->cp_eax;
3617 4069 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
3618 4070 }
3619 4071
3620 4072 /*
3621 4073 * If the hw supports MPX, get the size and offset in the
3622 4074 * save area for BNDREGS and BNDCSR.
3623 4075 */
3624 4076 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
3625 4077 cp->cp_eax = 0xD;
3626 4078 cp->cp_ecx = 3;
3627 4079 cp->cp_edx = cp->cp_ebx = 0;
3628 4080
3629 4081 (void) __cpuid_insn(cp);
3630 4082
3631 4083 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
3632 4084 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
3633 4085
3634 4086 cp->cp_eax = 0xD;
3635 4087 cp->cp_ecx = 4;
3636 4088 cp->cp_edx = cp->cp_ebx = 0;
3637 4089
3638 4090 (void) __cpuid_insn(cp);
3639 4091
3640 4092 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
3641 4093 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
3642 4094 }
3643 4095
3644 4096 /*
3645 4097 * If the hw supports AVX512, get the size and offset in the
3646 4098 * save area for the opmask registers and zmm state.
3647 4099 */
3648 4100 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
3649 4101 cp->cp_eax = 0xD;
3650 4102 cp->cp_ecx = 5;
3651 4103 cp->cp_edx = cp->cp_ebx = 0;
3652 4104
3653 4105 (void) __cpuid_insn(cp);
3654 4106
3655 4107 cpi->cpi_xsave.opmask_size = cp->cp_eax;
3656 4108 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
3657 4109
3658 4110 cp->cp_eax = 0xD;
3659 4111 cp->cp_ecx = 6;
3660 4112 cp->cp_edx = cp->cp_ebx = 0;
3661 4113
3662 4114 (void) __cpuid_insn(cp);
3663 4115
3664 4116 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
3665 4117 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
3666 4118
3667 4119 cp->cp_eax = 0xD;
3668 4120 cp->cp_ecx = 7;
3669 4121 cp->cp_edx = cp->cp_ebx = 0;
3670 4122
3671 4123 (void) __cpuid_insn(cp);
3672 4124
3673 4125 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
3674 4126 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
3675 4127 }
3676 4128
3677 4129 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
3678 4130 xsave_state_size = 0;
3679 4131 } else if (cpuid_d_valid) {
3680 4132 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
3681 4133 } else {
3682 4134 /* Broken CPUID 0xD, probably in HVM */
3683 4135 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
3684 4136 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
3685 4137 ", ymm_size = %d, ymm_offset = %d\n",
3686 4138 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
3687 4139 cpi->cpi_xsave.xsav_hw_features_high,
3688 4140 (int)cpi->cpi_xsave.xsav_max_size,
3689 4141 (int)cpi->cpi_xsave.ymm_size,
3690 4142 (int)cpi->cpi_xsave.ymm_offset);
3691 4143
3692 4144 if (xsave_state_size != 0) {
3693 4145 /*
3694 4146 * This must be a non-boot CPU. We cannot
3695 4147 * continue, because the boot cpu has already
3696 4148 * enabled XSAVE.
3697 4149 */
3698 4150 ASSERT(cpu->cpu_id != 0);
3699 4151 cmn_err(CE_PANIC, "cpu%d: we have already "
3700 4152 "enabled XSAVE on boot cpu, cannot "
3701 4153 "continue.", cpu->cpu_id);
3702 4154 } else {
3703 4155 /*
3704 4156 * If we reached here on the boot CPU, it's also
3705 4157 * almost certain that we'll reach here on the
3706 4158 * non-boot CPUs. When we're here on a boot CPU
3707 4159 * we should disable the feature; on a non-boot
3708 4160 * CPU we need to confirm that we have.
3709 4161 */
3710 4162 if (cpu->cpu_id == 0) {
3711 4163 remove_x86_feature(x86_featureset,
3712 4164 X86FSET_XSAVE);
3713 4165 remove_x86_feature(x86_featureset,
3714 4166 X86FSET_AVX);
3715 4167 remove_x86_feature(x86_featureset,
3716 4168 X86FSET_F16C);
3717 4169 remove_x86_feature(x86_featureset,
3718 4170 X86FSET_BMI1);
3719 4171 remove_x86_feature(x86_featureset,
3720 4172 X86FSET_BMI2);
3721 4173 remove_x86_feature(x86_featureset,
3722 4174 X86FSET_FMA);
3723 4175 remove_x86_feature(x86_featureset,
3724 4176 X86FSET_AVX2);
3725 4177 remove_x86_feature(x86_featureset,
3726 4178 X86FSET_MPX);
3727 4179 remove_x86_feature(x86_featureset,
3728 4180 X86FSET_AVX512F);
3729 4181 remove_x86_feature(x86_featureset,
3730 4182 X86FSET_AVX512DQ);
3731 4183 remove_x86_feature(x86_featureset,
3732 4184 X86FSET_AVX512PF);
3733 4185 remove_x86_feature(x86_featureset,
3734 4186 X86FSET_AVX512ER);
3735 4187 remove_x86_feature(x86_featureset,
3736 4188 X86FSET_AVX512CD);
3737 4189 remove_x86_feature(x86_featureset,
3738 4190 X86FSET_AVX512BW);
3739 4191 remove_x86_feature(x86_featureset,
3740 4192 X86FSET_AVX512VL);
3741 4193 remove_x86_feature(x86_featureset,
3742 4194 X86FSET_AVX512FMA);
3743 4195 remove_x86_feature(x86_featureset,
3744 4196 X86FSET_AVX512VBMI);
3745 4197 remove_x86_feature(x86_featureset,
3746 4198 X86FSET_AVX512VNNI);
3747 4199 remove_x86_feature(x86_featureset,
3748 4200 X86FSET_AVX512VPOPCDQ);
3749 4201 remove_x86_feature(x86_featureset,
3750 4202 X86FSET_AVX512NNIW);
3751 4203 remove_x86_feature(x86_featureset,
3752 4204 X86FSET_AVX512FMAPS);
3753 4205
3754 4206 CPI_FEATURES_ECX(cpi) &=
3755 4207 ~CPUID_INTC_ECX_XSAVE;
3756 4208 CPI_FEATURES_ECX(cpi) &=
3757 4209 ~CPUID_INTC_ECX_AVX;
3758 4210 CPI_FEATURES_ECX(cpi) &=
3759 4211 ~CPUID_INTC_ECX_F16C;
3760 4212 CPI_FEATURES_ECX(cpi) &=
3761 4213 ~CPUID_INTC_ECX_FMA;
3762 4214 CPI_FEATURES_7_0_EBX(cpi) &=
3763 4215 ~CPUID_INTC_EBX_7_0_BMI1;
3764 4216 CPI_FEATURES_7_0_EBX(cpi) &=
3765 4217 ~CPUID_INTC_EBX_7_0_BMI2;
3766 4218 CPI_FEATURES_7_0_EBX(cpi) &=
3767 4219 ~CPUID_INTC_EBX_7_0_AVX2;
3768 4220 CPI_FEATURES_7_0_EBX(cpi) &=
3769 4221 ~CPUID_INTC_EBX_7_0_MPX;
3770 4222 CPI_FEATURES_7_0_EBX(cpi) &=
3771 4223 ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3772 4224
3773 4225 CPI_FEATURES_7_0_ECX(cpi) &=
3774 4226 ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3775 4227
3776 4228 CPI_FEATURES_7_0_EDX(cpi) &=
3777 4229 ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3778 4230
3779 4231 xsave_force_disable = B_TRUE;
3780 4232 } else {
3781 4233 VERIFY(is_x86_feature(x86_featureset,
3782 4234 X86FSET_XSAVE) == B_FALSE);
3783 4235 }
3784 4236 }
3785 4237 }
3786 4238 }
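/*
 * Illustrative sketch (not part of this file, shown out of line): the
 * leaf-0xD walk performed above, reproduced from userland with GCC's
 * <cpuid.h> rather than the kernel's __cpuid_insn().  Each sub-leaf n >= 2
 * reports the size (%eax) and offset (%ebx) of one extended state component
 * inside the XSAVE area, which is what the code above caches in cpi_xsave
 * (2 = YMM, 3/4 = MPX, 5/6/7 = AVX-512).  Validity checks against the
 * sub-leaf-0 feature mask are omitted for brevity.
 */
#include <stdio.h>
#include <cpuid.h>

static void
print_xsave_component(unsigned int n)
{
	unsigned int eax, ebx, ecx, edx;

	/* __get_cpuid_count() returns 0 if leaf 0xD is not supported. */
	if (__get_cpuid_count(0xD, n, &eax, &ebx, &ecx, &edx) == 0)
		return;
	(void) printf("component %u: %u bytes at offset %u\n", n, eax, ebx);
}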
3787 4239
3788 4240
3789 4241 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
3790 4242 goto pass2_done;
3791 4243
3792 4244 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
3793 4245 nmax = NMAX_CPI_EXTD;
3794 4246 /*
3795 4247 * Copy the extended properties, fixing them as we go.
3796 4248 * (We already handled n == 0 and n == 1 in pass 1)
3797 4249 */
3798 4250 iptr = (void *)cpi->cpi_brandstr;
3799 4251 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
3800 4252 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
3801 4253 (void) __cpuid_insn(cp);
3802 4254 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
3803 4255 cp);
3804 4256 switch (n) {
3805 4257 case 2:
3806 4258 case 3:
3807 4259 case 4:
3808 4260 /*
3809 4261 * Extract the brand string
3810 4262 */
3811 4263 *iptr++ = cp->cp_eax;
3812 4264 *iptr++ = cp->cp_ebx;
3813 4265 *iptr++ = cp->cp_ecx;
3814 4266 *iptr++ = cp->cp_edx;
3815 4267 break;
3816 4268 case 5:
3817 4269 switch (cpi->cpi_vendor) {
3818 4270 case X86_VENDOR_AMD:
3819 4271 /*
3820 4272 * The Athlon and Duron were the first
3821 4273 * parts to report the sizes of the
3822 4274 * TLB for large pages. Before then,
3823 4275 * we don't trust the data.
3824 4276 */
3825 4277 if (cpi->cpi_family < 6 ||
3826 4278 (cpi->cpi_family == 6 &&
3827 4279 cpi->cpi_model < 1))
3828 4280 cp->cp_eax = 0;
3829 4281 break;
3830 4282 default:
3831 4283 break;
3832 4284 }
3833 4285 break;
3834 4286 case 6:
3835 4287 switch (cpi->cpi_vendor) {
3836 4288 case X86_VENDOR_AMD:
3837 4289 /*
3838 4290 * The Athlon and Duron were the first
3839 4291 * AMD parts with L2 TLB's.
3840 4292 * Before then, don't trust the data.
3841 4293 */
3842 4294 if (cpi->cpi_family < 6 ||
3843 4295 cpi->cpi_family == 6 &&
3844 4296 cpi->cpi_model < 1)
3845 4297 cp->cp_eax = cp->cp_ebx = 0;
3846 4298 /*
3847 4299 * AMD Duron rev A0 reports L2
3848 4300 * cache size incorrectly as 1K
3849 4301 * when it is really 64K
3850 4302 */
3851 4303 if (cpi->cpi_family == 6 &&
3852 4304 cpi->cpi_model == 3 &&
3853 4305 cpi->cpi_step == 0) {
3854 4306 cp->cp_ecx &= 0xffff;
3855 4307 cp->cp_ecx |= 0x400000;
3856 4308 }
3857 4309 break;
3858 4310 case X86_VENDOR_Cyrix: /* VIA C3 */
3859 4311 /*
3860 4312 * VIA C3 processors are a bit messed
3861 4313 * up w.r.t. encoding cache sizes in %ecx
3862 4314 */
3863 4315 if (cpi->cpi_family != 6)
3864 4316 break;
3865 4317 /*
3866 4318 * model 7 and 8 were incorrectly encoded
3867 4319 *
3868 4320 * xxx is model 8 really broken?
3869 4321 */
3870 4322 if (cpi->cpi_model == 7 ||
3871 4323 cpi->cpi_model == 8)
3872 4324 cp->cp_ecx =
3873 4325 BITX(cp->cp_ecx, 31, 24) << 16 |
3874 4326 BITX(cp->cp_ecx, 23, 16) << 12 |
3875 4327 BITX(cp->cp_ecx, 15, 8) << 8 |
3876 4328 BITX(cp->cp_ecx, 7, 0);
3877 4329 /*
3878 4330 * model 9 stepping 1 has wrong associativity
3879 4331 */
3880 4332 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
3881 4333 cp->cp_ecx |= 8 << 12;
3882 4334 break;
3883 4335 case X86_VENDOR_Intel:
3884 4336 /*
3885 4337 * Extended L2 Cache features function.
3886 4338 * First appeared on Prescott.
3887 4339 */
3888 4340 default:
3889 4341 break;
3890 4342 }
3891 4343 break;
3892 4344 default:
3893 4345 break;
3894 4346 }
3895 4347 }
3896 4348
3897 4349 pass2_done:
3898 4350 cpi->cpi_pass = 2;
3899 4351 }
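/*
 * Illustrative sketch (not part of the kernel build): the brand-string
 * extraction above, reproduced from userland with GCC's <cpuid.h>.  Leaves
 * 0x80000002-0x80000004 each return 16 ASCII bytes in %eax..%edx; laid out
 * back to back they form the 48-byte processor brand string that pass 2
 * stores in cpi_brandstr.
 */
#include <stdio.h>
#include <string.h>
#include <cpuid.h>

static void
print_brand_string(void)
{
	unsigned int regs[12];
	char brand[49];
	unsigned int i;

	for (i = 0; i < 3; i++) {
		if (__get_cpuid(0x80000002 + i, &regs[i * 4 + 0],
		    &regs[i * 4 + 1], &regs[i * 4 + 2], &regs[i * 4 + 3]) == 0)
			return;	/* extended brand string leaves unsupported */
	}
	(void) memcpy(brand, regs, sizeof (regs));
	brand[48] = '\0';
	(void) printf("%s\n", brand);
}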
3900 4352
3901 4353 static const char *
3902 4354 intel_cpubrand(const struct cpuid_info *cpi)
3903 4355 {
3904 4356 int i;
3905 4357
3906 4358 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
3907 4359 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
3908 4360 return ("i486");
3909 4361
3910 4362 switch (cpi->cpi_family) {
3911 4363 case 5:
3912 4364 return ("Intel Pentium(r)");
3913 4365 case 6:
3914 4366 switch (cpi->cpi_model) {
3915 4367 uint_t celeron, xeon;
3916 4368 const struct cpuid_regs *cp;
3917 4369 case 0:
3918 4370 case 1:
3919 4371 case 2:
3920 4372 return ("Intel Pentium(r) Pro");
3921 4373 case 3:
3922 4374 case 4:
3923 4375 return ("Intel Pentium(r) II");
3924 4376 case 6:
3925 4377 return ("Intel Celeron(r)");
3926 4378 case 5:
3927 4379 case 7:
3928 4380 celeron = xeon = 0;
3929 4381 cp = &cpi->cpi_std[2]; /* cache info */
3930 4382
3931 4383 for (i = 1; i < 4; i++) {
3932 4384 uint_t tmp;
3933 4385
3934 4386 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
3935 4387 if (tmp == 0x40)
3936 4388 celeron++;
3937 4389 if (tmp >= 0x44 && tmp <= 0x45)
3938 4390 xeon++;
3939 4391 }
3940 4392
3941 4393 for (i = 0; i < 2; i++) {
3942 4394 uint_t tmp;
3943 4395
3944 4396 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
3945 4397 if (tmp == 0x40)
3946 4398 celeron++;
3947 4399 else if (tmp >= 0x44 && tmp <= 0x45)
3948 4400 xeon++;
3949 4401 }
3950 4402
3951 4403 for (i = 0; i < 4; i++) {
3952 4404 uint_t tmp;
3953 4405
3954 4406 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
3955 4407 if (tmp == 0x40)
3956 4408 celeron++;
3957 4409 else if (tmp >= 0x44 && tmp <= 0x45)
3958 4410 xeon++;
3959 4411 }
3960 4412
3961 4413 for (i = 0; i < 4; i++) {
3962 4414 uint_t tmp;
3963 4415
3964 4416 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
3965 4417 if (tmp == 0x40)
3966 4418 celeron++;
3967 4419 else if (tmp >= 0x44 && tmp <= 0x45)
3968 4420 xeon++;
3969 4421 }
3970 4422
3971 4423 if (celeron)
3972 4424 return ("Intel Celeron(r)");
3973 4425 if (xeon)
3974 4426 return (cpi->cpi_model == 5 ?
3975 4427 "Intel Pentium(r) II Xeon(tm)" :
3976 4428 "Intel Pentium(r) III Xeon(tm)");
3977 4429 return (cpi->cpi_model == 5 ?
3978 4430 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
3979 4431 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
3980 4432 default:
3981 4433 break;
3982 4434 }
3983 4435 default:
3984 4436 break;
3985 4437 }
3986 4438
3987 4439 /* BrandID is present if the field is nonzero */
3988 4440 if (cpi->cpi_brandid != 0) {
3989 4441 static const struct {
3990 4442 uint_t bt_bid;
3991 4443 const char *bt_str;
3992 4444 } brand_tbl[] = {
3993 4445 { 0x1, "Intel(r) Celeron(r)" },
3994 4446 { 0x2, "Intel(r) Pentium(r) III" },
3995 4447 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
3996 4448 { 0x4, "Intel(r) Pentium(r) III" },
3997 4449 { 0x6, "Mobile Intel(r) Pentium(r) III" },
3998 4450 { 0x7, "Mobile Intel(r) Celeron(r)" },
3999 4451 { 0x8, "Intel(r) Pentium(r) 4" },
4000 4452 { 0x9, "Intel(r) Pentium(r) 4" },
4001 4453 { 0xa, "Intel(r) Celeron(r)" },
4002 4454 { 0xb, "Intel(r) Xeon(tm)" },
4003 4455 { 0xc, "Intel(r) Xeon(tm) MP" },
4004 4456 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
4005 4457 { 0xf, "Mobile Intel(r) Celeron(r)" },
4006 4458 { 0x11, "Mobile Genuine Intel(r)" },
4007 4459 { 0x12, "Intel(r) Celeron(r) M" },
4008 4460 { 0x13, "Mobile Intel(r) Celeron(r)" },
4009 4461 { 0x14, "Intel(r) Celeron(r)" },
4010 4462 { 0x15, "Mobile Genuine Intel(r)" },
4011 4463 { 0x16, "Intel(r) Pentium(r) M" },
4012 4464 { 0x17, "Mobile Intel(r) Celeron(r)" }
4013 4465 };
4014 4466 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4015 4467 uint_t sgn;
4016 4468
4017 4469 sgn = (cpi->cpi_family << 8) |
4018 4470 (cpi->cpi_model << 4) | cpi->cpi_step;
4019 4471
4020 4472 for (i = 0; i < btblmax; i++)
4021 4473 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4022 4474 break;
4023 4475 if (i < btblmax) {
4024 4476 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4025 4477 return ("Intel(r) Celeron(r)");
4026 4478 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4027 4479 return ("Intel(r) Xeon(tm) MP");
4028 4480 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4029 4481 return ("Intel(r) Xeon(tm)");
4030 4482 return (brand_tbl[i].bt_str);
4031 4483 }
4032 4484 }
4033 4485
4034 4486 return (NULL);
4035 4487 }
4036 4488
4037 4489 static const char *
4038 4490 amd_cpubrand(const struct cpuid_info *cpi)
4039 4491 {
4040 4492 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4041 4493 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4042 4494 return ("i486 compatible");
4043 4495
4044 4496 switch (cpi->cpi_family) {
4045 4497 case 5:
4046 4498 switch (cpi->cpi_model) {
4047 4499 case 0:
4048 4500 case 1:
4049 4501 case 2:
4050 4502 case 3:
4051 4503 case 4:
4052 4504 case 5:
4053 4505 return ("AMD-K5(r)");
4054 4506 case 6:
4055 4507 case 7:
4056 4508 return ("AMD-K6(r)");
4057 4509 case 8:
4058 4510 return ("AMD-K6(r)-2");
4059 4511 case 9:
4060 4512 return ("AMD-K6(r)-III");
4061 4513 default:
4062 4514 return ("AMD (family 5)");
4063 4515 }
4064 4516 case 6:
4065 4517 switch (cpi->cpi_model) {
4066 4518 case 1:
4067 4519 return ("AMD-K7(tm)");
4068 4520 case 0:
4069 4521 case 2:
4070 4522 case 4:
4071 4523 return ("AMD Athlon(tm)");
4072 4524 case 3:
4073 4525 case 7:
4074 4526 return ("AMD Duron(tm)");
4075 4527 case 6:
4076 4528 case 8:
4077 4529 case 10:
4078 4530 /*
4079 4531 * Use the L2 cache size to distinguish
4080 4532 */
4081 4533 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4082 4534 "AMD Athlon(tm)" : "AMD Duron(tm)");
4083 4535 default:
4084 4536 return ("AMD (family 6)");
4085 4537 }
4086 4538 default:
4087 4539 break;
4088 4540 }
4089 4541
4090 4542 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4091 4543 cpi->cpi_brandid != 0) {
4092 4544 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4093 4545 case 3:
4094 4546 return ("AMD Opteron(tm) UP 1xx");
4095 4547 case 4:
4096 4548 return ("AMD Opteron(tm) DP 2xx");
4097 4549 case 5:
4098 4550 return ("AMD Opteron(tm) MP 8xx");
4099 4551 default:
4100 4552 return ("AMD Opteron(tm)");
4101 4553 }
4102 4554 }
4103 4555
4104 4556 return (NULL);
4105 4557 }
4106 4558
4107 4559 static const char *
4108 4560 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4109 4561 {
4110 4562 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4111 4563 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4112 4564 type == X86_TYPE_CYRIX_486)
4113 4565 return ("i486 compatible");
4114 4566
4115 4567 switch (type) {
4116 4568 case X86_TYPE_CYRIX_6x86:
4117 4569 return ("Cyrix 6x86");
4118 4570 case X86_TYPE_CYRIX_6x86L:
4119 4571 return ("Cyrix 6x86L");
4120 4572 case X86_TYPE_CYRIX_6x86MX:
4121 4573 return ("Cyrix 6x86MX");
4122 4574 case X86_TYPE_CYRIX_GXm:
4123 4575 return ("Cyrix GXm");
4124 4576 case X86_TYPE_CYRIX_MediaGX:
4125 4577 return ("Cyrix MediaGX");
4126 4578 case X86_TYPE_CYRIX_MII:
4127 4579 return ("Cyrix M2");
4128 4580 case X86_TYPE_VIA_CYRIX_III:
4129 4581 return ("VIA Cyrix M3");
4130 4582 default:
4131 4583 /*
4132 4584 * Have another wild guess ..
4133 4585 */
4134 4586 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4135 4587 return ("Cyrix 5x86");
4136 4588 else if (cpi->cpi_family == 5) {
4137 4589 switch (cpi->cpi_model) {
4138 4590 case 2:
4139 4591 return ("Cyrix 6x86"); /* Cyrix M1 */
4140 4592 case 4:
4141 4593 return ("Cyrix MediaGX");
4142 4594 default:
4143 4595 break;
4144 4596 }
4145 4597 } else if (cpi->cpi_family == 6) {
4146 4598 switch (cpi->cpi_model) {
4147 4599 case 0:
4148 4600 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4149 4601 case 5:
4150 4602 case 6:
4151 4603 case 7:
4152 4604 case 8:
4153 4605 case 9:
4154 4606 return ("VIA C3");
4155 4607 default:
4156 4608 break;
4157 4609 }
4158 4610 }
4159 4611 break;
4160 4612 }
4161 4613 return (NULL);
4162 4614 }
4163 4615
4164 4616 /*
4165 4617 * This only gets called in the case that the CPU extended
4166 4618 * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
4167 4619 * aren't available, or contain null bytes for some reason.
4168 4620 */
4169 4621 static void
4170 4622 fabricate_brandstr(struct cpuid_info *cpi)
4171 4623 {
4172 4624 const char *brand = NULL;
4173 4625
4174 4626 switch (cpi->cpi_vendor) {
4175 4627 case X86_VENDOR_Intel:
4176 4628 brand = intel_cpubrand(cpi);
4177 4629 break;
4178 4630 case X86_VENDOR_AMD:
4179 4631 brand = amd_cpubrand(cpi);
4180 4632 break;
4181 4633 case X86_VENDOR_Cyrix:
4182 4634 brand = cyrix_cpubrand(cpi, x86_type);
4183 4635 break;
4184 4636 case X86_VENDOR_NexGen:
4185 4637 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4186 4638 brand = "NexGen Nx586";
4187 4639 break;
4188 4640 case X86_VENDOR_Centaur:
4189 4641 if (cpi->cpi_family == 5)
4190 4642 switch (cpi->cpi_model) {
4191 4643 case 4:
4192 4644 brand = "Centaur C6";
4193 4645 break;
4194 4646 case 8:
4195 4647 brand = "Centaur C2";
4196 4648 break;
4197 4649 case 9:
4198 4650 brand = "Centaur C3";
4199 4651 break;
4200 4652 default:
4201 4653 break;
4202 4654 }
4203 4655 break;
4204 4656 case X86_VENDOR_Rise:
4205 4657 if (cpi->cpi_family == 5 &&
4206 4658 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4207 4659 brand = "Rise mP6";
4208 4660 break;
4209 4661 case X86_VENDOR_SiS:
4210 4662 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4211 4663 brand = "SiS 55x";
4212 4664 break;
4213 4665 case X86_VENDOR_TM:
4214 4666 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4215 4667 brand = "Transmeta Crusoe TM3x00 or TM5x00";
4216 4668 break;
4217 4669 case X86_VENDOR_NSC:
4218 4670 case X86_VENDOR_UMC:
4219 4671 default:
4220 4672 break;
4221 4673 }
4222 4674 if (brand) {
4223 4675 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4224 4676 return;
4225 4677 }
4226 4678
4227 4679 /*
4228 4680 * If all else fails ...
4229 4681 */
4230 4682 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4231 4683 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4232 4684 cpi->cpi_model, cpi->cpi_step);
4233 4685 }
4234 4686
4235 4687 /*
4236 4688 * This routine is called just after kernel memory allocation
4237 4689 * becomes available on cpu0, and as part of mp_startup() on
4238 4690 * the other cpus.
4239 4691 *
4240 4692 * Fixup the brand string, and collect any information from cpuid
4241 4693 * that requires dynamically allocated storage to represent.
4242 4694 */
4243 4695 /*ARGSUSED*/
4244 4696 void
4245 4697 cpuid_pass3(cpu_t *cpu)
4246 4698 {
4247 4699 int i, max, shft, level, size;
4248 4700 struct cpuid_regs regs;
4249 4701 struct cpuid_regs *cp;
4250 4702 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4251 4703
4252 4704 ASSERT(cpi->cpi_pass == 2);
4253 4705
4254 4706 /*
4255 4707 * Deterministic cache parameters
4256 4708 *
4257 4709 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4258 4710 * values that are present are currently defined to be the same. This
4259 4711 * means we can use the same logic to parse it as long as we use the
4260 4712 * appropriate leaf to get the data. If you're updating this, make sure
4261 4713 * you're careful about which vendor supports which aspect.
4262 4714 *
4263 4715 * Take this opportunity to detect the number of threads sharing the
4264 4716 * last level cache, and construct a corresponding cache id. The
4265 4717 * respective cpuid_info members are initialized to the default case of
4266 4718 * "no last level cache sharing".
4267 4719 */
4268 4720 cpi->cpi_ncpu_shr_last_cache = 1;
4269 4721 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4270 4722
4271 4723 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4272 4724 (cpi->cpi_vendor == X86_VENDOR_AMD &&
4273 4725 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4274 4726 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4275 4727 uint32_t leaf;
4276 4728
4277 4729 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4278 4730 leaf = 4;
4279 4731 } else {
4280 4732 leaf = CPUID_LEAF_EXT_1d;
4281 4733 }
4282 4734
4283 4735 /*
4284 4736 * Find the # of elements (size) returned by the leaf and along
4285 4737 * the way detect last level cache sharing details.
4286 4738 */
4287 4739 bzero(&regs, sizeof (regs));
4288 4740 cp = &regs;
4289 4741 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4290 4742 cp->cp_eax = leaf;
4291 4743 cp->cp_ecx = i;
4292 4744
4293 4745 (void) __cpuid_insn(cp);
4294 4746
4295 4747 if (CPI_CACHE_TYPE(cp) == 0)
4296 4748 break;
4297 4749 level = CPI_CACHE_LVL(cp);
4298 4750 if (level > max) {
4299 4751 max = level;
4300 4752 cpi->cpi_ncpu_shr_last_cache =
4301 4753 CPI_NTHR_SHR_CACHE(cp) + 1;
4302 4754 }
4303 4755 }
4304 4756 cpi->cpi_cache_leaf_size = size = i;
4305 4757
4306 4758 /*
4307 4759 * Allocate the cpi_cache_leaves array. The first element
4308 4760 * references the regs for the corresponding leaf with %ecx set
4309 4761 * to 0. This was gathered in cpuid_pass2().
4310 4762 */
4311 4763 if (size > 0) {
4312 4764 cpi->cpi_cache_leaves =
4313 4765 kmem_alloc(size * sizeof (cp), KM_SLEEP);
4314 4766 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4315 4767 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4316 4768 } else {
4317 4769 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4318 4770 }
4319 4771
4320 4772 /*
4321 4773 * Allocate storage to hold the additional regs
4322 4774 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4323 4775 *
4324 4776 * The regs for the leaf, %ecx == 0 has already
4325 4777 * been allocated as indicated above.
4326 4778 */
4327 4779 for (i = 1; i < size; i++) {
4328 4780 cp = cpi->cpi_cache_leaves[i] =
4329 4781 kmem_zalloc(sizeof (regs), KM_SLEEP);
4330 4782 cp->cp_eax = leaf;
4331 4783 cp->cp_ecx = i;
4332 4784
4333 4785 (void) __cpuid_insn(cp);
4334 4786 }
4335 4787 }
4336 4788 /*
4337 4789 * Determine the number of bits needed to represent
4338 4790 * the number of CPUs sharing the last level cache.
4339 4791 *
4340 4792 * Shift off that number of bits from the APIC id to
4341 4793 * derive the cache id.
4342 4794 */
4343 4795 shft = 0;
4344 4796 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4345 4797 shft++;
4346 4798 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4347 4799 }
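/*
 * Illustrative sketch (not part of this file, shown out of line): the
 * cache-id derivation just above, in isolation.  The loop rounds the number
 * of CPUs sharing the last-level cache up to a power of two and shifts that
 * many bits off the APIC id, so e.g. with 6 sharers shft becomes 3 and APIC
 * ids 0-7 collapse to cache id 0, 8-15 to cache id 1, and so on.
 */
static unsigned int
last_lvl_cacheid(unsigned int apicid, unsigned int ncpu_shr_last_cache)
{
	unsigned int i, shft = 0;

	for (i = 1; i < ncpu_shr_last_cache; i <<= 1)
		shft++;
	return (apicid >> shft);
}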
4348 4800
4349 4801 /*
4350 4802 * Now fixup the brand string
4351 4803 */
4352 4804 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4353 4805 fabricate_brandstr(cpi);
4354 4806 } else {
4355 4807
4356 4808 /*
4357 4809 * If we successfully extracted a brand string from the cpuid
4358 4810 * instruction, clean it up by removing leading spaces and
4359 4811 * similar junk.
4360 4812 */
4361 4813 if (cpi->cpi_brandstr[0]) {
4362 4814 size_t maxlen = sizeof (cpi->cpi_brandstr);
4363 4815 char *src, *dst;
4364 4816
4365 4817 dst = src = (char *)cpi->cpi_brandstr;
4366 4818 src[maxlen - 1] = '\0';
4367 4819 /*
4368 4820 * strip leading spaces
4369 4821 */
4370 4822 while (*src == ' ')
4371 4823 src++;
4372 4824 /*
4373 4825 * Remove any "Genuine" or "Authentic" prefixes
4374 4826 */
4375 4827 if (strncmp(src, "Genuine ", 8) == 0)
4376 4828 src += 8;
4377 4829 if (strncmp(src, "Authentic ", 10) == 0)
4378 4830 src += 10;
4379 4831
4380 4832 /*
4381 4833 * Now do an in-place copy.
4382 4834 * Map (R) to (r) and (TM) to (tm).
4383 4835 * The era of teletypes is long gone, and there's
4384 4836 * -really- no need to shout.
4385 4837 */
4386 4838 while (*src != '\0') {
4387 4839 if (src[0] == '(') {
4388 4840 if (strncmp(src + 1, "R)", 2) == 0) {
4389 4841 (void) strncpy(dst, "(r)", 3);
4390 4842 src += 3;
4391 4843 dst += 3;
4392 4844 continue;
4393 4845 }
4394 4846 if (strncmp(src + 1, "TM)", 3) == 0) {
4395 4847 (void) strncpy(dst, "(tm)", 4);
4396 4848 src += 4;
4397 4849 dst += 4;
4398 4850 continue;
4399 4851 }
4400 4852 }
4401 4853 *dst++ = *src++;
4402 4854 }
4403 4855 *dst = '\0';
4404 4856
4405 4857 /*
4406 4858 * Finally, remove any trailing spaces
4407 4859 */
4408 4860 while (--dst > cpi->cpi_brandstr)
4409 4861 if (*dst == ' ')
4410 4862 *dst = '\0';
4411 4863 else
4412 4864 break;
4413 4865 } else
4414 4866 fabricate_brandstr(cpi);
4415 4867 }
4416 4868 cpi->cpi_pass = 3;
4417 4869 }
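/*
 * Illustrative sketch (not part of the kernel build): the in-place brand
 * string cleanup performed by cpuid_pass3() above, as a standalone helper.
 * It drops leading spaces and the "Genuine "/"Authentic " prefixes,
 * lower-cases the (R)/(TM) marks, and trims trailing spaces.
 */
#include <string.h>

static void
clean_brandstr(char *buf)
{
	char *src = buf, *dst = buf;

	while (*src == ' ')
		src++;
	if (strncmp(src, "Genuine ", 8) == 0)
		src += 8;
	if (strncmp(src, "Authentic ", 10) == 0)
		src += 10;

	while (*src != '\0') {
		if (strncmp(src, "(R)", 3) == 0) {
			(void) memcpy(dst, "(r)", 3);
			src += 3;
			dst += 3;
		} else if (strncmp(src, "(TM)", 4) == 0) {
			(void) memcpy(dst, "(tm)", 4);
			src += 4;
			dst += 4;
		} else {
			*dst++ = *src++;
		}
	}
	*dst = '\0';
	while (dst > buf && *--dst == ' ')	/* trim trailing spaces */
		*dst = '\0';
}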
4418 4870
4419 4871 /*
4420 4872 * This routine is called out of bind_hwcap() much later in the life
4421 4873 * of the kernel (post_startup()). The job of this routine is to resolve
4422 4874 * the hardware feature support and kernel support for those features into
4423 4875 * what we're actually going to tell applications via the aux vector.
4424 4876 */
4425 4877 void
4426 4878 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4427 4879 {
4428 4880 struct cpuid_info *cpi;
4429 4881 uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4430 4882
4431 4883 if (cpu == NULL)
4432 4884 cpu = CPU;
4433 4885 cpi = cpu->cpu_m.mcpu_cpi;
4434 4886
4435 4887 ASSERT(cpi->cpi_pass == 3);
4436 4888
4437 4889 if (cpi->cpi_maxeax >= 1) {
4438 4890 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4439 4891 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4440 4892 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4441 4893
4442 4894 *edx = CPI_FEATURES_EDX(cpi);
4443 4895 *ecx = CPI_FEATURES_ECX(cpi);
4444 4896 *ebx = CPI_FEATURES_7_0_EBX(cpi);
4445 4897
4446 4898 /*
4447 4899 * [these require explicit kernel support]
4448 4900 */
4449 4901 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4450 4902 *edx &= ~CPUID_INTC_EDX_SEP;
4451 4903
4452 4904 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4453 4905 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4454 4906 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4455 4907 *edx &= ~CPUID_INTC_EDX_SSE2;
4456 4908
4457 4909 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4458 4910 *edx &= ~CPUID_INTC_EDX_HTT;
4459 4911
4460 4912 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4461 4913 *ecx &= ~CPUID_INTC_ECX_SSE3;
4462 4914
4463 4915 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4464 4916 *ecx &= ~CPUID_INTC_ECX_SSSE3;
4465 4917 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4466 4918 *ecx &= ~CPUID_INTC_ECX_SSE4_1;
4467 4919 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4468 4920 *ecx &= ~CPUID_INTC_ECX_SSE4_2;
4469 4921 if (!is_x86_feature(x86_featureset, X86FSET_AES))
4470 4922 *ecx &= ~CPUID_INTC_ECX_AES;
4471 4923 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4472 4924 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4473 4925 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4474 4926 *ecx &= ~(CPUID_INTC_ECX_XSAVE |
4475 4927 CPUID_INTC_ECX_OSXSAVE);
4476 4928 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4477 4929 *ecx &= ~CPUID_INTC_ECX_AVX;
4478 4930 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4479 4931 *ecx &= ~CPUID_INTC_ECX_F16C;
4480 4932 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4481 4933 *ecx &= ~CPUID_INTC_ECX_FMA;
4482 4934 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4483 4935 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4484 4936 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4485 4937 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4486 4938 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4487 4939 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4488 4940 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4489 4941 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4490 4942 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4491 4943 *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4492 4944
4493 4945 /*
4494 4946 * [no explicit support required beyond x87 fp context]
4495 4947 */
4496 4948 if (!fpu_exists)
4497 4949 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4498 4950
4499 4951 /*
4500 4952 * Now map the supported feature vector to things that we
4501 4953 * think userland will care about.
4502 4954 */
4503 4955 if (*edx & CPUID_INTC_EDX_SEP)
4504 4956 hwcap_flags |= AV_386_SEP;
4505 4957 if (*edx & CPUID_INTC_EDX_SSE)
4506 4958 hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4507 4959 if (*edx & CPUID_INTC_EDX_SSE2)
4508 4960 hwcap_flags |= AV_386_SSE2;
4509 4961 if (*ecx & CPUID_INTC_ECX_SSE3)
4510 4962 hwcap_flags |= AV_386_SSE3;
4511 4963 if (*ecx & CPUID_INTC_ECX_SSSE3)
4512 4964 hwcap_flags |= AV_386_SSSE3;
4513 4965 if (*ecx & CPUID_INTC_ECX_SSE4_1)
4514 4966 hwcap_flags |= AV_386_SSE4_1;
4515 4967 if (*ecx & CPUID_INTC_ECX_SSE4_2)
4516 4968 hwcap_flags |= AV_386_SSE4_2;
4517 4969 if (*ecx & CPUID_INTC_ECX_MOVBE)
4518 4970 hwcap_flags |= AV_386_MOVBE;
4519 4971 if (*ecx & CPUID_INTC_ECX_AES)
4520 4972 hwcap_flags |= AV_386_AES;
4521 4973 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
4522 4974 hwcap_flags |= AV_386_PCLMULQDQ;
4523 4975 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
4524 4976 (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
4525 4977 hwcap_flags |= AV_386_XSAVE;
4526 4978
4527 4979 if (*ecx & CPUID_INTC_ECX_AVX) {
4528 4980 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
4529 4981 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
4530 4982
4531 4983 hwcap_flags |= AV_386_AVX;
4532 4984 if (*ecx & CPUID_INTC_ECX_F16C)
4533 4985 hwcap_flags_2 |= AV_386_2_F16C;
4534 4986 if (*ecx & CPUID_INTC_ECX_FMA)
4535 4987 hwcap_flags_2 |= AV_386_2_FMA;
4536 4988
4537 4989 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
4538 4990 hwcap_flags_2 |= AV_386_2_BMI1;
4539 4991 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
4540 4992 hwcap_flags_2 |= AV_386_2_BMI2;
4541 4993 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
4542 4994 hwcap_flags_2 |= AV_386_2_AVX2;
4543 4995 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
4544 4996 hwcap_flags_2 |= AV_386_2_AVX512F;
4545 4997 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
4546 4998 hwcap_flags_2 |= AV_386_2_AVX512DQ;
4547 4999 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
4548 5000 hwcap_flags_2 |= AV_386_2_AVX512IFMA;
4549 5001 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
4550 5002 hwcap_flags_2 |= AV_386_2_AVX512PF;
4551 5003 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
4552 5004 hwcap_flags_2 |= AV_386_2_AVX512ER;
4553 5005 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
4554 5006 hwcap_flags_2 |= AV_386_2_AVX512CD;
4555 5007 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
4556 5008 hwcap_flags_2 |= AV_386_2_AVX512BW;
4557 5009 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
4558 5010 hwcap_flags_2 |= AV_386_2_AVX512VL;
4559 5011
4560 5012 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
4561 5013 hwcap_flags_2 |= AV_386_2_AVX512VBMI;
4562 5014 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
4563 5015 hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
4564 5016 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
4565 5017 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
4566 5018
4567 5019 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
4568 5020 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
4569 5021 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
4570 5022 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
4571 5023 }
4572 5024 }
4573 5025 if (*ecx & CPUID_INTC_ECX_VMX)
4574 5026 hwcap_flags |= AV_386_VMX;
4575 5027 if (*ecx & CPUID_INTC_ECX_POPCNT)
4576 5028 hwcap_flags |= AV_386_POPCNT;
4577 5029 if (*edx & CPUID_INTC_EDX_FPU)
4578 5030 hwcap_flags |= AV_386_FPU;
4579 5031 if (*edx & CPUID_INTC_EDX_MMX)
4580 5032 hwcap_flags |= AV_386_MMX;
4581 5033
4582 5034 if (*edx & CPUID_INTC_EDX_TSC)
4583 5035 hwcap_flags |= AV_386_TSC;
4584 5036 if (*edx & CPUID_INTC_EDX_CX8)
4585 5037 hwcap_flags |= AV_386_CX8;
4586 5038 if (*edx & CPUID_INTC_EDX_CMOV)
4587 5039 hwcap_flags |= AV_386_CMOV;
4588 5040 if (*ecx & CPUID_INTC_ECX_CX16)
4589 5041 hwcap_flags |= AV_386_CX16;
4590 5042
4591 5043 if (*ecx & CPUID_INTC_ECX_RDRAND)
4592 5044 hwcap_flags_2 |= AV_386_2_RDRAND;
4593 5045 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
4594 5046 hwcap_flags_2 |= AV_386_2_ADX;
4595 5047 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
4596 5048 hwcap_flags_2 |= AV_386_2_RDSEED;
4597 5049 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
4598 5050 hwcap_flags_2 |= AV_386_2_SHA;
4599 5051 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4600 5052 hwcap_flags_2 |= AV_386_2_FSGSBASE;
4601 5053 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
4602 5054 hwcap_flags_2 |= AV_386_2_CLWB;
4603 5055 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4604 5056 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
4605 5057
4606 5058 }
4607 5059 /*
4608 5060 * Check a few miscellaneous features.
4609 5061 */
4610 5062 if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
4611 5063 hwcap_flags_2 |= AV_386_2_CLZERO;
4612 5064
4613 5065 if (cpi->cpi_xmaxeax < 0x80000001)
4614 5066 goto pass4_done;
4615 5067
4616 5068 switch (cpi->cpi_vendor) {
4617 5069 struct cpuid_regs cp;
4618 5070 uint32_t *edx, *ecx;
4619 5071
4620 5072 case X86_VENDOR_Intel:
4621 5073 /*
4622 5074 * Seems like Intel duplicated what was necessary
4623 5075 * here to make the initial crop of 64-bit OSes work.
4624 5076 * Hopefully, those are the only "extended" bits
4625 5077 * they'll add.
4626 5078 */
4627 5079 /*FALLTHROUGH*/
4628 5080
4629 5081 case X86_VENDOR_AMD:
4630 5082 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
4631 5083 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
4632 5084
4633 5085 *edx = CPI_FEATURES_XTD_EDX(cpi);
4634 5086 *ecx = CPI_FEATURES_XTD_ECX(cpi);
4635 5087
4636 5088 /*
4637 5089 * [these features require explicit kernel support]
4638 5090 */
4639 5091 switch (cpi->cpi_vendor) {
4640 5092 case X86_VENDOR_Intel:
4641 5093 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
4642 5094 *edx &= ~CPUID_AMD_EDX_TSCP;
4643 5095 break;
4644 5096
4645 5097 case X86_VENDOR_AMD:
4646 5098 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
4647 5099 *edx &= ~CPUID_AMD_EDX_TSCP;
4648 5100 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
4649 5101 *ecx &= ~CPUID_AMD_ECX_SSE4A;
4650 5102 break;
4651 5103
4652 5104 default:
4653 5105 break;
4654 5106 }
4655 5107
4656 5108 /*
4657 5109 * [no explicit support required beyond
4658 5110 * x87 fp context and exception handlers]
4659 5111 */
4660 5112 if (!fpu_exists)
4661 5113 *edx &= ~(CPUID_AMD_EDX_MMXamd |
4662 5114 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
4663 5115
4664 5116 if (!is_x86_feature(x86_featureset, X86FSET_NX))
4665 5117 *edx &= ~CPUID_AMD_EDX_NX;
4666 5118 #if !defined(__amd64)
4667 5119 *edx &= ~CPUID_AMD_EDX_LM;
4668 5120 #endif
4669 5121 /*
4670 5122 * Now map the supported feature vector to
4671 5123 * things that we think userland will care about.
4672 5124 */
4673 5125 #if defined(__amd64)
4674 5126 if (*edx & CPUID_AMD_EDX_SYSC)
4675 5127 hwcap_flags |= AV_386_AMD_SYSC;
4676 5128 #endif
4677 5129 if (*edx & CPUID_AMD_EDX_MMXamd)
4678 5130 hwcap_flags |= AV_386_AMD_MMX;
4679 5131 if (*edx & CPUID_AMD_EDX_3DNow)
4680 5132 hwcap_flags |= AV_386_AMD_3DNow;
4681 5133 if (*edx & CPUID_AMD_EDX_3DNowx)
4682 5134 hwcap_flags |= AV_386_AMD_3DNowx;
4683 5135 if (*ecx & CPUID_AMD_ECX_SVM)
4684 5136 hwcap_flags |= AV_386_AMD_SVM;
4685 5137
4686 5138 switch (cpi->cpi_vendor) {
4687 5139 case X86_VENDOR_AMD:
4688 5140 if (*edx & CPUID_AMD_EDX_TSCP)
4689 5141 hwcap_flags |= AV_386_TSCP;
4690 5142 if (*ecx & CPUID_AMD_ECX_AHF64)
4691 5143 hwcap_flags |= AV_386_AHF;
4692 5144 if (*ecx & CPUID_AMD_ECX_SSE4A)
4693 5145 hwcap_flags |= AV_386_AMD_SSE4A;
4694 5146 if (*ecx & CPUID_AMD_ECX_LZCNT)
4695 5147 hwcap_flags |= AV_386_AMD_LZCNT;
4696 5148 if (*ecx & CPUID_AMD_ECX_MONITORX)
4697 5149 hwcap_flags_2 |= AV_386_2_MONITORX;
4698 5150 break;
4699 5151
4700 5152 case X86_VENDOR_Intel:
4701 5153 if (*edx & CPUID_AMD_EDX_TSCP)
4702 5154 hwcap_flags |= AV_386_TSCP;
4703 5155 if (*ecx & CPUID_AMD_ECX_LZCNT)
4704 5156 hwcap_flags |= AV_386_AMD_LZCNT;
4705 5157 /*
4706 5158 * Aarrgh.
4707 5159 * Intel uses a different bit in the same word.
4708 5160 */
4709 5161 if (*ecx & CPUID_INTC_ECX_AHF64)
4710 5162 hwcap_flags |= AV_386_AHF;
4711 5163 break;
4712 5164
4713 5165 default:
4714 5166 break;
4715 5167 }
4716 5168 break;
4717 5169
4718 5170 case X86_VENDOR_TM:
4719 5171 cp.cp_eax = 0x80860001;
4720 5172 (void) __cpuid_insn(&cp);
4721 5173 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
4722 5174 break;
4723 5175
4724 5176 default:
4725 5177 break;
4726 5178 }
4727 5179
4728 5180 pass4_done:
4729 5181 cpi->cpi_pass = 4;
4730 5182 if (hwcap_out != NULL) {
4731 5183 hwcap_out[0] = hwcap_flags;
4732 5184 hwcap_out[1] = hwcap_flags_2;
4733 5185 }
4734 5186 }
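/*
 * Illustrative sketch (not part of the kernel build; the helper name is
 * hypothetical): the two hwcap words assembled by cpuid_pass4() end up in
 * the aux vector, where userland can read them back with getisax(2).
 * AV_386_SSE4_2 comes from <sys/auxv_386.h>, which <sys/auxv.h> is assumed
 * to pull in on x86.
 */
#include <stdint.h>
#include <sys/auxv.h>

static int
have_sse4_2(void)
{
	uint32_t hw[2] = { 0, 0 };

	(void) getisax(hw, 2);
	return ((hw[0] & AV_386_SSE4_2) != 0);
}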
4735 5187
4736 5188
4737 5189 /*
4738 5190 * Simulate the cpuid instruction using the data we previously
4739 5191 * captured about this CPU. We try our best to return the truth
4740 5192 * about the hardware, independently of kernel support.
4741 5193 */
4742 5194 uint32_t
4743 5195 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
4744 5196 {
4745 5197 struct cpuid_info *cpi;
4746 5198 struct cpuid_regs *xcp;
4747 5199
4748 5200 if (cpu == NULL)
4749 5201 cpu = CPU;
4750 5202 cpi = cpu->cpu_m.mcpu_cpi;
4751 5203
4752 5204 ASSERT(cpuid_checkpass(cpu, 3));
4753 5205
4754 5206 /*
4755 5207 * CPUID data is cached in two separate places: cpi_std for standard
4756 5208 * CPUID leaves, and cpi_extd for extended CPUID leaves.
4757 5209 */
4758 5210 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
4759 5211 xcp = &cpi->cpi_std[cp->cp_eax];
4760 5212 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
4761 5213 cp->cp_eax <= cpi->cpi_xmaxeax &&
4762 5214 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
4763 5215 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
4764 5216 } else {
4765 5217 /*
4766 5218 * The caller is asking for data from an input parameter which
4767 5219 * the kernel has not cached. In this case we go fetch from
4768 5220 * the hardware and return the data directly to the user.
4769 5221 */
4770 5222 return (__cpuid_insn(cp));
4771 5223 }
4772 5224
4773 5225 cp->cp_eax = xcp->cp_eax;
4774 5226 cp->cp_ebx = xcp->cp_ebx;
4775 5227 cp->cp_ecx = xcp->cp_ecx;
4776 5228 cp->cp_edx = xcp->cp_edx;
4777 5229 return (cp->cp_eax);
4778 5230 }
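/*
 * Illustrative sketch (kernel context assumed; the helper name is
 * hypothetical): a typical caller of cpuid_insn().  Leaf 1 lies inside the
 * cached cpi_std range, so the data comes from the snapshot taken in the
 * earlier passes rather than from a fresh cpuid instruction.
 */
static uint32_t
example_std_leaf1_edx(cpu_t *cpu)
{
	struct cpuid_regs regs = { 0 };

	regs.cp_eax = 1;		/* standard feature leaf */
	(void) cpuid_insn(cpu, &regs);
	return (regs.cp_edx);
}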
4779 5231
4780 5232 int
4781 5233 cpuid_checkpass(cpu_t *cpu, int pass)
4782 5234 {
4783 5235 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
4784 5236 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
4785 5237 }
4786 5238
4787 5239 int
4788 5240 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
4789 5241 {
4790 5242 ASSERT(cpuid_checkpass(cpu, 3));
4791 5243
4792 5244 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
4793 5245 }
4794 5246
4795 5247 int
4796 5248 cpuid_is_cmt(cpu_t *cpu)
4797 5249 {
4798 5250 if (cpu == NULL)
4799 5251 cpu = CPU;
4800 5252
4801 5253 ASSERT(cpuid_checkpass(cpu, 1));
4802 5254
4803 5255 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
4804 5256 }
4805 5257
4806 5258 /*
4807 5259 * AMD and Intel both implement the 64-bit variant of the syscall
4808 5260 * instruction (syscallq), so if there's -any- support for syscall,
4809 5261 * cpuid currently says "yes, we support this".
4810 5262 *
4811 5263 * However, Intel decided to -not- implement the 32-bit variant of the
4812 5264 * syscall instruction, so we provide a predicate to allow our caller
4813 5265 * to test that subtlety here.
4814 5266 *
4815 5267 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
4816 5268 * even in the case where the hardware would in fact support it.
4817 5269 */
4818 5270 /*ARGSUSED*/
4819 5271 int
4820 5272 cpuid_syscall32_insn(cpu_t *cpu)
4821 5273 {
4822 5274 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
4823 5275
4824 5276 #if !defined(__xpv)
4825 5277 if (cpu == NULL)
4826 5278 cpu = CPU;
4827 5279
4828 5280 /*CSTYLED*/
4829 5281 {
4830 5282 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4831 5283
4832 5284 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4833 5285 cpi->cpi_xmaxeax >= 0x80000001 &&
4834 5286 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
4835 5287 return (1);
4836 5288 }
4837 5289 #endif
4838 5290 return (0);
4839 5291 }
4840 5292
4841 5293 int
4842 5294 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
4843 5295 {
4844 5296 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4845 5297
4846 5298 static const char fmt[] =
4847 5299 "x86 (%s %X family %d model %d step %d clock %d MHz)";
4848 5300 static const char fmt_ht[] =
4849 5301 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
4850 5302
4851 5303 ASSERT(cpuid_checkpass(cpu, 1));
4852 5304
4853 5305 if (cpuid_is_cmt(cpu))
4854 5306 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
4855 5307 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
4856 5308 cpi->cpi_family, cpi->cpi_model,
4857 5309 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
4858 5310 return (snprintf(s, n, fmt,
4859 5311 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
4860 5312 cpi->cpi_family, cpi->cpi_model,
4861 5313 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
4862 5314 }
4863 5315
4864 5316 const char *
4865 5317 cpuid_getvendorstr(cpu_t *cpu)
4866 5318 {
4867 5319 ASSERT(cpuid_checkpass(cpu, 1));
4868 5320 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
4869 5321 }
4870 5322
4871 5323 uint_t
4872 5324 cpuid_getvendor(cpu_t *cpu)
4873 5325 {
4874 5326 ASSERT(cpuid_checkpass(cpu, 1));
4875 5327 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
4876 5328 }
4877 5329
4878 5330 uint_t
4879 5331 cpuid_getfamily(cpu_t *cpu)
4880 5332 {
4881 5333 ASSERT(cpuid_checkpass(cpu, 1));
4882 5334 return (cpu->cpu_m.mcpu_cpi->cpi_family);
4883 5335 }
4884 5336
4885 5337 uint_t
4886 5338 cpuid_getmodel(cpu_t *cpu)
4887 5339 {
4888 5340 ASSERT(cpuid_checkpass(cpu, 1));
4889 5341 return (cpu->cpu_m.mcpu_cpi->cpi_model);
4890 5342 }
4891 5343
4892 5344 uint_t
4893 5345 cpuid_get_ncpu_per_chip(cpu_t *cpu)
4894 5346 {
4895 5347 ASSERT(cpuid_checkpass(cpu, 1));
4896 5348 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
4897 5349 }
4898 5350
4899 5351 uint_t
4900 5352 cpuid_get_ncore_per_chip(cpu_t *cpu)
4901 5353 {
4902 5354 ASSERT(cpuid_checkpass(cpu, 1));
4903 5355 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
4904 5356 }
4905 5357
4906 5358 uint_t
4907 5359 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
4908 5360 {
4909 5361 ASSERT(cpuid_checkpass(cpu, 2));
4910 5362 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
4911 5363 }
4912 5364
4913 5365 id_t
4914 5366 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
4915 5367 {
4916 5368 ASSERT(cpuid_checkpass(cpu, 2));
4917 5369 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
4918 5370 }
4919 5371
4920 5372 uint_t
4921 5373 cpuid_getstep(cpu_t *cpu)
4922 5374 {
4923 5375 ASSERT(cpuid_checkpass(cpu, 1));
4924 5376 return (cpu->cpu_m.mcpu_cpi->cpi_step);
4925 5377 }
4926 5378
4927 5379 uint_t
4928 5380 cpuid_getsig(struct cpu *cpu)
4929 5381 {
4930 5382 ASSERT(cpuid_checkpass(cpu, 1));
4931 5383 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
4932 5384 }
4933 5385
4934 5386 uint32_t
4935 5387 cpuid_getchiprev(struct cpu *cpu)
4936 5388 {
4937 5389 ASSERT(cpuid_checkpass(cpu, 1));
4938 5390 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
4939 5391 }
4940 5392
4941 5393 const char *
4942 5394 cpuid_getchiprevstr(struct cpu *cpu)
4943 5395 {
4944 5396 ASSERT(cpuid_checkpass(cpu, 1));
4945 5397 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
4946 5398 }
4947 5399
4948 5400 uint32_t
4949 5401 cpuid_getsockettype(struct cpu *cpu)
4950 5402 {
4951 5403 ASSERT(cpuid_checkpass(cpu, 1));
4952 5404 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
4953 5405 }
4954 5406
4955 5407 const char *
4956 5408 cpuid_getsocketstr(cpu_t *cpu)
4957 5409 {
4958 5410 static const char *socketstr = NULL;
4959 5411 struct cpuid_info *cpi;
4960 5412
4961 5413 ASSERT(cpuid_checkpass(cpu, 1));
4962 5414 cpi = cpu->cpu_m.mcpu_cpi;
4963 5415
4964 5416 /* Assume that socket types are the same across the system */
4965 5417 if (socketstr == NULL)
4966 5418 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
4967 5419 cpi->cpi_model, cpi->cpi_step);
4968 5420
4969 5421
4970 5422 return (socketstr);
4971 5423 }
4972 5424
4973 5425 int
4974 5426 cpuid_get_chipid(cpu_t *cpu)
4975 5427 {
4976 5428 ASSERT(cpuid_checkpass(cpu, 1));
4977 5429
4978 5430 if (cpuid_is_cmt(cpu))
4979 5431 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
4980 5432 return (cpu->cpu_id);
4981 5433 }
4982 5434
4983 5435 id_t
4984 5436 cpuid_get_coreid(cpu_t *cpu)
4985 5437 {
4986 5438 ASSERT(cpuid_checkpass(cpu, 1));
4987 5439 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
4988 5440 }
4989 5441
4990 5442 int
4991 5443 cpuid_get_pkgcoreid(cpu_t *cpu)
4992 5444 {
4993 5445 ASSERT(cpuid_checkpass(cpu, 1));
4994 5446 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
4995 5447 }
4996 5448
4997 5449 int
4998 5450 cpuid_get_clogid(cpu_t *cpu)
4999 5451 {
5000 5452 ASSERT(cpuid_checkpass(cpu, 1));
5001 5453 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5002 5454 }
5003 5455
5004 5456 int
5005 5457 cpuid_get_cacheid(cpu_t *cpu)
5006 5458 {
5007 5459 ASSERT(cpuid_checkpass(cpu, 1));
5008 5460 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5009 5461 }
5010 5462
5011 5463 uint_t
5012 5464 cpuid_get_procnodeid(cpu_t *cpu)
5013 5465 {
5014 5466 ASSERT(cpuid_checkpass(cpu, 1));
5015 5467 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5016 5468 }
5017 5469
5018 5470 uint_t
5019 5471 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5020 5472 {
5021 5473 ASSERT(cpuid_checkpass(cpu, 1));
5022 5474 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5023 5475 }
5024 5476
5025 5477 uint_t
5026 5478 cpuid_get_compunitid(cpu_t *cpu)
5027 5479 {
5028 5480 ASSERT(cpuid_checkpass(cpu, 1));
5029 5481 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5030 5482 }
5031 5483
5032 5484 uint_t
5033 5485 cpuid_get_cores_per_compunit(cpu_t *cpu)
5034 5486 {
5035 5487 ASSERT(cpuid_checkpass(cpu, 1));
5036 5488 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5037 5489 }
5038 5490
5039 5491 /*ARGSUSED*/
5040 5492 int
5041 5493 cpuid_have_cr8access(cpu_t *cpu)
5042 5494 {
5043 5495 #if defined(__amd64)
5044 5496 return (1);
5045 5497 #else
5046 5498 struct cpuid_info *cpi;
5047 5499
5048 5500 ASSERT(cpu != NULL);
5049 5501 cpi = cpu->cpu_m.mcpu_cpi;
5050 5502 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5051 5503 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5052 5504 return (1);
5053 5505 return (0);
5054 5506 #endif
5055 5507 }
5056 5508
5057 5509 uint32_t
5058 5510 cpuid_get_apicid(cpu_t *cpu)
5059 5511 {
5060 5512 ASSERT(cpuid_checkpass(cpu, 1));
5061 5513 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5062 5514 return (UINT32_MAX);
5063 5515 } else {
5064 5516 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5065 5517 }
5066 5518 }
5067 5519
5068 5520 void
5069 5521 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5070 5522 {
5071 5523 struct cpuid_info *cpi;
5072 5524
5073 5525 if (cpu == NULL)
5074 5526 cpu = CPU;
5075 5527 cpi = cpu->cpu_m.mcpu_cpi;
5076 5528
5077 5529 ASSERT(cpuid_checkpass(cpu, 1));
5078 5530
5079 5531 if (pabits)
5080 5532 *pabits = cpi->cpi_pabits;
5081 5533 if (vabits)
5082 5534 *vabits = cpi->cpi_vabits;
5083 5535 }
5084 5536
5085 5537 size_t
5086 5538 cpuid_get_xsave_size()
5087 5539 {
5088 5540 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5089 5541 sizeof (struct xsave_state)));
5090 5542 }
5091 5543
5092 5544 /*
5093 5545 * Return true if the CPUs on this system require 'pointer clearing' for the
5094 5546 * floating point error pointer exception handling. In the past, this has been
5095 5547 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5096 5548 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5097 5549 * feature bit and is reflected in the cpi_fp_amd_save member.
5098 5550 */
5099 5551 boolean_t
5100 5552 cpuid_need_fp_excp_handling()
5101 5553 {
5102 5554 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5103 5555 cpuid_info0.cpi_fp_amd_save != 0);
5104 5556 }
5105 5557
5106 5558 /*
5107 5559 * Returns the number of data TLB entries for a corresponding
5108 5560 * pagesize. If it can't be computed, or isn't known, the
5109 5561 * routine returns zero. If you ask about an architecturally
5110 5562 * impossible pagesize, the routine will panic (so that the
5111 5563 * hat implementor knows that things are inconsistent.)
5112 5564 */
5113 5565 uint_t
5114 5566 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5115 5567 {
5116 5568 struct cpuid_info *cpi;
5117 5569 uint_t dtlb_nent = 0;
5118 5570
5119 5571 if (cpu == NULL)
5120 5572 cpu = CPU;
5121 5573 cpi = cpu->cpu_m.mcpu_cpi;
5122 5574
5123 5575 ASSERT(cpuid_checkpass(cpu, 1));
5124 5576
5125 5577 /*
5126 5578 * Check the L2 TLB info
5127 5579 */
5128 5580 if (cpi->cpi_xmaxeax >= 0x80000006) {
5129 5581 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5130 5582
5131 5583 switch (pagesize) {
5132 5584
5133 5585 case 4 * 1024:
5134 5586 /*
5135 5587 * All zero in the top 16 bits of the register
5136 5588 * indicates a unified TLB. Size is in low 16 bits.
5137 5589 */
5138 5590 if ((cp->cp_ebx & 0xffff0000) == 0)
5139 5591 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5140 5592 else
5141 5593 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5142 5594 break;
5143 5595
5144 5596 case 2 * 1024 * 1024:
5145 5597 if ((cp->cp_eax & 0xffff0000) == 0)
5146 5598 dtlb_nent = cp->cp_eax & 0x0000ffff;
5147 5599 else
5148 5600 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5149 5601 break;
5150 5602
5151 5603 default:
5152 5604 panic("unknown L2 pagesize");
5153 5605 /*NOTREACHED*/
5154 5606 }
5155 5607 }
5156 5608
5157 5609 if (dtlb_nent != 0)
5158 5610 return (dtlb_nent);
5159 5611
5160 5612 /*
5161 5613 * No L2 TLB support for this size, try L1.
5162 5614 */
5163 5615 if (cpi->cpi_xmaxeax >= 0x80000005) {
5164 5616 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5165 5617
5166 5618 switch (pagesize) {
5167 5619 case 4 * 1024:
5168 5620 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5169 5621 break;
5170 5622 case 2 * 1024 * 1024:
5171 5623 dtlb_nent = BITX(cp->cp_eax, 23, 16);
5172 5624 break;
5173 5625 default:
5174 5626 panic("unknown L1 d-TLB pagesize");
5175 5627 /*NOTREACHED*/
5176 5628 }
5177 5629 }
5178 5630
5179 5631 return (dtlb_nent);
5180 5632 }
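/*
 * Illustrative sketch (not part of the kernel build): decoding the 4K-page
 * L2 d-TLB entry count from leaf 0x80000006 %ebx the same way the routine
 * above does.  An all-zero upper half means a unified TLB whose entry count
 * is in the low 16 bits; otherwise the data-TLB count sits in bits 27:16.
 */
#include <stdint.h>

static unsigned int
l2_dtlb_4k_entries(uint32_t ebx)
{
	if ((ebx & 0xffff0000) == 0)
		return (ebx & 0x0000ffff);	/* unified TLB */
	return ((ebx >> 16) & 0x0fff);		/* separate d-TLB, bits 27:16 */
}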
5181 5633
5182 5634 /*
5183 5635 * Return 0 if the erratum is not present or not applicable, positive
5184 5636 * if it is, and negative if the status of the erratum is unknown.
5185 5637 *
5186 5638 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5187 5639 * Processors" #25759, Rev 3.57, August 2005
5188 5640 */
5189 5641 int
5190 5642 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5191 5643 {
5192 5644 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5193 5645 uint_t eax;
5194 5646
5195 5647 /*
5196 5648 * Bail out if this CPU isn't an AMD CPU, or if it's
5197 5649 * a legacy (32-bit) AMD CPU.
5198 5650 */
5199 5651 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5200 5652 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5201 5653 cpi->cpi_family == 6) {
5202 5654 return (0);
5203 5655 }
5204 5656
5205 5657 eax = cpi->cpi_std[1].cp_eax;
5206 5658
5207 5659 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
5208 5660 #define SH_B3(eax) (eax == 0xf51)
5209 5661 #define B(eax) (SH_B0(eax) || SH_B3(eax))
5210 5662
5211 5663 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
5212 5664
5213 5665 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5214 5666 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5215 5667 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
5216 5668 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5217 5669
5218 5670 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5219 5671 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
5220 5672 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
5221 5673 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5222 5674
5223 5675 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5224 5676 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
5225 5677 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
5226 5678 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
5227 5679 #define BH_E4(eax) (eax == 0x20fb1)
5228 5680 #define SH_E5(eax) (eax == 0x20f42)
5229 5681 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
5230 5682 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
5231 5683 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5232 5684 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5233 5685 DH_E6(eax) || JH_E6(eax))
5234 5686
5235 5687 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5236 5688 #define DR_B0(eax) (eax == 0x100f20)
5237 5689 #define DR_B1(eax) (eax == 0x100f21)
5238 5690 #define DR_BA(eax) (eax == 0x100f2a)
5239 5691 #define DR_B2(eax) (eax == 0x100f22)
5240 5692 #define DR_B3(eax) (eax == 0x100f23)
5241 5693 #define RB_C0(eax) (eax == 0x100f40)
5242 5694
5243 5695 switch (erratum) {
5244 5696 case 1:
5245 5697 return (cpi->cpi_family < 0x10);
5246 5698 case 51: /* what does the asterisk mean? */
5247 5699 return (B(eax) || SH_C0(eax) || CG(eax));
5248 5700 case 52:
5249 5701 return (B(eax));
5250 5702 case 57:
5251 5703 return (cpi->cpi_family <= 0x11);
5252 5704 case 58:
5253 5705 return (B(eax));
5254 5706 case 60:
5255 5707 return (cpi->cpi_family <= 0x11);
5256 5708 case 61:
5257 5709 case 62:
5258 5710 case 63:
5259 5711 case 64:
5260 5712 case 65:
5261 5713 case 66:
5262 5714 case 68:
5263 5715 case 69:
5264 5716 case 70:
5265 5717 case 71:
5266 5718 return (B(eax));
5267 5719 case 72:
5268 5720 return (SH_B0(eax));
5269 5721 case 74:
5270 5722 return (B(eax));
5271 5723 case 75:
5272 5724 return (cpi->cpi_family < 0x10);
5273 5725 case 76:
5274 5726 return (B(eax));
5275 5727 case 77:
5276 5728 return (cpi->cpi_family <= 0x11);
5277 5729 case 78:
5278 5730 return (B(eax) || SH_C0(eax));
5279 5731 case 79:
5280 5732 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5281 5733 case 80:
5282 5734 case 81:
5283 5735 case 82:
5284 5736 return (B(eax));
5285 5737 case 83:
5286 5738 return (B(eax) || SH_C0(eax) || CG(eax));
5287 5739 case 85:
5288 5740 return (cpi->cpi_family < 0x10);
5289 5741 case 86:
5290 5742 return (SH_C0(eax) || CG(eax));
5291 5743 case 88:
5292 5744 #if !defined(__amd64)
5293 5745 return (0);
5294 5746 #else
5295 5747 return (B(eax) || SH_C0(eax));
5296 5748 #endif
5297 5749 case 89:
5298 5750 return (cpi->cpi_family < 0x10);
5299 5751 case 90:
5300 5752 return (B(eax) || SH_C0(eax) || CG(eax));
5301 5753 case 91:
5302 5754 case 92:
5303 5755 return (B(eax) || SH_C0(eax));
5304 5756 case 93:
5305 5757 return (SH_C0(eax));
5306 5758 case 94:
5307 5759 return (B(eax) || SH_C0(eax) || CG(eax));
5308 5760 case 95:
5309 5761 #if !defined(__amd64)
5310 5762 return (0);
5311 5763 #else
5312 5764 return (B(eax) || SH_C0(eax));
5313 5765 #endif
5314 5766 case 96:
5315 5767 return (B(eax) || SH_C0(eax) || CG(eax));
5316 5768 case 97:
5317 5769 case 98:
5318 5770 return (SH_C0(eax) || CG(eax));
5319 5771 case 99:
5320 5772 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5321 5773 case 100:
5322 5774 return (B(eax) || SH_C0(eax));
5323 5775 case 101:
5324 5776 case 103:
5325 5777 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5326 5778 case 104:
5327 5779 return (SH_C0(eax) || CG(eax) || D0(eax));
5328 5780 case 105:
5329 5781 case 106:
5330 5782 case 107:
5331 5783 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5332 5784 case 108:
5333 5785 return (DH_CG(eax));
5334 5786 case 109:
5335 5787 return (SH_C0(eax) || CG(eax) || D0(eax));
5336 5788 case 110:
5337 5789 return (D0(eax) || EX(eax));
5338 5790 case 111:
5339 5791 return (CG(eax));
5340 5792 case 112:
5341 5793 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5342 5794 case 113:
5343 5795 return (eax == 0x20fc0);
5344 5796 case 114:
5345 5797 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5346 5798 case 115:
5347 5799 return (SH_E0(eax) || JH_E1(eax));
5348 5800 case 116:
5349 5801 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5350 5802 case 117:
5351 5803 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5352 5804 case 118:
5353 5805 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5354 5806 JH_E6(eax));
5355 5807 case 121:
5356 5808 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5357 5809 case 122:
5358 5810 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5359 5811 case 123:
5360 5812 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5361 5813 case 131:
5362 5814 return (cpi->cpi_family < 0x10);
5363 5815 case 6336786:
5364 5816
5365 5817 /*
5366 5818 * Test for AdvPowerMgmtInfo.TscPStateInvariant
5367 5819 * if this is a K8 family or newer processor. We're testing for
5368 5820 * this 'erratum' to determine whether or not we have a constant
5369 5821 * TSC.
5370 5822 *
5371 5823 * Our current fix for this is to disable the C1-Clock ramping.
5372 5824 * However, this doesn't work on newer processor families nor
5373 5825 * does it work when virtualized as those devices don't exist.
5374 5826 */
5375 5827 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5376 5828 return (0);
5377 5829 }
5378 5830
5379 5831 if (CPI_FAMILY(cpi) == 0xf) {
5380 5832 struct cpuid_regs regs;
5381 5833 regs.cp_eax = 0x80000007;
5382 5834 (void) __cpuid_insn(&regs);
5383 5835 return (!(regs.cp_edx & 0x100));
5384 5836 }
5385 5837 return (0);
5386 5838 case 6323525:
5387 5839 /*
5388 5840 * This erratum (K8 #147) is not present on family 10 and newer.
5389 5841 */
5390 5842 if (cpi->cpi_family >= 0x10) {
5391 5843 return (0);
5392 5844 }
5393 5845 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5394 5846 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5395 5847
5396 5848 case 6671130:
5397 5849 /*
5398 5850 * check for processors (pre-Shanghai) that do not provide
5399 5851 * optimal management of 1gb ptes in its tlb.
5400 5852 */
5401 5853 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5402 5854
5403 5855 case 298:
5404 5856 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5405 5857 DR_B2(eax) || RB_C0(eax));
5406 5858
5407 5859 case 721:
5408 5860 #if defined(__amd64)
5409 5861 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5410 5862 #else
5411 5863 return (0);
5412 5864 #endif
5413 5865
5414 5866 default:
5415 5867 return (-1);
5416 5868
5417 5869 }
5418 5870 }
5419 5871
5420 5872 /*
5421 5873 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5422 5874 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5423 5875 */
5424 5876 int
5425 5877 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5426 5878 {
5427 5879 struct cpuid_info *cpi;
5428 5880 uint_t osvwid;
5429 5881 static int osvwfeature = -1;
5430 5882 uint64_t osvwlength;
5431 5883
5432 5884
5433 5885 cpi = cpu->cpu_m.mcpu_cpi;
5434 5886
5435 5887 /* confirm OSVW supported */
5436 5888 if (osvwfeature == -1) {
5437 5889 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5438 5890 } else {
5439 5891 /* assert that osvw feature setting is consistent on all cpus */
5440 5892 ASSERT(osvwfeature ==
5441 5893 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5442 5894 }
5443 5895 if (!osvwfeature)
5444 5896 return (-1);
5445 5897
5446 5898 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5447 5899
5448 5900 switch (erratum) {
5449 5901 case 298: /* osvwid is 0 */
5450 5902 osvwid = 0;
5451 5903 if (osvwlength <= (uint64_t)osvwid) {
5452 5904 /* osvwid 0 is unknown */
5453 5905 return (-1);
5454 5906 }
5455 5907
5456 5908 /*
5457 5909 * Check the OSVW STATUS MSR to determine the state
5458 5910 * of the erratum where:
5459 5911 * 0 - fixed by HW
5460 5912 * 1 - BIOS has applied the workaround when BIOS
5461 5913 * workaround is available. (Or for other errata,
5462 5914 * OS workaround is required.)
5463 5915 * For a value of 1, caller will confirm that the
5464 5916 * erratum 298 workaround has indeed been applied by BIOS.
5465 5917 *
5466 5918 * A 1 may be set in cpus that have a HW fix
5467 5919 * in a mixed cpu system. Regarding erratum 298:
5468 5920 * In a multiprocessor platform, the workaround above
5469 5921 * should be applied to all processors regardless of
5470 5922 * silicon revision when an affected processor is
5471 5923 * present.
5472 5924 */
5473 5925
5474 5926 return (rdmsr(MSR_AMD_OSVW_STATUS +
5475 5927 (osvwid / OSVW_ID_CNT_PER_MSR)) &
5476 5928 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5477 5929
5478 5930 default:
5479 5931 return (-1);
5480 5932 }
5481 5933 }
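
As a worked example of the MSR/bit selection above (the per-MSR bit count is an assumption for illustration; the code itself only relies on the OSVW_ID_CNT_PER_MSR constant): erratum 298 has OSVW id 0, so with 64 status bits per MSR the function reads MSR_AMD_OSVW_STATUS + 0 and tests bit 0. An id of 70 would instead select MSR_AMD_OSVW_STATUS + 1, bit 6.
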
5482 5934
5483 5935 static const char assoc_str[] = "associativity";
5484 5936 static const char line_str[] = "line-size";
5485 5937 static const char size_str[] = "size";
5486 5938
5487 5939 static void
5488 5940 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5489 5941 uint32_t val)
5490 5942 {
5491 5943 char buf[128];
5492 5944
5493 5945 /*
5494 5946 * ndi_prop_update_int() is used because it is desirable for
5495 5947 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
5496 5948 */
5497 5949 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5498 5950 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5499 5951 }
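
For illustration only, the minimal user-space sketch below shows the "<label>-<type>" naming that add_cache_prop() builds before handing the value to ndi_prop_update_int(); the "l2-cache"/"size" strings mirror constants used later in this file, everything else is assumed for the example.

	#include <stdio.h>

	/*
	 * Stand-alone sketch (not kernel code): add_cache_prop() joins its
	 * label and type as "<label>-<type>", so "l2-cache" + "size" becomes
	 * the property name "l2-cache-size".
	 */
	int
	main(void)
	{
		char buf[128];

		if (snprintf(buf, sizeof (buf), "%s-%s", "l2-cache", "size") <
		    (int)sizeof (buf))
			(void) printf("property name: %s\n", buf);
		return (0);
	}
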
5500 5952
5501 5953 /*
5502 5954 * Intel-style cache/tlb description
5503 5955 *
5504 5956 * Standard cpuid level 2 gives a randomly ordered
5505 5957 * selection of tags that index into a table that describes
5506 5958 * cache and tlb properties.
5507 5959 */
5508 5960
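As background for the descriptor table that follows, here is a minimal user-space sketch (it assumes the GCC/clang <cpuid.h> helper, which this kernel file does not use) of how the raw leaf-2 descriptor bytes decoded by intel_ctab are laid out: each of %eax..%edx carries four one-byte descriptors, a register with bit 31 set carries none, and the low byte of %eax is an iteration count rather than a descriptor.

	#include <cpuid.h>
	#include <stdio.h>

	/*
	 * Stand-alone illustration only: dump the raw CPUID leaf-2
	 * descriptor bytes that a table such as intel_ctab interprets.
	 */
	int
	main(void)
	{
		unsigned int r[4], desc;
		int i, b;

		if (!__get_cpuid(2, &r[0], &r[1], &r[2], &r[3]))
			return (1);
		r[0] &= 0xffffff00;	/* low byte of %eax is an iteration count */
		for (i = 0; i < 4; i++) {
			if (r[i] & 0x80000000u)	/* bit 31 set: no valid descriptors */
				continue;
			for (b = 0; b < 4; b++) {
				desc = (r[i] >> (b * 8)) & 0xff;
				if (desc != 0)	/* 0 is the null descriptor */
					(void) printf("descriptor 0x%02x\n", desc);
			}
		}
		return (0);
	}
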
5509 5961 static const char l1_icache_str[] = "l1-icache";
5510 5962 static const char l1_dcache_str[] = "l1-dcache";
5511 5963 static const char l2_cache_str[] = "l2-cache";
5512 5964 static const char l3_cache_str[] = "l3-cache";
5513 5965 static const char itlb4k_str[] = "itlb-4K";
5514 5966 static const char dtlb4k_str[] = "dtlb-4K";
5515 5967 static const char itlb2M_str[] = "itlb-2M";
5516 5968 static const char itlb4M_str[] = "itlb-4M";
5517 5969 static const char dtlb4M_str[] = "dtlb-4M";
5518 5970 static const char dtlb24_str[] = "dtlb0-2M-4M";
5519 5971 static const char itlb424_str[] = "itlb-4K-2M-4M";
5520 5972 static const char itlb24_str[] = "itlb-2M-4M";
5521 5973 static const char dtlb44_str[] = "dtlb-4K-4M";
5522 5974 static const char sl1_dcache_str[] = "sectored-l1-dcache";
5523 5975 static const char sl2_cache_str[] = "sectored-l2-cache";
5524 5976 static const char itrace_str[] = "itrace-cache";
5525 5977 static const char sl3_cache_str[] = "sectored-l3-cache";
5526 5978 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
5527 5979
5528 5980 static const struct cachetab {
5529 5981 uint8_t ct_code;
5530 5982 uint8_t ct_assoc;
5531 5983 uint16_t ct_line_size;
5532 5984 size_t ct_size;
5533 5985 const char *ct_label;
5534 5986 } intel_ctab[] = {
5535 5987 /*
5536 5988 * maintain descending order!
5537 5989 *
5538 5990 * Codes ignored - Reason
5539 5991 * ----------------------
5540 5992 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
5541 5993 * f0H/f1H - Currently we do not interpret prefetch size by design
5542 5994 */
5543 5995 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
5544 5996 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
5545 5997 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
5546 5998 { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
5547 5999 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
5548 6000 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
5549 6001 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
5550 6002 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
5551 6003 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
5552 6004 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
5553 6005 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
5554 6006 { 0xd0, 4, 64, 512*1024, l3_cache_str},
5555 6007 { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
5556 6008 { 0xc0, 4, 0, 8, dtlb44_str },
5557 6009 { 0xba, 4, 0, 64, dtlb4k_str },
5558 6010 { 0xb4, 4, 0, 256, dtlb4k_str },
5559 6011 { 0xb3, 4, 0, 128, dtlb4k_str },
5560 6012 { 0xb2, 4, 0, 64, itlb4k_str },
5561 6013 { 0xb0, 4, 0, 128, itlb4k_str },
5562 6014 { 0x87, 8, 64, 1024*1024, l2_cache_str},
5563 6015 { 0x86, 4, 64, 512*1024, l2_cache_str},
5564 6016 { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
5565 6017 { 0x84, 8, 32, 1024*1024, l2_cache_str},
5566 6018 { 0x83, 8, 32, 512*1024, l2_cache_str},
5567 6019 { 0x82, 8, 32, 256*1024, l2_cache_str},
5568 6020 { 0x80, 8, 64, 512*1024, l2_cache_str},
5569 6021 { 0x7f, 2, 64, 512*1024, l2_cache_str},
5570 6022 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
5571 6023 { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
5572 6024 { 0x7b, 8, 64, 512*1024, sl2_cache_str},
5573 6025 { 0x7a, 8, 64, 256*1024, sl2_cache_str},
5574 6026 { 0x79, 8, 64, 128*1024, sl2_cache_str},
5575 6027 { 0x78, 8, 64, 1024*1024, l2_cache_str},
5576 6028 { 0x73, 8, 0, 64*1024, itrace_str},
5577 6029 { 0x72, 8, 0, 32*1024, itrace_str},
5578 6030 { 0x71, 8, 0, 16*1024, itrace_str},
5579 6031 { 0x70, 8, 0, 12*1024, itrace_str},
5580 6032 { 0x68, 4, 64, 32*1024, sl1_dcache_str},
5581 6033 { 0x67, 4, 64, 16*1024, sl1_dcache_str},
5582 6034 { 0x66, 4, 64, 8*1024, sl1_dcache_str},
5583 6035 { 0x60, 8, 64, 16*1024, sl1_dcache_str},
5584 6036 { 0x5d, 0, 0, 256, dtlb44_str},
5585 6037 { 0x5c, 0, 0, 128, dtlb44_str},
5586 6038 { 0x5b, 0, 0, 64, dtlb44_str},
5587 6039 { 0x5a, 4, 0, 32, dtlb24_str},
5588 6040 { 0x59, 0, 0, 16, dtlb4k_str},
5589 6041 { 0x57, 4, 0, 16, dtlb4k_str},
5590 6042 { 0x56, 4, 0, 16, dtlb4M_str},
5591 6043 { 0x55, 0, 0, 7, itlb24_str},
5592 6044 { 0x52, 0, 0, 256, itlb424_str},
5593 6045 { 0x51, 0, 0, 128, itlb424_str},
5594 6046 { 0x50, 0, 0, 64, itlb424_str},
5595 6047 { 0x4f, 0, 0, 32, itlb4k_str},
5596 6048 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
5597 6049 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
5598 6050 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
5599 6051 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
5600 6052 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
5601 6053 { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
5602 6054 { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
5603 6055 { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
5604 6056 { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
5605 6057 { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
5606 6058 { 0x44, 4, 32, 1024*1024, l2_cache_str},
5607 6059 { 0x43, 4, 32, 512*1024, l2_cache_str},
5608 6060 { 0x42, 4, 32, 256*1024, l2_cache_str},
5609 6061 { 0x41, 4, 32, 128*1024, l2_cache_str},
5610 6062 { 0x3e, 4, 64, 512*1024, sl2_cache_str},
5611 6063 { 0x3d, 6, 64, 384*1024, sl2_cache_str},
5612 6064 { 0x3c, 4, 64, 256*1024, sl2_cache_str},
5613 6065 { 0x3b, 2, 64, 128*1024, sl2_cache_str},
5614 6066 { 0x3a, 6, 64, 192*1024, sl2_cache_str},
5615 6067 { 0x39, 4, 64, 128*1024, sl2_cache_str},
5616 6068 { 0x30, 8, 64, 32*1024, l1_icache_str},
5617 6069 { 0x2c, 8, 64, 32*1024, l1_dcache_str},
5618 6070 { 0x29, 8, 64, 4096*1024, sl3_cache_str},
5619 6071 { 0x25, 8, 64, 2048*1024, sl3_cache_str},
5620 6072 { 0x23, 8, 64, 1024*1024, sl3_cache_str},
5621 6073 { 0x22, 4, 64, 512*1024, sl3_cache_str},
5622 6074 { 0x0e, 6, 64, 24*1024, l1_dcache_str},
5623 6075 { 0x0d, 4, 32, 16*1024, l1_dcache_str},
5624 6076 { 0x0c, 4, 32, 16*1024, l1_dcache_str},
5625 6077 { 0x0b, 4, 0, 4, itlb4M_str},
5626 6078 { 0x0a, 2, 32, 8*1024, l1_dcache_str},
5627 6079 { 0x08, 4, 32, 16*1024, l1_icache_str},
5628 6080 { 0x06, 4, 32, 8*1024, l1_icache_str},
5629 6081 { 0x05, 4, 0, 32, dtlb4M_str},
5630 6082 { 0x04, 4, 0, 8, dtlb4M_str},
5631 6083 { 0x03, 4, 0, 64, dtlb4k_str},
5632 6084 { 0x02, 4, 0, 2, itlb4M_str},
5633 6085 { 0x01, 4, 0, 32, itlb4k_str},
5634 6086 { 0 }
5635 6087 };
5636 6088
5637 6089 static const struct cachetab cyrix_ctab[] = {
5638 6090 { 0x70, 4, 0, 32, "tlb-4K" },
5639 6091 { 0x80, 4, 16, 16*1024, "l1-cache" },
5640 6092 { 0 }
5641 6093 };
5642 6094
5643 6095 /*
5644 6096 * Search a cache table for a matching entry
5645 6097 */
5646 6098 static const struct cachetab *
5647 6099 find_cacheent(const struct cachetab *ct, uint_t code)
5648 6100 {
5649 6101 if (code != 0) {
5650 6102 for (; ct->ct_code != 0; ct++)
5651 6103 if (ct->ct_code <= code)
5652 6104 break;
5653 6105 if (ct->ct_code == code)
5654 6106 return (ct);
5655 6107 }
5656 6108 return (NULL);
5657 6109 }
5658 6110
5659 6111 /*
5660 6112 * Populate cachetab entry with L2 or L3 cache-information using
5661 6113 * cpuid function 4. This function is called from intel_walk_cacheinfo()
5662 6114 * when descriptor 0x49 is encountered. It returns 0 if no such cache
5663 6115 * information is found.
5664 6116 */
5665 6117 static int
5666 6118 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
5667 6119 {
5668 6120 uint32_t level, i;
5669 6121 int ret = 0;
5670 6122
5671 6123 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
5672 6124 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
5673 6125
5674 6126 if (level == 2 || level == 3) {
5675 6127 ct->ct_assoc =
5676 6128 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
5677 6129 ct->ct_line_size =
5678 6130 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
5679 6131 ct->ct_size = ct->ct_assoc *
5680 6132 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
5681 6133 ct->ct_line_size *
5682 6134 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
5683 6135
5684 6136 if (level == 2) {
5685 6137 ct->ct_label = l2_cache_str;
5686 6138 } else if (level == 3) {
5687 6139 ct->ct_label = l3_cache_str;
5688 6140 }
5689 6141 ret = 1;
5690 6142 }
5691 6143 }
5692 6144
5693 6145 return (ret);
5694 6146 }
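
As a worked example of the ct_size computation above (the leaf-4 values are assumed for illustration): a level-2 entry reporting 8 ways, 1 partition, a 64-byte coherency line size and cp_ecx + 1 = 4096 sets yields 8 * 1 * 64 * 4096 = 2097152 bytes, which is recorded as a 2MB l2-cache.
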
5695 6147
5696 6148 /*
5697 6149 * Walk the cacheinfo descriptor, applying 'func' to every valid element.
5698 6150 * The walk is terminated if the walker returns non-zero.
5699 6151 */
5700 6152 static void
5701 6153 intel_walk_cacheinfo(struct cpuid_info *cpi,
5702 6154 void *arg, int (*func)(void *, const struct cachetab *))
5703 6155 {
5704 6156 const struct cachetab *ct;
5705 6157 struct cachetab des_49_ct, des_b1_ct;
5706 6158 uint8_t *dp;
5707 6159 int i;
5708 6160
5709 6161 if ((dp = cpi->cpi_cacheinfo) == NULL)
5710 6162 return;
5711 6163 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
5712 6164 /*
5713 6165 * For overloaded descriptor 0x49 we use cpuid function 4
5714 6166 * if supported by the current processor, to create
5715 6167 * cache information.
5716 6168 * For overloaded descriptor 0xb1 we use X86_PAE flag
5717 6169 * to disambiguate the cache information.
5718 6170 */
5719 6171 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
5720 6172 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
5721 6173 ct = &des_49_ct;
5722 6174 } else if (*dp == 0xb1) {
5723 6175 des_b1_ct.ct_code = 0xb1;
5724 6176 des_b1_ct.ct_assoc = 4;
5725 6177 des_b1_ct.ct_line_size = 0;
5726 6178 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
5727 6179 des_b1_ct.ct_size = 8;
5728 6180 des_b1_ct.ct_label = itlb2M_str;
5729 6181 } else {
5730 6182 des_b1_ct.ct_size = 4;
5731 6183 des_b1_ct.ct_label = itlb4M_str;
5732 6184 }
5733 6185 ct = &des_b1_ct;
5734 6186 } else {
5735 6187 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
5736 6188 continue;
5737 6189 }
5738 6190 }
5739 6191
5740 6192 if (func(arg, ct) != 0) {
5741 6193 break;
5742 6194 }
5743 6195 }
5744 6196 }
5745 6197
5746 6198 /*
5747 6199 * (Like the Intel one, except for Cyrix CPUs)
5748 6200 */
5749 6201 static void
5750 6202 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
5751 6203 void *arg, int (*func)(void *, const struct cachetab *))
5752 6204 {
5753 6205 const struct cachetab *ct;
5754 6206 uint8_t *dp;
5755 6207 int i;
5756 6208
5757 6209 if ((dp = cpi->cpi_cacheinfo) == NULL)
5758 6210 return;
5759 6211 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
5760 6212 /*
5761 6213 * Search Cyrix-specific descriptor table first ..
5762 6214 */
5763 6215 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
5764 6216 if (func(arg, ct) != 0)
5765 6217 break;
5766 6218 continue;
5767 6219 }
5768 6220 /*
5769 6221 * .. else fall back to the Intel one
5770 6222 */
5771 6223 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
5772 6224 if (func(arg, ct) != 0)
5773 6225 break;
5774 6226 continue;
5775 6227 }
5776 6228 }
5777 6229 }
5778 6230
5779 6231 /*
5780 6232 * A cacheinfo walker that adds associativity, line-size, and size properties
5781 6233 * to the devinfo node it is passed as an argument.
5782 6234 */
5783 6235 static int
5784 6236 add_cacheent_props(void *arg, const struct cachetab *ct)
5785 6237 {
5786 6238 dev_info_t *devi = arg;
5787 6239
5788 6240 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
5789 6241 if (ct->ct_line_size != 0)
5790 6242 add_cache_prop(devi, ct->ct_label, line_str,
5791 6243 ct->ct_line_size);
5792 6244 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
5793 6245 return (0);
5794 6246 }
5795 6247
5796 6248
5797 6249 static const char fully_assoc[] = "fully-associative?";
5798 6250
5799 6251 /*
5800 6252 * AMD style cache/tlb description
5801 6253 *
5802 6254 * Extended functions 5 and 6 directly describe properties of
5803 6255 * tlbs and various cache levels.
5804 6256 */
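
The routines below pull individual fields out of the extended-leaf registers with BITX(reg, high, low). The stand-alone sketch below uses a plausible stand-in for that macro (an assumption; the kernel's own definition lives in a system header) and a made-up leaf 0x80000005 %eax value, just to show the field layout the dtlb-2M/itlb-2M code relies on.

	#include <stdio.h>

	/* Stand-alone sketch, not kernel code; the %eax value is invented. */
	#define	BITX_EXAMPLE(u, h, l) \
		(((u) >> (l)) & ((1ULL << ((h) - (l) + 1)) - 1))

	int
	main(void)
	{
		unsigned int eax = 0xff40ff40;	/* pretend leaf 0x80000005 %eax */

		/* 2M/4M D-TLB: associativity in bits 31:24, entries in 23:16 */
		(void) printf("dtlb-2M assoc %llu, entries %llu\n",
		    BITX_EXAMPLE(eax, 31, 24), BITX_EXAMPLE(eax, 23, 16));
		/* 2M/4M I-TLB: associativity in bits 15:8, entries in 7:0 */
		(void) printf("itlb-2M assoc %llu, entries %llu\n",
		    BITX_EXAMPLE(eax, 15, 8), BITX_EXAMPLE(eax, 7, 0));
		return (0);
	}
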
5805 6257 static void
5806 6258 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
5807 6259 {
5808 6260 switch (assoc) {
5809 6261 case 0: /* reserved; ignore */
5810 6262 break;
5811 6263 default:
5812 6264 add_cache_prop(devi, label, assoc_str, assoc);
5813 6265 break;
5814 6266 case 0xff:
5815 6267 add_cache_prop(devi, label, fully_assoc, 1);
5816 6268 break;
5817 6269 }
5818 6270 }
5819 6271
5820 6272 static void
5821 6273 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
5822 6274 {
5823 6275 if (size == 0)
5824 6276 return;
5825 6277 add_cache_prop(devi, label, size_str, size);
5826 6278 add_amd_assoc(devi, label, assoc);
5827 6279 }
5828 6280
5829 6281 static void
5830 6282 add_amd_cache(dev_info_t *devi, const char *label,
5831 6283 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
5832 6284 {
5833 6285 if (size == 0 || line_size == 0)
5834 6286 return;
5835 6287 add_amd_assoc(devi, label, assoc);
5836 6288 /*
5837 6289 * Most AMD parts have a sectored cache. Multiple cache lines are
5838 6290 * associated with each tag. A sector consists of all cache lines
5839 6291 * associated with a tag. For example, the AMD K6-III has a sector
5840 6292 * size of 2 cache lines per tag.
5841 6293 */
5842 6294 if (lines_per_tag != 0)
5843 6295 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
5844 6296 add_cache_prop(devi, label, line_str, line_size);
5845 6297 add_cache_prop(devi, label, size_str, size * 1024);
5846 6298 }
5847 6299
5848 6300 static void
5849 6301 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
5850 6302 {
5851 6303 switch (assoc) {
5852 6304 case 0: /* off */
5853 6305 break;
5854 6306 case 1:
5855 6307 case 2:
5856 6308 case 4:
5857 6309 add_cache_prop(devi, label, assoc_str, assoc);
5858 6310 break;
5859 6311 case 6:
5860 6312 add_cache_prop(devi, label, assoc_str, 8);
5861 6313 break;
5862 6314 case 8:
5863 6315 add_cache_prop(devi, label, assoc_str, 16);
5864 6316 break;
5865 6317 case 0xf:
5866 6318 add_cache_prop(devi, label, fully_assoc, 1);
5867 6319 break;
5868 6320 default: /* reserved; ignore */
5869 6321 break;
5870 6322 }
5871 6323 }
5872 6324
5873 6325 static void
5874 6326 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
5875 6327 {
5876 6328 if (size == 0 || assoc == 0)
5877 6329 return;
5878 6330 add_amd_l2_assoc(devi, label, assoc);
5879 6331 add_cache_prop(devi, label, size_str, size);
5880 6332 }
5881 6333
5882 6334 static void
5883 6335 add_amd_l2_cache(dev_info_t *devi, const char *label,
5884 6336 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
5885 6337 {
5886 6338 if (size == 0 || assoc == 0 || line_size == 0)
5887 6339 return;
5888 6340 add_amd_l2_assoc(devi, label, assoc);
5889 6341 if (lines_per_tag != 0)
5890 6342 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
5891 6343 add_cache_prop(devi, label, line_str, line_size);
5892 6344 add_cache_prop(devi, label, size_str, size * 1024);
5893 6345 }
5894 6346
5895 6347 static void
5896 6348 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
5897 6349 {
5898 6350 struct cpuid_regs *cp;
5899 6351
5900 6352 if (cpi->cpi_xmaxeax < 0x80000005)
5901 6353 return;
5902 6354 cp = &cpi->cpi_extd[5];
5903 6355
5904 6356 /*
5905 6357 * 4M/2M L1 TLB configuration
5906 6358 *
5907 6359 * We report the size for 2M pages because AMD uses two
5908 6360 * TLB entries for one 4M page.
5909 6361 */
5910 6362 add_amd_tlb(devi, "dtlb-2M",
5911 6363 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
5912 6364 add_amd_tlb(devi, "itlb-2M",
5913 6365 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
5914 6366
5915 6367 /*
5916 6368 * 4K L1 TLB configuration
5917 6369 */
5918 6370
5919 6371 switch (cpi->cpi_vendor) {
5920 6372 uint_t nentries;
5921 6373 case X86_VENDOR_TM:
5922 6374 if (cpi->cpi_family >= 5) {
5923 6375 /*
5924 6376 * Crusoe processors have 256 TLB entries, but
5925 6377 * cpuid data format constrains them to only
5926 6378 * reporting 255 of them.
5927 6379 */
5928 6380 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
5929 6381 nentries = 256;
5930 6382 /*
5931 6383 * Crusoe processors also have a unified TLB
5932 6384 */
5933 6385 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
5934 6386 nentries);
5935 6387 break;
5936 6388 }
5937 6389 /*FALLTHROUGH*/
5938 6390 default:
5939 6391 add_amd_tlb(devi, itlb4k_str,
5940 6392 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
5941 6393 add_amd_tlb(devi, dtlb4k_str,
5942 6394 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
5943 6395 break;
5944 6396 }
5945 6397
5946 6398 /*
5947 6399 * data L1 cache configuration
5948 6400 */
5949 6401
5950 6402 add_amd_cache(devi, l1_dcache_str,
5951 6403 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
5952 6404 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
5953 6405
5954 6406 /*
5955 6407 * code L1 cache configuration
5956 6408 */
5957 6409
5958 6410 add_amd_cache(devi, l1_icache_str,
5959 6411 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
5960 6412 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
5961 6413
5962 6414 if (cpi->cpi_xmaxeax < 0x80000006)
5963 6415 return;
5964 6416 cp = &cpi->cpi_extd[6];
5965 6417
5966 6418 /* Check for a unified L2 TLB for large pages */
5967 6419
5968 6420 if (BITX(cp->cp_eax, 31, 16) == 0)
5969 6421 add_amd_l2_tlb(devi, "l2-tlb-2M",
5970 6422 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
5971 6423 else {
5972 6424 add_amd_l2_tlb(devi, "l2-dtlb-2M",
5973 6425 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
5974 6426 add_amd_l2_tlb(devi, "l2-itlb-2M",
5975 6427 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
5976 6428 }
5977 6429
5978 6430 /* Check for a unified L2 TLB for 4K pages */
5979 6431
5980 6432 if (BITX(cp->cp_ebx, 31, 16) == 0) {
5981 6433 add_amd_l2_tlb(devi, "l2-tlb-4K",
5982 6434 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
5983 6435 } else {
5984 6436 add_amd_l2_tlb(devi, "l2-dtlb-4K",
5985 6437 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
5986 6438 add_amd_l2_tlb(devi, "l2-itlb-4K",
5987 6439 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
5988 6440 }
5989 6441
5990 6442 add_amd_l2_cache(devi, l2_cache_str,
5991 6443 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
5992 6444 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
5993 6445 }
5994 6446
5995 6447 /*
5996 6448 * There are two basic ways that the x86 world describes its cache
5997 6449 * and tlb architecture - Intel's way and AMD's way.
5998 6450 *
5999 6451 * Return which flavor of cache architecture we should use
6000 6452 */
6001 6453 static int
6002 6454 x86_which_cacheinfo(struct cpuid_info *cpi)
6003 6455 {
6004 6456 switch (cpi->cpi_vendor) {
6005 6457 case X86_VENDOR_Intel:
6006 6458 if (cpi->cpi_maxeax >= 2)
6007 6459 return (X86_VENDOR_Intel);
6008 6460 break;
6009 6461 case X86_VENDOR_AMD:
6010 6462 /*
6011 6463 * The K5 model 1 was the first part from AMD that reported
6012 6464 * cache sizes via extended cpuid functions.
6013 6465 */
6014 6466 if (cpi->cpi_family > 5 ||
6015 6467 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6016 6468 return (X86_VENDOR_AMD);
6017 6469 break;
6018 6470 case X86_VENDOR_TM:
6019 6471 if (cpi->cpi_family >= 5)
6020 6472 return (X86_VENDOR_AMD);
6021 6473 /*FALLTHROUGH*/
6022 6474 default:
6023 6475 /*
6024 6476 * If they have extended CPU data for 0x80000005
6025 6477 * then we assume they have AMD-format cache
6026 6478 * information.
6027 6479 *
6028 6480 * If not, and the vendor happens to be Cyrix,
6029 6481 * then try our Cyrix-specific handler.
6030 6482 *
6031 6483 * If we're not Cyrix, then assume we're using Intel's
6032 6484 * table-driven format instead.
6033 6485 */
6034 6486 if (cpi->cpi_xmaxeax >= 0x80000005)
6035 6487 return (X86_VENDOR_AMD);
6036 6488 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6037 6489 return (X86_VENDOR_Cyrix);
6038 6490 else if (cpi->cpi_maxeax >= 2)
6039 6491 return (X86_VENDOR_Intel);
6040 6492 break;
6041 6493 }
6042 6494 return (-1);
6043 6495 }
6044 6496
6045 6497 void
6046 6498 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6047 6499 struct cpuid_info *cpi)
6048 6500 {
6049 6501 dev_info_t *cpu_devi;
6050 6502 int create;
6051 6503
6052 6504 cpu_devi = (dev_info_t *)dip;
6053 6505
6054 6506 /* device_type */
6055 6507 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6056 6508 "device_type", "cpu");
6057 6509
6058 6510 /* reg */
6059 6511 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6060 6512 "reg", cpu_id);
6061 6513
6062 6514 /* cpu-mhz, and clock-frequency */
6063 6515 if (cpu_freq > 0) {
6064 6516 long long mul;
6065 6517
6066 6518 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6067 6519 "cpu-mhz", cpu_freq);
6068 6520 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6069 6521 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6070 6522 "clock-frequency", (int)mul);
6071 6523 }
6072 6524
6073 6525 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6074 6526 return;
6075 6527 }
6076 6528
6077 6529 /* vendor-id */
6078 6530 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6079 6531 "vendor-id", cpi->cpi_vendorstr);
6080 6532
6081 6533 if (cpi->cpi_maxeax == 0) {
6082 6534 return;
6083 6535 }
6084 6536
6085 6537 /*
6086 6538 * family, model, and step
6087 6539 */
6088 6540 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6089 6541 "family", CPI_FAMILY(cpi));
6090 6542 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6091 6543 "cpu-model", CPI_MODEL(cpi));
6092 6544 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6093 6545 "stepping-id", CPI_STEP(cpi));
6094 6546
6095 6547 /* type */
6096 6548 switch (cpi->cpi_vendor) {
6097 6549 case X86_VENDOR_Intel:
6098 6550 create = 1;
6099 6551 break;
6100 6552 default:
6101 6553 create = 0;
6102 6554 break;
6103 6555 }
6104 6556 if (create)
6105 6557 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6106 6558 "type", CPI_TYPE(cpi));
6107 6559
6108 6560 /* ext-family */
6109 6561 switch (cpi->cpi_vendor) {
6110 6562 case X86_VENDOR_Intel:
6111 6563 case X86_VENDOR_AMD:
6112 6564 create = cpi->cpi_family >= 0xf;
6113 6565 break;
6114 6566 default:
6115 6567 create = 0;
6116 6568 break;
6117 6569 }
6118 6570 if (create)
6119 6571 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6120 6572 "ext-family", CPI_FAMILY_XTD(cpi));
6121 6573
6122 6574 /* ext-model */
6123 6575 switch (cpi->cpi_vendor) {
6124 6576 case X86_VENDOR_Intel:
6125 6577 create = IS_EXTENDED_MODEL_INTEL(cpi);
6126 6578 break;
6127 6579 case X86_VENDOR_AMD:
6128 6580 create = CPI_FAMILY(cpi) == 0xf;
6129 6581 break;
6130 6582 default:
6131 6583 create = 0;
6132 6584 break;
6133 6585 }
6134 6586 if (create)
6135 6587 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6136 6588 "ext-model", CPI_MODEL_XTD(cpi));
6137 6589
6138 6590 /* generation */
6139 6591 switch (cpi->cpi_vendor) {
6140 6592 case X86_VENDOR_AMD:
6141 6593 /*
6142 6594 * AMD K5 model 1 was the first part to support this
6143 6595 */
6144 6596 create = cpi->cpi_xmaxeax >= 0x80000001;
6145 6597 break;
6146 6598 default:
6147 6599 create = 0;
6148 6600 break;
6149 6601 }
6150 6602 if (create)
6151 6603 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6152 6604 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6153 6605
6154 6606 /* brand-id */
6155 6607 switch (cpi->cpi_vendor) {
6156 6608 case X86_VENDOR_Intel:
6157 6609 /*
6158 6610 * brand id first appeared on Pentium III Xeon model 8,
6159 6611 * and Celeron model 8 processors and Opteron
6160 6612 */
6161 6613 create = cpi->cpi_family > 6 ||
6162 6614 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6163 6615 break;
6164 6616 case X86_VENDOR_AMD:
6165 6617 create = cpi->cpi_family >= 0xf;
6166 6618 break;
6167 6619 default:
6168 6620 create = 0;
6169 6621 break;
6170 6622 }
6171 6623 if (create && cpi->cpi_brandid != 0) {
6172 6624 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6173 6625 "brand-id", cpi->cpi_brandid);
6174 6626 }
6175 6627
6176 6628 /* chunks, and apic-id */
6177 6629 switch (cpi->cpi_vendor) {
6178 6630 /*
6179 6631 * first available on Pentium IV and Opteron (K8)
6180 6632 */
6181 6633 case X86_VENDOR_Intel:
6182 6634 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6183 6635 break;
6184 6636 case X86_VENDOR_AMD:
6185 6637 create = cpi->cpi_family >= 0xf;
6186 6638 break;
6187 6639 default:
6188 6640 create = 0;
6189 6641 break;
6190 6642 }
6191 6643 if (create) {
6192 6644 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6193 6645 "chunks", CPI_CHUNKS(cpi));
6194 6646 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6195 6647 "apic-id", cpi->cpi_apicid);
6196 6648 if (cpi->cpi_chipid >= 0) {
6197 6649 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6198 6650 "chip#", cpi->cpi_chipid);
6199 6651 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6200 6652 "clog#", cpi->cpi_clogid);
6201 6653 }
6202 6654 }
6203 6655
6204 6656 /* cpuid-features */
6205 6657 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6206 6658 "cpuid-features", CPI_FEATURES_EDX(cpi));
6207 6659
6208 6660
6209 6661 /* cpuid-features-ecx */
6210 6662 switch (cpi->cpi_vendor) {
6211 6663 case X86_VENDOR_Intel:
6212 6664 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6213 6665 break;
6214 6666 case X86_VENDOR_AMD:
6215 6667 create = cpi->cpi_family >= 0xf;
6216 6668 break;
6217 6669 default:
6218 6670 create = 0;
6219 6671 break;
6220 6672 }
6221 6673 if (create)
6222 6674 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6223 6675 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6224 6676
6225 6677 /* ext-cpuid-features */
6226 6678 switch (cpi->cpi_vendor) {
6227 6679 case X86_VENDOR_Intel:
6228 6680 case X86_VENDOR_AMD:
6229 6681 case X86_VENDOR_Cyrix:
6230 6682 case X86_VENDOR_TM:
6231 6683 case X86_VENDOR_Centaur:
6232 6684 create = cpi->cpi_xmaxeax >= 0x80000001;
6233 6685 break;
6234 6686 default:
6235 6687 create = 0;
6236 6688 break;
6237 6689 }
6238 6690 if (create) {
6239 6691 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6240 6692 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6241 6693 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6242 6694 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6243 6695 }
6244 6696
6245 6697 /*
6246 6698 * Brand String first appeared in Intel Pentium IV, AMD K5
6247 6699 * model 1, and Cyrix GXm. On earlier models we try and
6248 6700 * simulate something similar, so this string should always
6249 6701 * say -something- about the processor, however lame.
6250 6702 */
6251 6703 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6252 6704 "brand-string", cpi->cpi_brandstr);
6253 6705
6254 6706 /*
6255 6707 * Finally, cache and tlb information
6256 6708 */
6257 6709 switch (x86_which_cacheinfo(cpi)) {
6258 6710 case X86_VENDOR_Intel:
6259 6711 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6260 6712 break;
6261 6713 case X86_VENDOR_Cyrix:
6262 6714 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6263 6715 break;
6264 6716 case X86_VENDOR_AMD:
6265 6717 amd_cache_info(cpi, cpu_devi);
6266 6718 break;
6267 6719 default:
6268 6720 break;
6269 6721 }
6270 6722 }
6271 6723
6272 6724 struct l2info {
6273 6725 int *l2i_csz;
6274 6726 int *l2i_lsz;
6275 6727 int *l2i_assoc;
6276 6728 int l2i_ret;
6277 6729 };
6278 6730
6279 6731 /*
6280 6732 * A cacheinfo walker that fetches the size, line-size and associativity
6281 6733 * of the L2 cache
6282 6734 */
6283 6735 static int
6284 6736 intel_l2cinfo(void *arg, const struct cachetab *ct)
6285 6737 {
6286 6738 struct l2info *l2i = arg;
6287 6739 int *ip;
6288 6740
6289 6741 if (ct->ct_label != l2_cache_str &&
6290 6742 ct->ct_label != sl2_cache_str)
6291 6743 return (0); /* not an L2 -- keep walking */
6292 6744
6293 6745 if ((ip = l2i->l2i_csz) != NULL)
6294 6746 *ip = ct->ct_size;
6295 6747 if ((ip = l2i->l2i_lsz) != NULL)
6296 6748 *ip = ct->ct_line_size;
6297 6749 if ((ip = l2i->l2i_assoc) != NULL)
6298 6750 *ip = ct->ct_assoc;
6299 6751 l2i->l2i_ret = ct->ct_size;
6300 6752 return (1); /* was an L2 -- terminate walk */
6301 6753 }
6302 6754
6303 6755 /*
6304 6756 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6305 6757 *
6306 6758 * Unlike the associativity for the L1 cache and tlb where the 8 bit
6307 6759 * value is the associativity, the associativity for the L2 cache and
6308 6760 * tlb is encoded in the following table. The 4 bit L2 value serves as
6309 6761 * an index into the amd_afd[] array to determine the associativity.
6310 6762 * -1 is undefined. 0 is fully associative.
6311 6763 */
6312 6764
6313 6765 static int amd_afd[] =
6314 6766 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
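
For example, an encoded L2/L3 associativity field of 6 indexes amd_afd[6] and is reported as 8-way, 0xA maps to 32-way, and 0xF maps to 0, i.e. fully associative; the -1 slots are the reserved (undefined) encodings.
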
6315 6767
6316 6768 static void
6317 6769 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6318 6770 {
6319 6771 struct cpuid_regs *cp;
6320 6772 uint_t size, assoc;
6321 6773 int i;
6322 6774 int *ip;
6323 6775
6324 6776 if (cpi->cpi_xmaxeax < 0x80000006)
6325 6777 return;
6326 6778 cp = &cpi->cpi_extd[6];
6327 6779
6328 6780 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6329 6781 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6330 6782 uint_t cachesz = size * 1024;
6331 6783 assoc = amd_afd[i];
6332 6784
6333 6785 ASSERT(assoc != -1);
6334 6786
6335 6787 if ((ip = l2i->l2i_csz) != NULL)
6336 6788 *ip = cachesz;
6337 6789 if ((ip = l2i->l2i_lsz) != NULL)
6338 6790 *ip = BITX(cp->cp_ecx, 7, 0);
6339 6791 if ((ip = l2i->l2i_assoc) != NULL)
6340 6792 *ip = assoc;
6341 6793 l2i->l2i_ret = cachesz;
6342 6794 }
6343 6795 }
6344 6796
6345 6797 int
6346 6798 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6347 6799 {
6348 6800 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6349 6801 struct l2info __l2info, *l2i = &__l2info;
6350 6802
6351 6803 l2i->l2i_csz = csz;
6352 6804 l2i->l2i_lsz = lsz;
6353 6805 l2i->l2i_assoc = assoc;
6354 6806 l2i->l2i_ret = -1;
6355 6807
6356 6808 switch (x86_which_cacheinfo(cpi)) {
6357 6809 case X86_VENDOR_Intel:
6358 6810 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6359 6811 break;
6360 6812 case X86_VENDOR_Cyrix:
6361 6813 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6362 6814 break;
6363 6815 case X86_VENDOR_AMD:
6364 6816 amd_l2cacheinfo(cpi, l2i);
6365 6817 break;
6366 6818 default:
6367 6819 break;
6368 6820 }
6369 6821 return (l2i->l2i_ret);
6370 6822 }
6371 6823
6372 6824 #if !defined(__xpv)
6373 6825
6374 6826 uint32_t *
6375 6827 cpuid_mwait_alloc(cpu_t *cpu)
6376 6828 {
6377 6829 uint32_t *ret;
6378 6830 size_t mwait_size;
6379 6831
6380 6832 ASSERT(cpuid_checkpass(CPU, 2));
6381 6833
6382 6834 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6383 6835 if (mwait_size == 0)
6384 6836 return (NULL);
6385 6837
6386 6838 /*
6387 6839 * kmem_alloc() returns cache line size aligned data for mwait_size
6388 6840 * allocations. mwait_size is currently cache line sized. Neither
6389 6841 * of these implementation details is guaranteed to be true in the
6390 6842 * future.
6391 6843 *
6392 6844 * First try allocating mwait_size as kmem_alloc() currently returns
6393 6845 * correctly aligned memory. If kmem_alloc() does not return
6394 6846 * mwait_size aligned memory, then use mwait_size ROUNDUP.
6395 6847 *
6396 6848 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6397 6849 * decide to free this memory.
6398 6850 */
6399 6851 ret = kmem_zalloc(mwait_size, KM_SLEEP);
6400 6852 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6401 6853 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6402 6854 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6403 6855 *ret = MWAIT_RUNNING;
6404 6856 return (ret);
6405 6857 } else {
6406 6858 kmem_free(ret, mwait_size);
6407 6859 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6408 6860 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6409 6861 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6410 6862 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6411 6863 *ret = MWAIT_RUNNING;
6412 6864 return (ret);
6413 6865 }
6414 6866 }
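
The fallback path above over-allocates by 2 * mwait_size and then rounds the pointer up so that an mwait_size-aligned, mwait_size-sized region is guaranteed to fit. The stand-alone sketch below illustrates the rounding arithmetic with a common power-of-two round-up definition and made-up addresses (both are assumptions; the kernel's P2ROUNDUP macro comes from a system header).

	#include <stdio.h>
	#include <stdint.h>

	/* Stand-alone sketch; macro and addresses are illustrative only. */
	#define	ROUNDUP_EXAMPLE(x, align) \
		(((uintptr_t)(x) + ((uintptr_t)(align) - 1)) & \
		~((uintptr_t)(align) - 1))

	int
	main(void)
	{
		uintptr_t buf = 0x1050;		/* pretend kmem_zalloc() result */
		uintptr_t mwait_size = 0x40;	/* pretend MONITOR line size */

		/*
		 * 0x1050 is not 0x40-aligned; rounding up inside a buffer of
		 * 2 * mwait_size yields 0x1080, which is aligned and still
		 * leaves a full mwait_size of space before the buffer ends.
		 */
		(void) printf("aligned pointer: %#lx\n",
		    (unsigned long)ROUNDUP_EXAMPLE(buf, mwait_size));
		return (0);
	}
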
6415 6867
6416 6868 void
6417 6869 cpuid_mwait_free(cpu_t *cpu)
6418 6870 {
6419 6871 if (cpu->cpu_m.mcpu_cpi == NULL) {
6420 6872 return;
6421 6873 }
6422 6874
6423 6875 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6424 6876 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6425 6877 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6426 6878 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6427 6879 }
6428 6880
6429 6881 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6430 6882 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6431 6883 }
6432 6884
6433 6885 void
6434 6886 patch_tsc_read(int flag)
6435 6887 {
6436 6888 size_t cnt;
6437 6889
6438 6890 switch (flag) {
6439 6891 case TSC_NONE:
6440 6892 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6441 6893 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6442 6894 break;
6443 6895 case TSC_RDTSC_MFENCE:
6444 6896 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6445 6897 (void) memcpy((void *)tsc_read,
6446 6898 (void *)&_tsc_mfence_start, cnt);
6447 6899 break;
6448 6900 case TSC_RDTSC_LFENCE:
6449 6901 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6450 6902 (void) memcpy((void *)tsc_read,
6451 6903 (void *)&_tsc_lfence_start, cnt);
6452 6904 break;
6453 6905 case TSC_TSCP:
6454 6906 cnt = &_tscp_end - &_tscp_start;
6455 6907 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6456 6908 break;
6457 6909 default:
6458 6910 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6459 6911 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
6460 6912 break;
6461 6913 }
6462 6914 tsc_type = flag;
6463 6915 }
6464 6916
6465 6917 int
6466 6918 cpuid_deep_cstates_supported(void)
6467 6919 {
6468 6920 struct cpuid_info *cpi;
6469 6921 struct cpuid_regs regs;
6470 6922
6471 6923 ASSERT(cpuid_checkpass(CPU, 1));
6472 6924
6473 6925 cpi = CPU->cpu_m.mcpu_cpi;
6474 6926
6475 6927 if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6476 6928 return (0);
6477 6929
6478 6930 switch (cpi->cpi_vendor) {
6479 6931 case X86_VENDOR_Intel:
6480 6932 if (cpi->cpi_xmaxeax < 0x80000007)
6481 6933 return (0);
6482 6934
6483 6935 /*
6484 6936 * Does the TSC run at a constant rate in all ACPI C-states?
6485 6937 */
6486 6938 regs.cp_eax = 0x80000007;
6487 6939 (void) __cpuid_insn(&regs);
6488 6940 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6489 6941
6490 6942 default:
6491 6943 return (0);
6492 6944 }
6493 6945 }
6494 6946
6495 6947 #endif /* !__xpv */
6496 6948
6497 6949 void
6498 6950 post_startup_cpu_fixups(void)
6499 6951 {
6500 6952 #ifndef __xpv
6501 6953 /*
6502 6954 * Some AMD processors support C1E state. Entering this state will
6503 6955 * cause the local APIC timer to stop, which we can't deal with at
6504 6956 * this time.
6505 6957 */
6506 6958 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6507 6959 on_trap_data_t otd;
6508 6960 uint64_t reg;
6509 6961
6510 6962 if (!on_trap(&otd, OT_DATA_ACCESS)) {
6511 6963 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6512 6964 /* Disable C1E state if it is enabled by BIOS */
6513 6965 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
6514 6966 AMD_ACTONCMPHALT_MASK) {
6515 6967 reg &= ~(AMD_ACTONCMPHALT_MASK <<
6516 6968 AMD_ACTONCMPHALT_SHIFT);
6517 6969 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
6518 6970 }
6519 6971 }
6520 6972 no_trap();
6521 6973 }
6522 6974 #endif /* !__xpv */
6523 6975 }
6524 6976
6525 6977 void
6526 6978 enable_pcid(void)
6527 6979 {
6528 6980 if (x86_use_pcid == -1)
6529 6981 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
6530 6982
6531 6983 if (x86_use_invpcid == -1) {
6532 6984 x86_use_invpcid = is_x86_feature(x86_featureset,
6533 6985 X86FSET_INVPCID);
6534 6986 }
6535 6987
6536 6988 if (!x86_use_pcid)
6537 6989 return;
6538 6990
6539 6991 /*
6540 6992 * Intel says that on setting PCIDE, the CPU immediately starts using the PCID
6541 6993 * bits; better make sure there's nothing there.
6542 6994 */
6543 6995 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
6544 6996
6545 6997 setcr4(getcr4() | CR4_PCIDE);
6546 6998 }
6547 6999
6548 7000 /*
6549 7001 * Setup necessary registers to enable XSAVE feature on this processor.
6550 7002 * This function needs to be called early enough, so that no xsave/xrstor
6551 7003 * ops will execute on the processor before the MSRs are properly set up.
6552 7004 *
6553 7005 * Current implementation has the following assumption:
6554 7006 * - cpuid_pass1() is done, so that X86 features are known.
6555 7007 * - fpu_probe() is done, so that fp_save_mech is chosen.
6556 7008 */
6557 7009 void
6558 7010 xsave_setup_msr(cpu_t *cpu)
6559 7011 {
6560 7012 ASSERT(fp_save_mech == FP_XSAVE);
6561 7013 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
6562 7014
6563 7015 /* Enable OSXSAVE in CR4. */
6564 7016 setcr4(getcr4() | CR4_OSXSAVE);
6565 7017 /*
6566 7018 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
6567 7019 * correct value.
6568 7020 */
6569 7021 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
6570 7022 setup_xfem();
6571 7023 }
6572 7024
6573 7025 /*
6574 7026 * Starting with the Westmere processor the local
6575 7027 * APIC timer will continue running in all C-states,
6576 7028 * including the deepest C-states.
6577 7029 */
6578 7030 int
6579 7031 cpuid_arat_supported(void)
6580 7032 {
6581 7033 struct cpuid_info *cpi;
6582 7034 struct cpuid_regs regs;
6583 7035
6584 7036 ASSERT(cpuid_checkpass(CPU, 1));
6585 7037 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
6586 7038
6587 7039 cpi = CPU->cpu_m.mcpu_cpi;
6588 7040
6589 7041 switch (cpi->cpi_vendor) {
6590 7042 case X86_VENDOR_Intel:
6591 7043 /*
6592 7044 * Always-running Local APIC Timer is
6593 7045 * indicated by CPUID.6.EAX[2].
6594 7046 */
6595 7047 if (cpi->cpi_maxeax >= 6) {
6596 7048 regs.cp_eax = 6;
6597 7049 (void) cpuid_insn(NULL, &regs);
6598 7050 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
6599 7051 } else {
6600 7052 return (0);
6601 7053 }
6602 7054 default:
6603 7055 return (0);
6604 7056 }
6605 7057 }
6606 7058
6607 7059 /*
6608 7060 * Check support for Intel ENERGY_PERF_BIAS feature
6609 7061 */
6610 7062 int
6611 7063 cpuid_iepb_supported(struct cpu *cp)
6612 7064 {
6613 7065 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
6614 7066 struct cpuid_regs regs;
6615 7067
6616 7068 ASSERT(cpuid_checkpass(cp, 1));
6617 7069
6618 7070 if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
6619 7071 !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
6620 7072 return (0);
6621 7073 }
6622 7074
6623 7075 /*
6624 7076 * Intel ENERGY_PERF_BIAS MSR is indicated by
6625 7077 * capability bit CPUID.6.ECX.3
6626 7078 */
6627 7079 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
6628 7080 return (0);
6629 7081
6630 7082 regs.cp_eax = 0x6;
6631 7083 (void) cpuid_insn(NULL, &regs);
6632 7084 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
6633 7085 }
6634 7086
6635 7087 /*
6636 7088 * Check support for TSC deadline timer
6637 7089 *
6638 7090 * The TSC deadline timer provides a superior software programming
6639 7091 * model over the local APIC timer, one that eliminates "time drifts".
6640 7092 * Instead of specifying a relative time, software specifies an
6641 7093 * absolute time as the target at which the processor should
6642 7094 * generate a timer event.
6643 7095 */
6644 7096 int
6645 7097 cpuid_deadline_tsc_supported(void)
6646 7098 {
6647 7099 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
6648 7100 struct cpuid_regs regs;
6649 7101
6650 7102 ASSERT(cpuid_checkpass(CPU, 1));
6651 7103 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
6652 7104
6653 7105 switch (cpi->cpi_vendor) {
6654 7106 case X86_VENDOR_Intel:
6655 7107 if (cpi->cpi_maxeax >= 1) {
6656 7108 regs.cp_eax = 1;
6657 7109 (void) cpuid_insn(NULL, &regs);
6658 7110 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
6659 7111 } else {
6660 7112 return (0);
6661 7113 }
6662 7114 default:
6663 7115 return (0);
6664 7116 }
6665 7117 }
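
Purely as an illustration of the absolute-time model described above (not part of this change, and a kernel-context sketch rather than stand-alone code): with the local APIC timer already configured for TSC-deadline mode, arming a one-shot event boils down to a single MSR write of the target TSC value. 0x6e0 is the architectural IA32_TSC_DEADLINE MSR; the cycle offset assumes a roughly 2 GHz TSC and is a made-up figure.

	/*
	 * Illustrative sketch only: assumes the local APIC timer is in
	 * TSC-deadline mode; 2,000,000 cycles is ~1ms on a 2 GHz TSC.
	 */
	static void
	tsc_deadline_example(void)
	{
		uint64_t target = tsc_read() + 2000000;

		wrmsr(0x6e0, target);	/* fires once the TSC passes target */
	}
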
6666 7118
6667 7119 #if defined(__amd64) && !defined(__xpv)
6668 7120 /*
6669 7121 * Patch in versions of bcopy for high performance Intel Nhm processors
6670 7122 * and later...
6671 7123 */
6672 7124 void
6673 7125 patch_memops(uint_t vendor)
6674 7126 {
6675 7127 size_t cnt, i;
6676 7128 caddr_t to, from;
6677 7129
6678 7130 if ((vendor == X86_VENDOR_Intel) &&
6679 7131 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
6680 7132 cnt = &bcopy_patch_end - &bcopy_patch_start;
6681 7133 to = &bcopy_ck_size;
6682 7134 from = &bcopy_patch_start;
6683 7135 for (i = 0; i < cnt; i++) {
6684 7136 *to++ = *from++;
6685 7137 }
6686 7138 }
6687 7139 }
6688 7140 #endif /* __amd64 && !__xpv */
6689 7141
6690 7142 /*
6691 7143 * We're being asked to tell the system how many bits are required to represent
6692 7144 * the various thread and strand IDs. While it's tempting to derive this based
6693 7145 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
6694 7146 * correct. Instead, this needs to be based on the number of bits that the APIC
6695 7147 * allows for these different configurations. We only update these to a larger
6696 7148 * value if we find one.
6697 7149 */
6698 7150 void
6699 7151 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
6700 7152 {
6701 7153 struct cpuid_info *cpi;
6702 7154
6703 7155 VERIFY(cpuid_checkpass(CPU, 1));
6704 7156 cpi = cpu->cpu_m.mcpu_cpi;
6705 7157
6706 7158 if (cpi->cpi_ncore_bits > *core_nbits) {
6707 7159 *core_nbits = cpi->cpi_ncore_bits;
6708 7160 }
6709 7161
6710 7162 if (cpi->cpi_nthread_bits > *strand_nbits) {
6711 7163 *strand_nbits = cpi->cpi_nthread_bits;
6712 7164 }
6713 7165 }
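
For example (the counts are assumed for illustration): a package whose APIC numbering reserves one bit for the strand within a core and three bits for the core within the package would report cpi_nthread_bits = 1 and cpi_ncore_bits = 3, and the function above would raise *strand_nbits and *core_nbits to those values only if they were currently smaller.
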
6714 7166
6715 7167 void
6716 7168 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
6717 7169 {
6718 7170 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6719 7171 struct cpuid_regs cp;
6720 7172
6721 7173 /*
6722 7174 * Reread the CPUID portions that we need for various security
6723 7175 * information.
6724 7176 */
6725 7177 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
6726 7178 /*
6727 7179 * Check if we now have leaf 7 available to us.
6728 7180 */
6729 7181 if (cpi->cpi_maxeax < 7) {
6730 7182 bzero(&cp, sizeof (cp));
6731 7183 cp.cp_eax = 0;
6732 7184 cpi->cpi_maxeax = __cpuid_insn(&cp);
6733 7185 if (cpi->cpi_maxeax < 7)
6734 7186 return;
6735 7187 }
6736 7188
6737 7189 bzero(&cp, sizeof (cp));
6738 7190 cp.cp_eax = 7;
6739 7191 cp.cp_ecx = 0;
6740 7192 (void) __cpuid_insn(&cp);
6741 7193 cpi->cpi_std[7] = cp;
6742 7194 } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
6743 7195 /* No xcpuid support */
6744 7196 if (cpi->cpi_family < 5 ||
6745 7197 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
6746 7198 return;
6747 7199
6748 7200 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
6749 7201 bzero(&cp, sizeof (cp));
6750 7202 cp.cp_eax = CPUID_LEAF_EXT_0;
6751 7203 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
6752 7204 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
6753 7205 return;
6754 7206 }
6755 7207 }
6756 7208
6757 7209 bzero(&cp, sizeof (cp));
6758 7210 cp.cp_eax = CPUID_LEAF_EXT_8;
6759 7211 (void) __cpuid_insn(&cp);
6760 7212 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
6761 7213 cpi->cpi_extd[8] = cp;
6762 7214 } else {
6763 7215 /*
6764 7216 * Nothing to do here. Return an empty set which has already
6765 7217 * been zeroed for us.
6766 7218 */
6767 7219 return;
6768 7220 }
6769 7221 cpuid_scan_security(cpu, fset);
6770 7222 }
6771 7223
6772 7224 /* ARGSUSED */
6773 7225 static int
6774 7226 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
6775 7227 {
6776 7228 uchar_t *fset;
7229 + boolean_t first_pass = (boolean_t)arg1;
6777 7230
6778 7231 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7232 + if (first_pass && CPU->cpu_id != 0)
7233 + return (0);
7234 + if (!first_pass && CPU->cpu_id == 0)
7235 + return (0);
6779 7236 cpuid_pass_ucode(CPU, fset);
6780 7237
6781 7238 return (0);
6782 7239 }
6783 7240
6784 7241 /*
6785 7242 * After a microcode update where the version has changed, then we need to
6786 7243 * rescan CPUID. To do this we check every CPU to make sure that they have the
6787 7244 * same microcode. Then we perform a cross call to all such CPUs. It's the
6788 7245 * caller's job to make sure that no one else can end up doing an update while
6789 7246 * this is going on.
6790 7247 *
6791 7248 * We assume that the system is microcode capable if we're called.
6792 7249 */
6793 7250 void
6794 7251 cpuid_post_ucodeadm(void)
6795 7252 {
6796 7253 uint32_t rev;
6797 7254 int i;
6798 7255 struct cpu *cpu;
6799 7256 cpuset_t cpuset;
6800 7257 void *argdata;
6801 7258 uchar_t *f0;
6802 7259
6803 7260 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
6804 7261
6805 7262 mutex_enter(&cpu_lock);
6806 7263 cpu = cpu_get(0);
6807 7264 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
6808 7265 CPUSET_ONLY(cpuset, 0);
6809 7266 for (i = 1; i < max_ncpus; i++) {
6810 7267 if ((cpu = cpu_get(i)) == NULL)
6811 7268 continue;
6812 7269
6813 7270 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
6814 7271 panic("post microcode update CPU %d has differing "
6815 7272 "microcode revision (%u) from CPU 0 (%u)",
6816 7273 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
6817 7274 }
6818 7275 CPUSET_ADD(cpuset, i);
6819 7276 }
6820 7277
7278 + /*
7279 + * We do the cross calls in two passes. The first pass is only for the
7280 + * boot CPU. The second pass is for all of the other CPUs. This allows
7281 + * the boot CPU to go through and change behavior related to patching or
7282 + * whether or not Enhanced IBRS needs to be enabled, and then allows all
7283 + * other CPUs to follow suit.
7284 + */
6821 7285 kpreempt_disable();
6822 - xc_sync((xc_arg_t)argdata, 0, 0, CPUSET2BV(cpuset),
7286 + xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
6823 7287 cpuid_post_ucodeadm_xc);
7288 + xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7289 + cpuid_post_ucodeadm_xc);
6824 7290 kpreempt_enable();
6825 7291
6826 7292 /*
6827 7293 * OK, now look at each CPU and see if their feature sets are equal.
6828 7294 */
6829 7295 f0 = argdata;
6830 7296 for (i = 1; i < max_ncpus; i++) {
6831 7297 uchar_t *fset;
6832 7298 if (!CPU_IN_SET(cpuset, i))
6833 7299 continue;
6834 7300
6835 7301 fset = (uchar_t *)((uintptr_t)argdata +
6836 7302 sizeof (x86_featureset) * i);
6837 7303
6838 7304 if (!compare_x86_featureset(f0, fset)) {
6839 7305 panic("Post microcode update CPU %d has "
6840 7306 "differing security feature (%p) set from CPU 0 "
6841 7307 "(%p), not appending to feature set", i,
6842 7308 (void *)fset, (void *)f0);
6843 7309 }
6844 7310 }
6845 7311
6846 7312 mutex_exit(&cpu_lock);
6847 7313
6848 7314 for (i = 0; i < NUM_X86_FEATURES; i++) {
6849 7315 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
6850 7316 x86_feature_names[i]);
6851 7317 if (is_x86_feature(f0, i)) {
6852 7318 add_x86_feature(x86_featureset, i);
6853 7319 }
6854 7320 }
6855 7321 kmem_free(argdata, sizeof (x86_featureset) * NCPU);
6856 7322 }