11859 need swapgs mitigation
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/i86pc/os/cpuid.c
+++ new/usr/src/uts/i86pc/os/cpuid.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 26 */
27 27 /*
28 28 * Copyright (c) 2010, Intel Corporation.
29 29 * All rights reserved.
30 30 */
31 31 /*
32 32 * Portions Copyright 2009 Advanced Micro Devices, Inc.
33 33 */
34 34 /*
35 35 * Copyright 2019 Joyent, Inc.
36 36 */
37 37
38 38 /*
39 39 * CPU Identification logic
40 40 *
41 41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 42 * with the identification of CPUs, their features, and their topologies. More
43 43 * specifically, this file helps drive the following:
44 44 *
45 45 * 1. Enumeration of features of the processor which are used by the kernel to
46 46 * determine what features to enable or disable. These may be instruction set
47 47 * enhancements or features that we use.
48 48 *
49 49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 50 * will be told about through the auxiliary vector.
51 51 *
52 52 * 3. Understanding the physical topology of the CPU such as the number of
53 53 * caches, how many cores it has, whether or not it supports symmetric
54 54 * multi-processing (SMT), etc.
55 55 *
56 56 * ------------------------
57 57 * CPUID History and Basics
58 58 * ------------------------
59 59 *
60 60 * The cpuid instruction was added by Intel roughly around the time that the
61 61 * original Pentium was introduced. The purpose of cpuid was to tell in a
62 62 * programmatic fashion information about the CPU that previously was guessed
63 63 * at. For example, an important part of cpuid is that we can know what
64 64 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 65 * #UD, so this method allows a program (whether a user program or the kernel)
66 66 * to determine what exists without crashing or getting a SIGILL. Of course,
67 67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 68 * name shows up first in cpuid for a reason.
69 69 *
70 70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71 71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 72 * its own meaning. The different leaves are broken down into different regions:
73 73 *
74 74 * [ 0, 7fffffff ] This region is called the 'basic'
75 75 * region. This region is generally defined
76 76 * by Intel, though some of the original
77 77 * portions have different meanings based
78 78 * on the manufacturer. These days, Intel
79 79 * adds most new features to this region.
80 80 * AMD adds non-Intel compatible
81 81 * information in the third, extended
82 82 * region. Intel uses this for everything
83 83 * including ISA extensions, CPU
84 84 * features, cache information, topology,
85 85 * and more.
86 86 *
87 87 * There is a hole carved out of this
88 88 * region which is reserved for
89 89 * hypervisors.
90 90 *
91 91 * [ 40000000, 4fffffff ] This region, which is found in the
92 92 * middle of the previous region, is
93 93 * explicitly promised to never be used by
94 94 * CPUs. Instead, it is used by hypervisors
95 95 * to communicate information about
96 96 * themselves to the operating system. The
97 97 * values and details are unique for each
98 98 * hypervisor.
99 99 *
100 100 * [ 80000000, ffffffff ] This region is called the 'extended'
101 101 * region. Some of the low leaves mirror
102 102 * parts of the basic leaves. This region
103 103 * has generally been used by AMD for
104 104 * various extensions. For example, AMD-
105 105 * specific information about caches,
106 106 * features, and topology are found in this
107 107 * region.
108 108 *
109 109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110 110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111 111 * the ranges, one of the primary things returned is the maximum valid leaf in
112 112 * that range. This allows for discovery of what range of CPUID is valid.
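As a concrete illustration of the mechanics described above, the following hedged sketch issues cpuid from C and reports the maximum valid basic and extended leaves. It is written as a standalone userland-style program rather than using this file's own internal wrappers; the helper name cpuid_query is made up for this sketch.

    #include <stdint.h>
    #include <stdio.h>

    /* Load the leaf into %eax and the sub-leaf into %ecx, then issue cpuid. */
    static void
    cpuid_query(uint32_t leaf, uint32_t subleaf,
        uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
    {
            __asm__ __volatile__("cpuid"
                : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                : "0" (leaf), "2" (subleaf));
    }

    int
    main(void)
    {
            uint32_t eax, ebx, ecx, edx;

            /* Leaf 0 returns the maximum valid basic leaf in %eax. */
            cpuid_query(0, 0, &eax, &ebx, &ecx, &edx);
            (void) printf("max basic leaf:    0x%x\n", eax);

            /* Leaf 0x80000000 returns the maximum valid extended leaf. */
            cpuid_query(0x80000000, 0, &eax, &ebx, &ecx, &edx);
            (void) printf("max extended leaf: 0x%x\n", eax);
            return (0);
    }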
113 113 *
114 114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 120 * an invalid extended leaf will return the information for leaf 3.
121 121 *
122 122 * Some leaves are broken down into sub-leaves. This means that the value
123 123 * depends on both the leaf asked for in %eax and a secondary register. For
124 124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 125 * additional information. Or when getting topology information in leaf 0xb, the
126 126 * initial value in %ecx changes which level of the topology that you are
127 127 * getting information about.
128 128 *
129 129 * cpuid values are always kept to 32 bits regardless of whether or not the
130 130 * program is in 64-bit mode. When executing in 64-bit mode, the upper
131 131 * 32 bits of the register are always set to zero so that way the values are the
132 132 * same regardless of execution mode.
133 133 *
134 134 * ----------------------
135 135 * Identifying Processors
136 136 * ----------------------
137 137 *
138 138 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 139 * 0. Leaf 0 contains the processor's vendor information. This is done by
140 140 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141 141 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
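To make the vendor-string step concrete, here is a hedged continuation of the hypothetical cpuid_query() sketch from above; the 12 bytes are conventionally assembled in the order %ebx, %edx, %ecx.

    #include <string.h>

    /* Build the NUL-terminated vendor string from leaf 0. */
    static void
    cpuid_vendor(char *buf)     /* buf must hold at least 13 bytes */
    {
            uint32_t eax, ebx, ecx, edx;

            cpuid_query(0, 0, &eax, &ebx, &ecx, &edx);
            (void) memcpy(buf, &ebx, 4);
            (void) memcpy(buf + 4, &edx, 4);
            (void) memcpy(buf + 8, &ecx, 4);
            buf[12] = '\0';     /* e.g. "GenuineIntel" or "AuthenticAMD" */
    }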
142 142 *
143 143 * From there, a processor is identified by a combination of three different
144 144 * values:
145 145 *
146 146 * 1. Family
147 147 * 2. Model
148 148 * 3. Stepping
149 149 *
150 150 * Each vendor uses the family and model to uniquely identify a processor. The
151 151 * way that family and model are changed depends on the vendor. For example,
152 152 * Intel has been using family 0x6 for almost all of their processors since the
153 153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 154 * identify the exact processor. Different models are often used for the client
155 155 * (consumer) and server parts. Even though each processor often has major
156 156 * architectural differences, they still are considered the same family by
157 157 * Intel.
158 158 *
159 159 * On the other hand, each major AMD architecture generally has its own family.
160 160 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
161 161 * the model number is used to help identify specific processors.
162 162 *
163 163 * The stepping is used to refer to a revision of a specific microprocessor. The
164 164 * term comes from equipment used to produce masks that are used to create
165 165 * integrated circuits.
166 166 *
167 167 * The information is present in leaf 1, %eax. In technical documentation you
168 168 * will see the terms extended model and extended family. The original family,
169 169 * model, and stepping fields were each 4 bits wide. If the value in the family
170 170 * field is 0xf, then one must consult the extended family and extended model
171 171 * fields, which take previously reserved bits and allow for a larger number of
172 172 * families and models; the extended family is added to the base value of 0xf.
173 173 *
174 174 * When we process this information, we store the full family, model, and
175 175 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176 176 * cpi_step, respectively. Whenever you are performing comparisons with the
177 177 * family, model, and stepping, you should use these members and not the raw
178 178 * values from cpuid. If you must use the raw values from cpuid directly, you
179 179 * must make sure that you add the extended model and family to the base model
180 180 * and family.
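The following hedged sketch shows the conventional decode of leaf 1 %eax into the full family, model, and stepping. The bit positions are the commonly documented ones and the function name is invented for illustration; the real logic in this file also copes with additional vendor-specific details.

    /*
     * Leaf 1 %eax layout (conventional): stepping in bits 3:0, model in 7:4,
     * family in 11:8, extended model in 19:16, extended family in 27:20.
     */
    static void
    decode_fms(uint32_t eax, uint32_t *family, uint32_t *model, uint32_t *stepping)
    {
            uint32_t fam = (eax >> 8) & 0xf;
            uint32_t mod = (eax >> 4) & 0xf;

            *stepping = eax & 0xf;

            /* Intel also extends the model for family 0x6; AMD only for 0xf. */
            if (fam == 0x6 || fam == 0xf)
                    mod |= ((eax >> 16) & 0xf) << 4;

            /* The extended family is added to the base family of 0xf. */
            if (fam == 0xf)
                    fam += (eax >> 20) & 0xff;

            *family = fam;
            *model = mod;
    }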
181 181 *
182 182 * In general, we do not use information about the family, model, and stepping
183 183 * to determine whether or not a feature is present; that is generally driven by
184 184 * specific leaves. However, when something we care about on the processor is
185 185 * not considered 'architectural' meaning that it is specific to a set of
186 186 * processors and not promised in the architecture model to be consistent from
187 187 * generation to generation, then we will fall back on this information. The
188 188 * most common cases where this comes up is when we have to workaround errata in
189 189 * the processor, are dealing with processor-specific features such as CPU
190 190 * performance counters, or we want to provide additional information for things
191 191 * such as fault management.
192 192 *
193 193 * While processors also do have a brand string, which is the name that people
194 194 * are familiar with when buying the processor, they are not meant for
195 195 * programmatic consumption. That is what the family, model, and stepping are
196 196 * for.
197 197 *
198 198 * ------------
199 199 * CPUID Passes
200 200 * ------------
201 201 *
202 202 * As part of performing feature detection, we break this into several different
203 203 * passes. The passes are as follows:
204 204 *
205 205 * Pass 0 This is a primordial pass done in locore.s to deal with
206 206 * Cyrix CPUs that don't support cpuid. The reality is that
207 207 * we likely don't run on them any more, but there is still
208 208 * logic for handling them.
209 209 *
210 210 * Pass 1 This is the primary pass and is responsible for doing a
211 211 * large number of different things:
212 212 *
213 213 * 1. Determining which vendor manufactured the CPU and
214 214 * the family, model, and stepping information.
215 215 *
216 216 * 2. Gathering a large number of feature flags to
217 217 * determine which features the CPU supports and which
218 218 * indicate things that we need to do other work in the OS
219 219 * to enable. Features detected this way are added to the
220 220 * x86_featureset which can be queried to
221 221 * determine what we should do. This includes processing
222 222 * all of the basic and extended CPU features that we care
223 223 * about.
224 224 *
225 225 * 3. Determining the CPU's topology. This includes
226 226 * information about how many cores and threads are present
227 227 * in the package. It also is responsible for figuring out
228 228 * which logical CPUs are potentially part of the same core
229 229 * and what other resources they might share. For more
230 230 * information see the 'Topology' section.
231 231 *
232 232 * 4. Determining the set of CPU security-specific features
233 233 * that we need to worry about and determining the
234 234 * appropriate set of workarounds.
235 235 *
236 236 * Pass 1 on the boot CPU occurs before KMDB is started.
237 237 *
238 238 * Pass 2 The second pass is done after startup(). Here, we check
239 239 * other miscellaneous features. Most of this is gathering
240 240 * additional basic and extended features that we'll use in
241 241 * later passes or for debugging support.
242 242 *
243 243 * Pass 3 The third pass occurs after the kernel memory allocator
244 244 * has been fully initialized. This gathers information
245 245 * where we might need dynamic memory available for our
246 246 * uses. This includes several varying width leaves that
247 247 * have cache information and the processor's brand string.
248 248 *
249 249 * Pass 4 The fourth and final normal pass is performed after the
250 250 * kernel has brought most everything online. This is
251 251 * invoked from post_startup(). In this pass, we go through
252 252 * the set of features that we have enabled and turn that
253 253 * into the hardware auxiliary vector features that
254 254 * userland receives. This is used by userland, primarily
255 255 * by the run-time link-editor (RTLD), though userland
256 256 * software could also refer to it directly.
257 257 *
258 258 * Microcode After a microcode update, we do a selective rescan of
259 259 * the cpuid leaves to determine what features have
260 260 * changed. Microcode updates can provide more details
261 261 * about security related features to deal with issues like
262 262 * Spectre and L1TF. On occasion, vendors have violated
263 263 * their contract and removed bits. However, we don't try
264 264 * to detect that because that puts us in a situation that
265 265 * we really can't deal with. As such, the only things we
266 266 * rescan are security related features today. See
267 267 * cpuid_pass_ucode().
268 268 *
269 269 * All of the passes (except pass 0) are run on all CPUs. However, for the most
270 270 * part we only care about what the boot CPU says about this information and use
271 271 * the other CPUs as a rough guide to sanity check that we have the same feature
272 272 * set.
273 273 *
274 274 * We do not support running multiple logical CPUs with different, let alone
275 275 * disjoint, feature sets.
276 276 *
277 277 * ------------------
278 278 * Processor Topology
279 279 * ------------------
280 280 *
281 281 * One of the important things that we need to do is to understand the topology
282 282 * of the underlying processor. When we say topology in this case, we're trying
283 283 * to understand the relationship between the logical CPUs that the operating
284 284 * system sees and the underlying physical layout. Different logical CPUs may
285 285 * share different resources which can have important consequences for the
286 286 * performance of the system. For example, they may share caches, execution
287 287 * units, and more.
288 288 *
289 289 * The topology of the processor changes from generation to generation and
290 290 * vendor to vendor. Along with that, different vendors use different
291 291 * terminology, and the operating system itself uses occasionally overlapping
292 292 * terminology. It's important to understand what this topology looks like so
293 293 * one can understand the different things that we try to calculate and
294 294 * determine.
295 295 *
296 296 * To get started, let's talk about a little bit of terminology that we've used
297 297 * so far, is used throughout this file, and is fairly generic across multiple
298 298 * vendors:
299 299 *
300 300 * CPU
301 301 * A central processing unit (CPU) refers to a logical and/or virtual
302 302 * entity that the operating system can execute instructions on. The
303 303 * underlying resources for this CPU may be shared between multiple
304 304 * entities; however, to the operating system it is a discrete unit.
305 305 *
306 306 * PROCESSOR and PACKAGE
307 307 *
308 308 * Generally, when we use the term 'processor' on its own, we are referring
309 309 * to the physical entity that one buys and plugs into a board. However,
310 310 * because processor has been overloaded and one might see it used to mean
311 311 * multiple different levels, we will instead use the term 'package' for
312 312 * the rest of this file. The term package comes from the electrical
313 313 * engineering side and refers to the physical entity that encloses the
314 314 * electronics inside. Strictly speaking the package can contain more than
315 315 * just the CPU, for example, on many processors it may also have what's
316 316 * called an 'integrated graphical processing unit (GPU)'. Because the
317 317 * package can encapsulate multiple units, it is the largest physical unit
318 318 * that we refer to.
319 319 *
320 320 * SOCKET
321 321 *
322 322 * A socket refers to a unit on a system board (generally the motherboard)
323 323 * that can receive a package. A single package, or processor, is plugged
324 324 * into a single socket. A system may have multiple sockets. Often times,
325 325 * the term socket is used interchangeably with package and refers to the
326 326 * electrical component that has been plugged in, and not the receptacle itself.
327 327 *
328 328 * CORE
329 329 *
330 330 * A core refers to the physical instantiation of a CPU, generally, with a
331 331 * full set of hardware resources available to it. A package may contain
332 332 * multiple cores inside of it or it may just have a single one. A
333 333 * processor with more than one core is often referred to as 'multi-core'.
334 334 * In illumos, we will use the feature X86FSET_CMP to refer to a system
335 335 * that has 'multi-core' processors.
336 336 *
337 337 * A core may expose a single logical CPU to the operating system, or it
338 338 * may expose multiple CPUs, which we call threads, defined below.
339 339 *
340 340 * Some resources may still be shared by cores in the same package. For
341 341 * example, many processors will share the level 3 cache between cores.
342 342 * Some AMD generations share hardware resources between cores. For more
343 343 * information on that see the section 'AMD Topology'.
344 344 *
345 345 * THREAD and STRAND
346 346 *
347 347 * In this file, generally a thread refers to a hardware resource and not
348 348 * the operating system's logical abstraction. A thread is always exposed
349 349 * as an independent logical CPU to the operating system. A thread belongs
350 350 * to a specific core. A core may have more than one thread. When that is
351 351 * the case, the threads that are part of the same core are often referred
352 352 * to as 'siblings'.
353 353 *
354 354 * When multiple threads exist, this is generally referred to as
355 355 * simultaneous multi-threading (SMT). When Intel introduced this in their
356 356 * processors they called it hyper-threading (HT). When multiple threads
357 357 * are active in a core, they split the resources of the core. For example,
358 358 * two threads may share the same set of hardware execution units.
359 359 *
360 360 * The operating system often uses the term 'strand' to refer to a thread.
361 361 * This helps disambiguate it from the software concept.
362 362 *
363 363 * CHIP
364 364 *
365 365 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
366 366 * base meaning, it is used to refer to a single integrated circuit, which
367 367 * may or may not be the only thing in the package. In illumos, when you
368 368 * see the term 'chip' it is almost always referring to the same thing as
369 369 * the 'package'. However, many vendors may use chip to refer to one of
370 370 * many integrated circuits that have been placed in the package. As an
371 371 * example, see the subsequent definition.
372 372 *
373 373 * To try and keep things consistent, we will only use chip when referring
374 374 * to the entire integrated circuit package, with the exception of the
375 375 * definition of multi-chip module (because it is in the name) and use the
376 376 * term 'die' when we want the more general, potential sub-component
377 377 * definition.
378 378 *
379 379 * DIE
380 380 *
381 381 * A die refers to an integrated circuit. Inside of the package there may
382 382 * be a single die or multiple dies. This is sometimes called a 'chip' in
383 383 * vendor's parlance, but in this file, we use the term die to refer to a
384 384 * subcomponent.
385 385 *
386 386 * MULTI-CHIP MODULE
387 387 *
388 388 * A multi-chip module (MCM) refers to putting multiple distinct chips that
389 389 * are connected together in the same package. When a multi-chip design is
390 390 * used, generally each chip is manufactured independently and then joined
391 391 * together in the package. For example, on AMD's Zen microarchitecture
392 392 * (family 0x17), the package contains several dies (the second meaning of
393 393 * chip from above) that are connected together.
394 394 *
395 395 * CACHE
396 396 *
397 397 * A cache is a part of the processor that maintains copies of recently
398 398 * accessed memory. Caches are split into levels and then into types.
399 399 * Commonly there are one to three levels, called level one, two, and
400 400 * three. The lower the level, the smaller it is, the closer it is to the
401 401 * execution units of the CPU, and the faster it is to access. The layout
402 402 * and design of the cache come in many different flavors, consult other
403 403 * resources for a discussion of those.
404 404 *
405 405 * Caches are generally split into two types, the instruction and data
406 406 * cache. The caches contain what their names suggest, the instruction
407 407 * cache has executable program text, while the data cache has all other
408 408 * memory that the processor accesses. As of this writing, data is kept
409 409 * coherent between all of the caches on x86, so if one modifies program
410 410 * text before it is executed, that will be in the data cache, and the
411 411 * instruction cache will be synchronized with that change when the
412 412 * processor actually executes those instructions. This coherency also
413 413 * covers the fact that data could show up in multiple caches.
414 414 *
415 415 * Generally, the lowest level caches are specific to a core. However, the
416 416 * last level cache is shared between some number of cores. The number of
417 417 * CPUs sharing this last level cache is important. This has implications
418 418 * for the choices that the scheduler makes, as accessing memory that might
419 419 * be in a remote cache after thread migration can be quite expensive.
420 420 *
421 421 * Sometimes, the word cache is abbreviated with a '$', because in US
422 422 * English the word cache is pronounced the same as cash. So L1D$ refers to
423 423 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
424 424 * in the rest of this theory statement for clarity.
425 425 *
426 426 * MEMORY CONTROLLER
427 427 *
428 428 * The memory controller is a component that provides access to DRAM. Each
429 429 * memory controller can access a set number of DRAM channels. Each channel
430 430 * can have a number of DIMMs (sticks of memory) associated with it. A
431 431 * given package may have more than one memory controller. The association
432 432 * of the memory controller to a group of cores is important as it is
433 433 * cheaper to access memory on the controller that you are associated with.
434 434 *
435 435 * NUMA
436 436 *
437 437 * NUMA, or non-uniform memory access, describes a way that systems are
438 438 * built. On x86, any processor core can address all of the memory in the
439 439 * system. However, when using multiple sockets or possibly within a
440 440 * multi-chip module, some of that memory is physically closer and some of
441 441 * it is further. Memory that is further away is more expensive to access.
442 442 * Consider the following image of multiple sockets with memory:
443 443 *
444 444 * +--------+ +--------+
445 445 * | DIMM A | +----------+ +----------+ | DIMM D |
446 446 * +--------+-+ | | | | +-+------+-+
447 447 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
448 448 * +--------+-+ | | | | +-+------+-+
449 449 * | DIMM C | +----------+ +----------+ | DIMM F |
450 450 * +--------+ +--------+
451 451 *
452 452 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
453 453 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
454 454 * access DIMMs A-C and more expensive to access D-F as it has to go
455 455 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
456 456 * D-F are cheaper than A-C. While the socket form is the most common, when
457 457 * using multi-chip modules, this can also sometimes occur. For another
458 458 * example of this that's more involved, see the AMD topology section.
459 459 *
460 460 *
461 461 * Intel Topology
462 462 * --------------
463 463 *
464 464 * Most Intel processors since Nehalem (as of this writing the current gen
465 465 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
466 466 * the package is a single monolithic die. MCMs currently aren't used. Most
467 467 * parts have three levels of caches, with the L3 cache being shared between
468 468 * all of the cores on the package. The L1/L2 cache is generally specific to
469 469 * an individual core. The following image shows at a simplified level what
470 470 * this looks like. The memory controller is commonly part of something called
471 471 * the 'Uncore', which used to be separate physical chips that were not a part of
472 472 * the package, but are now part of the same chip.
473 473 *
474 474 * +-----------------------------------------------------------------------+
475 475 * | Package |
476 476 * | +-------------------+ +-------------------+ +-------------------+ |
477 477 * | | Core | | Core | | Core | |
478 478 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
479 479 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
480 480 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
481 481 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
482 482 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
483 483 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
484 484 * | | +--------------+ | | +--------------+ | | +--------------+ | |
485 485 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
486 486 * | | +--------------+ | | +--------------+ | | +--------------+ | |
487 487 * | +-------------------+ +-------------------+ +-------------------+ |
488 488 * | +-------------------------------------------------------------------+ |
489 489 * | | Shared L3 Cache | |
490 490 * | +-------------------------------------------------------------------+ |
491 491 * | +-------------------------------------------------------------------+ |
492 492 * | | Memory Controller | |
493 493 * | +-------------------------------------------------------------------+ |
494 494 * +-----------------------------------------------------------------------+
495 495 *
496 496 * A side effect of this current architecture is that what we care about from a
497 497 * scheduling and topology perspective is simplified. In general we care about
498 498 * understanding which logical CPUs are part of the same core and socket.
499 499 *
500 500 * To determine the relationship between threads and cores, Intel initially used
501 501 * the identifier in the advanced programmable interrupt controller (APIC). They
502 502 * also added cpuid leaf 4 to give additional information about the number of
503 503 * threads and CPUs in the processor. With the addition of x2apic (which
504 504 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
505 505 * additional cpuid topology leaf 0xB was added.
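As a hedged sketch of how leaf 0xB is typically walked (reusing the hypothetical cpuid_query() helper from earlier): each sub-leaf passed in %ecx describes one topology level, with the shift count in %eax[4:0], the level type in %ecx[15:8] (1 for SMT, 2 for core), and the x2APIC ID in %edx. The actual code in this file is more involved than this.

    static void
    topo_shifts(uint32_t *smt_shift, uint32_t *core_shift)
    {
            uint32_t eax, ebx, ecx, edx, level, type;

            *smt_shift = *core_shift = 0;
            for (level = 0; ; level++) {
                    cpuid_query(0xb, level, &eax, &ebx, &ecx, &edx);
                    type = (ecx >> 8) & 0xff;
                    if (type == 0)          /* no further levels */
                            break;
                    if (type == 1)          /* SMT (thread) level */
                            *smt_shift = eax & 0x1f;
                    else if (type == 2)     /* core level */
                            *core_shift = eax & 0x1f;
            }
    }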
506 506 *
507 507 * AMD Topology
508 508 * ------------
509 509 *
510 510 * When discussing AMD topology, we want to break this into three distinct
511 511 * generations of topology. There's the basic topology that has been used in
512 512 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
513 513 * with family 0x15 (Bulldozer), and there's the topology that was introduced
514 514 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
515 515 * talking about.
516 516 *
517 517 * Until the introduction of family 0x17 (Zen), AMD did not implement something
518 518 * that they considered SMT. Whether or not the AMD processors have SMT
519 519 * influences many things including scheduling and reliability, availability,
520 520 * and serviceability (RAS) features.
521 521 *
522 522 * NODE
523 523 *
524 524 * AMD uses the term node to refer to a die that contains a number of cores
525 525 * and I/O resources. Depending on the processor family and model, more
526 526 * than one node can be present in the package. When there is more than one
527 527 * node this indicates a multi-chip module. Usually each node has its own
528 528 * access to memory and I/O devices. This is important and generally
529 529 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
530 530 * result, we track this relationship in the operating system.
531 531 *
532 532 * In processors with an L3 cache, the L3 cache is generally shared across
533 533 * the entire node, though the way this is carved up varies from generation
534 534 * to generation.
535 535 *
536 536 * BULLDOZER
537 537 *
538 538 * Starting with the Bulldozer family (0x15) and continuing until the
539 539 * introduction of the Zen microarchitecture, AMD introduced the idea of a
540 540 * compute unit. In a compute unit, two traditional cores share a number of
541 541 * hardware resources. Critically, they share the FPU, L1 instruction
542 542 * cache, and the L2 cache. Several compute units were then combined inside
543 543 * of a single node. Because the integer execution units, L1 data cache,
544 544 * and some other resources were not shared between the cores, AMD never
545 545 * considered this to be SMT.
546 546 *
547 547 * ZEN
548 548 *
549 549 * The Zen family (0x17) uses a multi-chip module (MCM) design; the module
550 550 * is called Zeppelin. These modules are similar to the idea of nodes used
551 551 * previously. Each of these nodes has two DRAM channels which all of the
552 552 * cores in the node can access uniformly. These nodes are linked together
553 553 * in the package, creating a NUMA environment.
554 554 *
555 555 * The Zeppelin die itself contains two different 'core complexes'. Each
556 556 * core complex consists of four cores which each have two threads, for a
557 557 * total of 8 logical CPUs per complex. Unlike other generations,
558 558 * where all the logical CPUs in a given node share the L3 cache, here each
559 559 * core complex has its own shared L3 cache.
560 560 *
561 561 * A further thing that we need to consider is that in some configurations,
562 562 * particularly with the Threadripper line of processors, not every die
563 563 * actually has its memory controllers wired up to actual memory channels.
564 564 * This means that some cores have memory attached to them and others
565 565 * don't.
566 566 *
567 567 * To put Zen in perspective, consider the following images:
568 568 *
569 569 * +--------------------------------------------------------+
570 570 * | Core Complex |
571 571 * | +-------------------+ +-------------------+ +---+ |
572 572 * | | Core +----+ | | Core +----+ | | | |
573 573 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
574 574 * | | | Thread | +----+ | | | Thread | +----+ | | | |
575 575 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
576 576 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
577 577 * | | +--------+ +--+ | | +--------+ +--+ | | | |
578 578 * | +-------------------+ +-------------------+ | C | |
579 579 * | +-------------------+ +-------------------+ | a | |
580 580 * | | Core +----+ | | Core +----+ | | c | |
581 581 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
582 582 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
583 583 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
584 584 * | | | Thread | |L1| | | | Thread | |L1| | | | |
585 585 * | | +--------+ +--+ | | +--------+ +--+ | | | |
586 586 * | +-------------------+ +-------------------+ +---+ |
587 587 * | |
588 588 * +--------------------------------------------------------+
589 589 *
590 590 * This first image represents a single Zen core complex that consists of four
591 591 * cores.
592 592 *
593 593 *
594 594 * +--------------------------------------------------------+
595 595 * | Zeppelin Die |
596 596 * | +--------------------------------------------------+ |
597 597 * | | I/O Units (PCIe, SATA, USB, etc.) | |
598 598 * | +--------------------------------------------------+ |
599 599 * | HH |
600 600 * | +-----------+ HH +-----------+ |
601 601 * | | | HH | | |
602 602 * | | Core |==========| Core | |
603 603 * | | Complex |==========| Complex | |
604 604 * | | | HH | | |
605 605 * | +-----------+ HH +-----------+ |
606 606 * | HH |
607 607 * | +--------------------------------------------------+ |
608 608 * | | Memory Controller | |
609 609 * | +--------------------------------------------------+ |
610 610 * | |
611 611 * +--------------------------------------------------------+
612 612 *
613 613 * This image represents a single Zeppelin Die. Note how both core complexes are
614 614 * connected to the same memory controller and I/O units. While each core
615 615 * complex has its own L3 cache as seen in the first image, they both have
616 616 * uniform access to memory.
617 617 *
618 618 *
619 619 * PP PP
620 620 * PP PP
621 621 * +----------PP---------------------PP---------+
622 622 * | PP PP |
623 623 * | +-----------+ +-----------+ |
624 624 * | | | | | |
625 625 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
626 626 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
627 627 * | | | | | |
628 628 * | +-----------+ooo ...+-----------+ |
629 629 * | HH ooo ... HH |
630 630 * | HH oo.. HH |
631 631 * | HH ..oo HH |
632 632 * | HH ... ooo HH |
633 633 * | +-----------+... ooo+-----------+ |
634 634 * | | | | | |
635 635 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
636 636 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
637 637 * | | | | | |
638 638 * | +-----------+ +-----------+ |
639 639 * | PP PP |
640 640 * +----------PP---------------------PP---------+
641 641 * PP PP
642 642 * PP PP
643 643 *
644 644 * This image represents a single Zen package. In this example, it has four
645 645 * Zeppelin dies, though some configurations only have a single one. In this
646 646 * example, each die is directly connected to the next. Also, each die is
647 647 * represented as being connected to memory by the 'M' character and connected
648 648 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
649 649 * die is made up of two core complexes, we have multiple different NUMA
650 650 * domains that we care about for these systems.
651 651 *
652 652 * CPUID LEAVES
653 653 *
654 654 * There are a few different CPUID leaves that we can use to try and understand
655 655 * the actual state of the world. As part of the introduction of family 0xf, AMD
656 656 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
657 657 * processors that are in the system. Because families before Zen didn't have
658 658 * SMT, this was always the number of cores that were in the system. However, it
659 659 * should always be thought of as the number of logical threads to be consistent
660 660 * between generations. In addition we also get the size of the APIC ID that is
661 661 * used to represent the number of logical processors. This is important for
662 662 * deriving topology information.
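A hedged sketch of the leaf 0x80000008 %ecx fields described above, based on my reading of the AMD documentation: bits 7:0 hold the number of logical processors minus one, and bits 15:12 hold the size (in bits) of the APIC ID space used for topology. The function name is illustrative only.

    static void
    decode_amd_leaf_80000008(uint32_t ecx, uint32_t *nlogical, uint32_t *apic_bits)
    {
            *nlogical = (ecx & 0xff) + 1;       /* NC field, stored minus one */
            *apic_bits = (ecx >> 12) & 0xf;     /* ApicIdCoreIdSize field */
    }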
663 663 *
664 664 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
665 665 * bit between Bulldozer and later families, but it is quite useful in
666 666 * determining the topology information. Because this information has changed
667 667 * across family generations, it's worth calling out what these mean
668 668 * explicitly. The registers have the following meanings:
669 669 *
670 670 * %eax The APIC ID. The entire register is defined to have a 32-bit
671 671 * APIC ID, even though on systems without x2apic support, it will
672 672 * be limited to 8 bits.
673 673 *
674 674 * %ebx On Bulldozer-era systems this contains information about the
675 675 * number of cores that are in a compute unit (cores that share
676 676 * resources). It also contains a per-package compute unit ID that
677 677 * identifies which compute unit the logical CPU is a part of.
678 678 *
679 679 * On Zen-era systems this instead contains the number of threads
680 680 * per core and the ID of the core that the logical CPU is a part
681 681 * of. Note, this ID is unique only to the package, it is not
682 682 * globally unique across the entire system.
683 683 *
684 684 * %ecx This contains the number of nodes that exist in the package. It
685 685 * also contains an ID that identifies which node the logical CPU
686 686 * is a part of.
687 687 *
688 688 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
689 689 * cache layout to determine which logical CPUs are sharing which caches.
690 690 *
691 691 * illumos Topology
692 692 * ----------------
693 693 *
694 694 * Based on the above we synthesize the information into several different
695 695 * variables that we store in the 'struct cpuid_info'. We'll go into the details
696 696 * of what each member is supposed to represent and their uniqueness. In
697 697 * general, there are two levels of uniqueness that we care about. We care about
698 698 * an ID that is globally unique. That means that it will be unique across all
699 699 * entities in the system. For example, the default logical CPU ID is globally
700 700 * unique. On the other hand, there is some information that we only care about
701 701 * being unique within the context of a single package / socket. Here are the
702 702 * variables that we keep track of and their meaning.
703 703 *
704 704 * Several of the values that are asking for an identifier, with the exception
705 705 * of cpi_apicid, are allowed to be synthetic.
706 706 *
707 707 *
708 708 * cpi_apicid
709 709 *
710 710 * This is the value of the CPU's APIC id. This should be the full 32-bit
711 711 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
712 712 * APIC ID. This value is globally unique between all logical CPUs across
713 713 * all packages. This is usually required by the APIC.
714 714 *
715 715 * cpi_chipid
716 716 *
717 717 * This value indicates the ID of the package that the logical CPU is a
718 718 * part of. This value is allowed to be synthetic. It is usually derived by
719 719 * taking the CPU's APIC ID and determining how many bits are used to
720 720 * represent CPU cores in the package. All logical CPUs that are part of
721 721 * the same package must have the same value.
722 722 *
723 723 * cpi_coreid
724 724 *
725 725 * This represents the ID of a CPU core. Two logical CPUs should only have
726 726 * the same cpi_coreid value if they are part of the same core. These
727 727 * values may be synthetic. On systems that support SMT, this value is
728 728 * usually derived from the APIC ID, otherwise it is often synthetic and
729 729 * just set to the value of the cpu_id in the cpu_t.
730 730 *
731 731 * cpi_pkgcoreid
732 732 *
733 733 * This is similar to the cpi_coreid in that logical CPUs that are part of
734 734 * the same core should have the same ID. The main difference is that these
735 735 * values are only required to be unique to a given socket.
736 736 *
737 737 * cpi_clogid
738 738 *
739 739 * This represents the logical ID of a logical CPU. This value should be
740 740 * unique within a given socket for each logical CPU. This is allowed to be
741 741 * synthetic, though it is usually based off of the CPU's apic ID. The
742 742 * broader system expects that logical CPUs that are part of the same
743 743 * core have contiguous numbers. For example, if there were two threads per
744 744 * core, then the IDs divided by two should be the same, the first modulo two
745 745 * should be zero, and the second should be one. For example, IDs 4 and 5
746 746 * indicate two logical CPUs that are part of the same core. But IDs 5 and
747 747 * 6 represent two logical CPUs that are part of different cores.
748 748 *
749 749 * While it is common for the cpi_coreid and the cpi_clogid to be derived
750 750 * from the same source, strictly speaking, they don't have to be and the
751 751 * two values should be considered logically independent. One should not
752 752 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
753 753 * some kind of relationship. While this is tempting, we've seen cases on
754 754 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
755 755 *
756 756 * cpi_ncpu_per_chip
757 757 *
758 758 * This value indicates the total number of logical CPUs that exist in the
759 759 * physical package. Critically, this is not the number of logical CPUs
760 760 * that exist for just the single core.
761 761 *
762 762 * This value should be the same for all logical CPUs in the same package.
763 763 *
764 764 * cpi_ncore_per_chip
765 765 *
766 766 * This value indicates the total number of physical CPU cores that exist
767 767 * in the package. The system compares this value with cpi_ncpu_per_chip to
768 768 * determine if simultaneous multi-threading (SMT) is enabled. When
769 769 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
770 770 * the X86FSET_HTT feature is not set. If this value is greater than one,
771 771 * then we consider the processor to have the feature X86FSET_CMP, to
772 772 * indicate that there is support for more than one core.
773 773 *
774 774 * This value should be the same for all logical CPUs in the same package.
775 775 *
776 776 * cpi_procnodes_per_pkg
777 777 *
778 778 * This value indicates the number of 'nodes' that exist in the package.
779 779 * When processors are actually a multi-chip module, this represents the
780 780 * number of such modules that exist in the package. Currently, on Intel
781 781 * based systems this member is always set to 1.
782 782 *
783 783 * This value should be the same for all logical CPUs in the same package.
784 784 *
785 785 * cpi_procnodeid
786 786 *
787 787 * This value indicates the ID of the node that the logical CPU is a part
788 788 * of. All logical CPUs that are in the same node must have the same value
789 789 * here. This value must be unique across all of the packages in the
790 790 * system. On Intel based systems, this is currently set to the value in
791 791 * cpi_chipid because there is only one node.
792 792 *
793 793 * cpi_cores_per_compunit
794 794 *
795 795 * This value indicates the number of cores that are part of a compute
796 796 * unit. See the AMD topology section for this. This member only has real
797 797 * meaning currently for AMD Bulldozer family processors. For all other
798 798 * processors, this should currently be set to 1.
799 799 *
800 800 * cpi_compunitid
801 801 *
802 802 * This indicates the compute unit that the logical CPU belongs to. For
803 803 * processors without AMD Bulldozer-style compute units this should be set
804 804 * to the value of cpi_coreid.
805 805 *
806 806 * cpi_ncpu_shr_last_cache
807 807 *
808 808 * This indicates the number of logical CPUs that are sharing the same last
809 809 * level cache. This value should be the same for all CPUs that are sharing
810 810 * that cache. The last cache refers to the cache that is closest to memory
811 811 * and furthest away from the CPU.
812 812 *
813 813 * cpi_last_lvl_cacheid
814 814 *
815 815 * This indicates the ID of the last cache that the logical CPU uses. This
816 816 * cache is often shared between multiple logical CPUs and is the cache
817 817 * that is closest to memory and furthest away from the CPU. This value
818 818 * should be the same for a group of logical CPUs only if they actually
819 819 * share the same last level cache. IDs should not overlap between
820 820 * packages.
821 821 *
822 822 * cpi_ncore_bits
823 823 *
824 824 * This indicates the number of bits that are required to represent all of
825 825 * the cores in the system. As cores are derived based on their APIC IDs,
826 826 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
827 827 * this value to be larger than the actual number of IDs that are present
828 828 * in the system. This is used to size tables by the CMI framework. It is
829 829 * only filled in for Intel and AMD CPUs.
830 830 *
831 831 * cpi_nthread_bits
832 832 *
833 833 * This indicates the number of bits required to represent all of the IDs
834 834 * that cover the logical CPUs that exist on a given core. It's OK for this
835 835 * value to be larger than the actual number of IDs that are present in the
836 836 * system. This is used to size tables by the CMI framework. It is
837 837 * only filled in for Intel and AMD CPUs.
838 838 *
839 839 * -----------
840 840 * Hypervisors
841 841 * -----------
842 842 *
843 843 * If trying to manage the differences between vendors wasn't bad enough, it can
844 844 * get worse thanks to our friend hardware virtualization. Hypervisors are given
845 845 * the ability to interpose on all cpuid instructions and change them to suit
846 846 * their purposes. In general, this is necessary as the hypervisor wants to be
847 847 * able to present a more uniform set of features or not necessarily give the
848 848 * guest operating system kernel knowledge of all features so it can be
849 849 * more easily migrated between systems.
850 850 *
851 851 * When it comes to trying to determine topology information, this can be a
852 852 * double edged sword. When a hypervisor doesn't actually implement a cpuid
853 853 * leaf, it'll often return all zeros. Because of that, you'll often see various
854 854 * checks scattered about that verify fields are non-zero before we assume we
855 855 * can use them.
856 856 *
857 857 * When it comes to topology information, the hypervisor is often incentivized
858 858 * to lie to you about topology. This is because it doesn't always actually
859 859 * guarantee that topology at all. The topology path we take in the system
860 860 * depends on how the CPU advertises itself. If it advertises itself as an Intel
861 861 * or AMD CPU, then we basically do our normal path. However, when they don't
862 862 * use an actual vendor, then that usually turns into multiple one-core CPUs
863 863 * that we enumerate that are often on different sockets. The actual behavior
864 864 * depends greatly on what the hypervisor actually exposes to us.
865 865 *
866 866 * --------------------
867 867 * Exposing Information
868 868 * --------------------
869 869 *
870 870 * We expose CPUID information in three different forms in the system.
871 871 *
872 872 * The first is through the x86_featureset variable. This is used in conjunction
873 873 * with the is_x86_feature() function. This is queried by x86-specific functions
874 874 * to determine which features are or aren't present in the system and to make
875 875 * decisions based upon them. For example, users of this include everything from
876 876 * parts of the system dedicated to reliability, availability, and
877 877 * serviceability (RAS), to making decisions about how to handle security
878 878 * mitigations, to various x86-specific drivers. General purpose or
879 879 * architecture independent drivers should never be calling this function.
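As a hedged illustration of this first form, x86-specific kernel code typically tests a feature like this. is_x86_feature(), x86_featureset, and the X86FSET_* constants are the names used in this codebase, but the surrounding function and the header choice are illustrative only.

    #include <sys/x86_archext.h>

    static void
    example_feature_check(void)
    {
            /* Only take the AVX-aware path if pass 1 detected the feature. */
            if (is_x86_feature(x86_featureset, X86FSET_AVX)) {
                    /* ... AVX-specific handling ... */
            }
    }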
880 880 *
881 881 * The second means is through the auxiliary vector. The auxiliary vector is a
882 882 * series of tagged data that the kernel passes down to a user program when it
883 883 * begins executing. This information is used to indicate to programs what
884 884 * instruction set extensions are present. For example, information about the
885 885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 886 * since user programs cannot make use of it. However, things like the AVX
887 887 * instruction sets are. Programs use this information to make run-time
888 888 * decisions about what features they should use. As an example, the run-time
889 889 * link-editor (rtld) can relocate different functions depending on the hardware
890 890 * support available.
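A hedged userland sketch of the second form: a program can read the hardware capability words that the aux vector advertises via getisax(3C). The AV_386_AVX bit used here is one of the x86 capability flags; consult the header for the authoritative set.

    #include <sys/auxv.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t hw[2] = { 0, 0 };

            (void) getisax(hw, 2);
            if (hw[0] & AV_386_AVX)
                    (void) printf("AVX is advertised in the aux vector\n");
            return (0);
    }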
891 891 *
892 892 * The final form is through a series of accessor functions that all have the
893 893 * form cpuid_get*. This is used by a number of different subsystems in the
894 894 * kernel to determine more detailed information about what we're running on,
895 895 * topology information, etc. Some of these subsystems include processor groups
896 896 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 897 * microcode, and performance monitoring. These functions all ASSERT that the
898 898 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 899 * are rearranged, then this needs to be adjusted.
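A hedged sketch of the accessor pattern and its pass assertion, written as a kernel-context fragment; cpuid_checkpass() and the per-CPU cpuid_info pointer reflect my understanding of how this file's accessors are structured, but this particular function is invented for illustration.

    uint_t
    cpuid_get_example(cpu_t *cpu)
    {
            /* The caller must not use this before pass 1 has completed. */
            ASSERT(cpuid_checkpass(cpu, 1));
            return (cpu->cpu_m.mcpu_cpi->cpi_family);
    }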
900 900 *
901 901 * -----------------------------------------------
902 902 * Speculative Execution CPU Side Channel Security
903 903 * -----------------------------------------------
904 904 *
905 905 * With the advent of the Spectre and Meltdown attacks which exploit speculative
906 906 * execution in the CPU to create side channels, there have been a number of
907 907 * different attacks and corresponding issues that the operating system needs to
908 908 * mitigate against. The following is a common, but not exhaustive, list of
909 909 * issues that we know about and for which we have done some work, or still need
910 910 * to do more work, in the system to mitigate against:
911 911 *
912 912 * - Spectre v1
913 + * - swapgs (Spectre v1 variant)
913 914 * - Spectre v2
914 915 * - Meltdown (Spectre v3)
915 916 * - Rogue Register Read (Spectre v3a)
916 917 * - Speculative Store Bypass (Spectre v4)
917 918 * - ret2spec, SpectreRSB
918 919 * - L1 Terminal Fault (L1TF)
919 920 * - Microarchitectural Data Sampling (MDS)
920 921 *
921 922 * Each of these requires different sets of mitigations and has different attack
922 923 * surfaces. For the most part, this discussion is about protecting the kernel
923 924 * from non-kernel executing environments such as user processes and hardware
924 925 * virtual machines. Unfortunately, there are a number of user vs. user
925 926 * scenarios that exist with these. The rest of this section will describe the
926 927 * overall approach that the system has taken to address these as well as their
927 928 * shortcomings. Unfortunately, not all of the above have been handled today.
928 929 *
929 - * SPECTRE FAMILY (Spectre v2, ret2spec, SpectreRSB)
930 + * SPECTRE v2, ret2spec, SpectreRSB
930 931 *
931 932 * The second variant of the spectre attack focuses on performing branch target
932 933 * injection. This generally impacts indirect call instructions in the system.
933 934 * There are three different ways to mitigate this issue that are commonly
934 935 * described today:
935 936 *
936 937 * 1. Using Indirect Branch Restricted Speculation (IBRS).
937 938 * 2. Using Retpolines and RSB Stuffing
938 939 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
939 940 *
940 941 * IBRS uses a feature added to microcode to restrict speculation, among other
941 942 * things. This form of mitigation has not been used as it has been generally
942 943 * seen as too expensive and requires reactivation upon various transitions in
943 944 * the system.
944 945 *
945 946 * As a less impactful alternative to IBRS, retpolines were developed by
946 947 * Google. These basically require one to replace indirect calls with a specific
947 948 * trampoline that will cause speculation to fail and break the attack.
948 949 * Retpolines require compiler support. We always build with retpolines in the
949 950 * external thunk mode. This means that a traditional indirect call is replaced
950 951 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
951 952 * of this is that all indirect function calls are performed through a register.
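For reference, a conceptual sketch of what a full retpoline thunk looks like (not the literal source in the tree): an indirect call through %r11 becomes a direct call to the thunk, and the thunk bounces the real target through a ret whose speculation is trapped by the pause/lfence loop.

    __x86_indirect_thunk_r11:
            call    2f              /* push the address of the capture loop */
    1:      pause
            lfence
            jmp     1b              /* speculation is trapped here */
    2:      movq    %r11, (%rsp)    /* overwrite the return address ... */
            ret                     /* ... so the ret goes to the real target */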
952 953 *
953 954 * We have to use a common external location of the thunk and not inline it into
954 955 * the callsite so that way we can have a single place to patch these functions.
955 956 * As it turns out, we actually have three different forms of retpolines that
956 957 * exist in the system:
957 958 *
958 959 * 1. A full retpoline
959 960 * 2. An AMD-specific optimized retpoline
960 961 * 3. A no-op version
961 962 *
962 963 * The first one is used in the general case. The second one is used if we can
963 964 * determine that we're on an AMD system and we can successfully toggle the
964 965 * lfence serializing MSR that exists on the platform. Basically with this
965 966 * present, an lfence is sufficient and we don't need to do anywhere near as
966 967 * complicated a dance to successfully use retpolines.
967 968 *
968 969 * The third form described above is the most curious. It turns out that the way
969 970 * that retpolines are implemented is that they rely on how speculation is
970 971 * performed on a 'ret' instruction. Intel has continued to optimize this
971 972 * process (which is partly why we need to have return stack buffer stuffing,
972 973 * but more on that in a bit) and in processors starting with Cascade Lake
973 974 * on the server side, it's dangerous to rely on retpolines. Instead, a new
974 975 * mechanism has been introduced called Enhanced IBRS (EIBRS).
975 976 *
976 977 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
977 978 * physical core. However, if this is the case, we don't want to use retpolines
978 979 * any more. Therefore if EIBRS is present, we end up turning each retpoline
979 980 * function (called a thunk) into a jmp instruction. This means that we're still
980 981 * paying the cost of an extra jump to the external thunk, but it gives us
981 982 * flexibility and the ability to have a single kernel image that works across a
982 983 * wide variety of systems and hardware features.
983 984 *
984 985 * Unfortunately, this alone is insufficient. First, Skylake systems have
985 986 * additional speculation for the Return Stack Buffer (RSB) which is used to
986 987 * return from call instructions which retpolines take advantage of. However,
987 988 * this problem is not just limited to Skylake and is actually more pernicious.
988 989 * The SpectreRSB paper introduces several more problems that can arise with
989 990 * dealing with this. The RSB can be poisoned just like the indirect branch
990 991 * predictor. This means that one needs to clear the RSB when transitioning
991 992 * between two different privilege domains. Some examples include:
992 993 *
993 994 * - Switching between two different user processes
994 995 * - Going between user land and the kernel
995 996 * - Returning to the kernel from a hardware virtual machine
996 997 *
997 998 * Mitigating this involves combining a couple of different things. The first is
998 999 * SMEP (supervisor mode execution protection) which was introduced in Ivy
999 1000 * Bridge. When an RSB entry refers to a user address and we're executing in the
1000 1001 * kernel, speculation through it will be stopped when SMEP is enabled. This
1001 1002 * protects against a number of the different cases that we would normally be
1002 1003 * worried about such as when we enter the kernel from user land.
1003 1004 *
1004 1005 * To prevent against additional manipulation of the RSB from other contexts
1005 1006 * such as a non-root VMX context attacking the kernel we first look to enhanced
1006 1007 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1007 1008 * need to do to protect the kernel at this time.
1008 1009 *
1009 1010 * On CPUs without EIBRS we need to manually overwrite the contents of the
1010 1011 * return stack buffer. We do this through the x86_rsb_stuff() function.
1011 1012 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1012 1013 * disabled when enhanced IBRS is present because Intel claims on such systems
1013 1014 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1014 1015 * to user attacks via the RSB.
1015 1016 *
1016 1017 * If SMEP is not present, then we would have to stuff the RSB every time we
1017 1018 * transitioned from user mode to the kernel, which isn't very practical right
1018 1019 * now.
1019 1020 *
1020 1021 * To fully protect against user to user and vmx to vmx attacks from these classes of
1021 1022 * issues, we would also need to allow them to opt into performing an Indirect
1022 1023 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1023 1024 *
1024 1025 * By default, the system will enable RSB stuffing and the required variant of
1025 1026 * retpolines and store that information in the x86_spectrev2_mitigation value.
1026 1027 * This will be evaluated after a microcode update as well, though it is
1027 1028 * expected that microcode updates will not take away features. This may mean
1028 1029 * that a late loaded microcode may not end up in the optimal configuration
1029 1030 * (though this should be rare).
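 *
 * As a rough sketch of how that selection works (illustrative pseudocode
 * only; "the lfence MSR could be toggled" stands in for the AMD probe
 * described earlier and "mitigation"/"featureset" are placeholders, not
 * symbols in this file):
 *
 *	if (x86_disable_spectrev2 != 0)
 *		mitigation = X86_SPECTREV2_DISABLED;
 *	else if (is_x86_feature(featureset, X86FSET_IBRS_ALL))
 *		mitigation = X86_SPECTREV2_ENHANCED_IBRS;
 *	else if (on AMD && the lfence MSR could be toggled)
 *		mitigation = X86_SPECTREV2_RETPOLINE_AMD;
 *	else
 *		mitigation = X86_SPECTREV2_RETPOLINE;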
1030 1031 *
1031 1032 * Currently we do not build kmdb with retpolines or perform any additional side
1032 1033 * channel security mitigations for it. One complication with kmdb is that it
1033 1034 * requires its own retpoline thunks and it would need to adjust itself based on
1034 1035 * what the kernel does. The threat model of kmdb is more limited and therefore
1035 1036 * it may make more sense to investigate using prediction barriers as the whole
1036 1037 * system is only executing a single instruction at a time while in kmdb.
1037 1038 *
1038 - * SPECTRE FAMILY (v1, v4)
1039 + * SPECTRE v1, v4
1039 1040 *
1040 1041 * The v1 and v4 variants of spectre are not currently mitigated in the
1041 1042 * system and require other classes of changes to occur in the code.
1042 1043 *
1044 + * SPECTRE v1 (SWAPGS VARIANT)
1045 + *
1046 + * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1047 + * can generally affect any branch-dependent code. The swapgs issue is one
1048 + * variant of this. If we are coming in from userspace, we can have code like
1049 + * this:
1050 + *
1051 + * cmpw $KCS_SEL, REGOFF_CS(%rsp)
1052 + * je 1f
1053 + * movq $0, REGOFF_SAVFP(%rsp)
1054 + * swapgs
1055 + * 1:
1056 + * movq %gs:CPU_THREAD, %rax
1057 + *
1058 + * If an attacker can cause a mis-speculation of the branch here, we could skip
1059 + * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1060 + * load. If subsequent code can act as the usual Spectre cache gadget, this
1061 + * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1062 + * any use of the %gs override.
1063 + *
1064 + * The other case is also an issue: if we're coming into a trap from kernel
1065 + * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1066 + * using it. AMD systems are not vulnerable to this version, as a swapgs is
1067 + * serializing with respect to subsequent uses. But as AMD /does/ need the other
1068 + * case, and the fix is the same in both cases (an lfence at the branch target
1069 + * 1: in this example), we'll just do it unconditionally.
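 *
 * For illustration only (the actual fix lives in the assembly entry points,
 * not in this file), the mitigated sequence simply gains an lfence at the
 * branch target:
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax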
1070 + *
1071 + * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1072 + * harder for user-space to actually set a useful %gsbase value: it's not
1073 + * entirely clear whether that is feasible, but it might still be possible via
1074 + * lwp_setprivate(), so we mitigate anyway.
1075 + *
1043 1076 * MELTDOWN
1044 1077 *
1045 1078 * Meltdown, or spectre v3, allowed a user process to read any data in their
1046 1079 * address space regardless of whether or not the page tables in question
1047 1080 * allowed the user to have the ability to read them. The solution to meltdown
1048 1081 * is kernel page table isolation. In this world, there are two page tables that
1049 1082 * are used for a process, one in user land and one in the kernel. To implement
1050 1083 * this we use per-CPU page tables and switch between the user and kernel
1051 1084 * variants when entering and exiting the kernel. For more information about
1052 1085 * this process and how the trampolines work, please see the big theory
1053 1086 * statements and additional comments in:
1054 1087 *
1055 1088 * - uts/i86pc/ml/kpti_trampolines.s
1056 1089 * - uts/i86pc/vm/hat_i86.c
1057 1090 *
1058 1091 * While Meltdown only impacted Intel systems and there are also Intel systems
1059 1092 * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1060 1093 * kernel page table isolation enabled. While this may at first seem weird, an
1061 1094 * important thing to remember is that you can't speculatively read an address
1062 1095 * if it's never in your page table at all. Having user processes without kernel
1063 1096 * pages present provides us with an important layer of defense in the kernel
1064 1097 * against any other side channel attacks that exist and have yet to be
1065 1098 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1066 1099 * default, no matter the x86 system.
1067 1100 *
1068 1101 * L1 TERMINAL FAULT
1069 1102 *
1070 1103 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1071 1104 * execution uses page table entries. Effectively, it is two different problems.
1072 1105 * The first is that it ignores the not present bit in the page table entries
1073 1106 * when performing speculative execution. This means that something can
1074 1107 * speculatively read the listed physical address if it's present in the L1
1075 1108 * cache under certain conditions (see Intel's documentation for the full set of
1076 1109 * conditions). Secondly, this can be used to bypass hardware virtualization
1077 1110 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1078 1111 * instructions.
1079 1112 *
1080 1113 * For the non-hardware virtualized case, this is relatively easy to deal with.
1081 1114 * We must make sure that all unmapped pages have an address of zero. This means
1082 1115 * that they could read the first 4k of physical memory; however, we never use
1083 1116 * that first page in the operating system and always skip putting it in our
1084 1117 * memory map, even if firmware tells us we can use it in our memory map. While
1085 1118 * other systems try to put extra metadata in the address and reserved bits,
1086 1119 * which led to this being problematic in those cases, we do not.
1087 1120 *
1088 1121 * For hardware virtual machines things are more complicated. Because they can
1089 1122 * construct their own page tables, it isn't hard for them to perform this
1090 1123 * attack against any physical address. The one wrinkle is that this physical
1091 1124 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1092 1125 * to flush the L1 data cache. We wrap this up in the function
1093 1126 * spec_uarch_flush(). This function is also used in the mitigation of
1094 1127 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1095 1128 * hypervisors such as KVM or bhyve are responsible for performing this before
1096 1129 * entering the guest.
1097 1130 *
1098 1131 * Because this attack takes place in the L1 cache, there's another wrinkle
1099 1132 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1100 1133 * designs. This means that when a thread enters a hardware virtualized context
1101 1134 * and flushes the L1 data cache, the other thread on the processor may then go
1102 1135 * ahead and put new data in it that can be potentially attacked. While one
1103 1136 * solution is to disable SMT on the system, another option that is available is
1104 1137 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1105 1138 * goes through and makes sure that if a HVM is being scheduled on one thread,
1106 1139 * then the thing on the other thread is from the same hardware virtual machine.
1107 1140 * If an interrupt comes in or the guest exits to the broader system, then the
1108 1141 * other SMT thread will be kicked out.
1109 1142 *
1110 1143 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1111 1144 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1112 1145 * perform L1TF related mitigations.
1113 1146 *
1114 1147 * MICROARCHITECTURAL DATA SAMPLING
1115 1148 *
1116 1149 * Microarchitectural data sampling (MDS) is a combination of four discrete
1117 1150 * vulnerabilities that are similar issues affecting various parts of the CPU's
1118 1151 * microarchitectural implementation around load, store, and fill buffers.
1119 1152 * Specifically it is made up of the following subcomponents:
1120 1153 *
1121 1154 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1122 1155 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1123 1156 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1124 1157 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1125 1158 *
1126 1159 * To begin addressing these, Intel has introduced another feature in microcode
1127 1160 * called MD_CLEAR. This changes the verw instruction to operate in a different
1128 1161 * way. This allows us to execute the verw instruction in a particular way to
1129 1162 * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1130 1163 * updated when this microcode is present to flush this state.
1131 1164 *
1132 1165 * Primarily we need to flush this state whenever we transition from the kernel
1133 1166 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1134 1167 * little bit different. Here the structures are statically sized when a logical
1135 1168 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1136 1169 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1137 1170 * mwait, or another ACPI method. To perform these flushes, we call
1138 1171 * x86_md_clear() at all of these transition points.
1139 1172 *
1140 1173 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1141 1174 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1142 1175 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1143 1176 * a no-op.
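 *
 * Expressed as a sketch (the real logic in cpuid_update_md_clear() and the
 * related L1D flush setup below also checks that the required microcode
 * support is actually enumerated):
 *
 *	if (is_x86_feature(featureset, X86FSET_RDCL_NO))
 *		spec_uarch_flush = x86_md_clear;
 *	else
 *		spec_uarch_flush = spec_uarch_flush_msr;
 *
 * with x86_md_clear() itself remaining a no-op when X86FSET_MDS_NO is
 * enumerated.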
1144 1177 *
1145 1178 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1146 1179 * particular, everything we've discussed above is only valid for a single
1147 1180 * thread executing on a core. In the case where you have hyper-threading
1148 1181 * present, this attack can be performed between threads. The theoretical fix
1149 1182 * for this is to ensure that both threads are always in the same security
1150 1183 * domain. This means that they are executing in the same ring and mutually
1151 1184 * trust each other. Practically speaking, this would mean that a system call
1152 1185 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1153 1186 * Rather than implement this, we recommend that one disables hyper-threading
1154 1187 * through the use of psradm -aS.
1155 1188 *
1156 1189 * SUMMARY
1157 1190 *
1158 1191 * The following table attempts to summarize the mitigations for various issues
1159 1192 * and what's done in various places:
1160 1193 *
1161 1194 * - Spectre v1: Not currently mitigated
1195 + * - swapgs: lfences after swapgs paths
1162 1196 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1163 1197 * - Meltdown: Kernel Page Table Isolation
1164 1198 * - Spectre v3a: Updated CPU microcode
1165 1199 * - Spectre v4: Not currently mitigated
1166 1200 * - SpectreRSB: SMEP and RSB Stuffing
1167 - * - L1TF: spec_uarch_flush, smt exclusion, requires microcode
1201 + * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1168 1202 * - MDS: x86_md_clear, requires microcode, disabling hyper threading
1169 1203 *
1170 1204 * The following table indicates the x86 feature set bits that indicate that a
1171 1205 * given problem has been solved or a notable feature is present:
1172 1206 *
1173 1207 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1174 1208 * - MDS_NO: All forms of MDS
1175 1209 */
1176 1210
1177 1211 #include <sys/types.h>
1178 1212 #include <sys/archsystm.h>
1179 1213 #include <sys/x86_archext.h>
1180 1214 #include <sys/kmem.h>
1181 1215 #include <sys/systm.h>
1182 1216 #include <sys/cmn_err.h>
1183 1217 #include <sys/sunddi.h>
1184 1218 #include <sys/sunndi.h>
1185 1219 #include <sys/cpuvar.h>
1186 1220 #include <sys/processor.h>
1187 1221 #include <sys/sysmacros.h>
1188 1222 #include <sys/pg.h>
1189 1223 #include <sys/fp.h>
1190 1224 #include <sys/controlregs.h>
1191 1225 #include <sys/bitmap.h>
1192 1226 #include <sys/auxv_386.h>
1193 1227 #include <sys/memnode.h>
1194 1228 #include <sys/pci_cfgspace.h>
1195 1229 #include <sys/comm_page.h>
1196 1230 #include <sys/mach_mmu.h>
1197 1231 #include <sys/ucode.h>
1198 1232 #include <sys/tsc.h>
1199 1233 #include <sys/kobj.h>
1200 1234 #include <sys/asm_misc.h>
1201 1235
1202 1236 #ifdef __xpv
1203 1237 #include <sys/hypervisor.h>
1204 1238 #else
1205 1239 #include <sys/ontrap.h>
1206 1240 #endif
1207 1241
1208 1242 uint_t x86_vendor = X86_VENDOR_IntelClone;
1209 1243 uint_t x86_type = X86_TYPE_OTHER;
1210 1244 uint_t x86_clflush_size = 0;
1211 1245
1212 1246 #if defined(__xpv)
1213 1247 int x86_use_pcid = 0;
1214 1248 int x86_use_invpcid = 0;
1215 1249 #else
1216 1250 int x86_use_pcid = -1;
1217 1251 int x86_use_invpcid = -1;
1218 1252 #endif
1219 1253
1220 1254 typedef enum {
1221 1255 X86_SPECTREV2_RETPOLINE,
1222 1256 X86_SPECTREV2_RETPOLINE_AMD,
1223 1257 X86_SPECTREV2_ENHANCED_IBRS,
1224 1258 X86_SPECTREV2_DISABLED
1225 1259 } x86_spectrev2_mitigation_t;
1226 1260
1227 1261 uint_t x86_disable_spectrev2 = 0;
1228 1262 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1229 1263 X86_SPECTREV2_RETPOLINE;
1230 1264
1231 1265 uint_t pentiumpro_bug4046376;
1232 1266
1233 1267 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1234 1268
1235 1269 static char *x86_feature_names[NUM_X86_FEATURES] = {
1236 1270 "lgpg",
1237 1271 "tsc",
1238 1272 "msr",
1239 1273 "mtrr",
1240 1274 "pge",
1241 1275 "de",
1242 1276 "cmov",
1243 1277 "mmx",
1244 1278 "mca",
1245 1279 "pae",
1246 1280 "cv8",
1247 1281 "pat",
1248 1282 "sep",
1249 1283 "sse",
1250 1284 "sse2",
1251 1285 "htt",
1252 1286 "asysc",
1253 1287 "nx",
1254 1288 "sse3",
1255 1289 "cx16",
1256 1290 "cmp",
1257 1291 "tscp",
1258 1292 "mwait",
1259 1293 "sse4a",
1260 1294 "cpuid",
1261 1295 "ssse3",
1262 1296 "sse4_1",
1263 1297 "sse4_2",
1264 1298 "1gpg",
1265 1299 "clfsh",
1266 1300 "64",
1267 1301 "aes",
1268 1302 "pclmulqdq",
1269 1303 "xsave",
1270 1304 "avx",
1271 1305 "vmx",
1272 1306 "svm",
1273 1307 "topoext",
1274 1308 "f16c",
1275 1309 "rdrand",
1276 1310 "x2apic",
1277 1311 "avx2",
1278 1312 "bmi1",
1279 1313 "bmi2",
1280 1314 "fma",
1281 1315 "smep",
1282 1316 "smap",
1283 1317 "adx",
1284 1318 "rdseed",
1285 1319 "mpx",
1286 1320 "avx512f",
1287 1321 "avx512dq",
1288 1322 "avx512pf",
1289 1323 "avx512er",
1290 1324 "avx512cd",
1291 1325 "avx512bw",
1292 1326 "avx512vl",
1293 1327 "avx512fma",
1294 1328 "avx512vbmi",
1295 1329 "avx512_vpopcntdq",
1296 1330 "avx512_4vnniw",
1297 1331 "avx512_4fmaps",
1298 1332 "xsaveopt",
1299 1333 "xsavec",
1300 1334 "xsaves",
1301 1335 "sha",
1302 1336 "umip",
1303 1337 "pku",
1304 1338 "ospke",
1305 1339 "pcid",
1306 1340 "invpcid",
1307 1341 "ibrs",
1308 1342 "ibpb",
1309 1343 "stibp",
1310 1344 "ssbd",
1311 1345 "ssbd_virt",
1312 1346 "rdcl_no",
1313 1347 "ibrs_all",
1314 1348 "rsba",
1315 1349 "ssb_no",
1316 1350 "stibp_all",
1317 1351 "flush_cmd",
1318 1352 "l1d_vmentry_no",
1319 1353 "fsgsbase",
1320 1354 "clflushopt",
1321 1355 "clwb",
1322 1356 "monitorx",
1323 1357 "clzero",
1324 1358 "xop",
1325 1359 "fma4",
1326 1360 "tbm",
1327 1361 "avx512_vnni",
1328 1362 "amd_pcec",
1329 1363 "mb_clear",
1330 1364 "mds_no",
1331 1365 "core_thermal",
1332 1366 "pkg_thermal"
1333 1367 };
1334 1368
1335 1369 boolean_t
1336 1370 is_x86_feature(void *featureset, uint_t feature)
1337 1371 {
1338 1372 ASSERT(feature < NUM_X86_FEATURES);
1339 1373 return (BT_TEST((ulong_t *)featureset, feature));
1340 1374 }
1341 1375
1342 1376 void
1343 1377 add_x86_feature(void *featureset, uint_t feature)
1344 1378 {
1345 1379 ASSERT(feature < NUM_X86_FEATURES);
1346 1380 BT_SET((ulong_t *)featureset, feature);
1347 1381 }
1348 1382
1349 1383 void
1350 1384 remove_x86_feature(void *featureset, uint_t feature)
1351 1385 {
1352 1386 ASSERT(feature < NUM_X86_FEATURES);
1353 1387 BT_CLEAR((ulong_t *)featureset, feature);
1354 1388 }
1355 1389
1356 1390 boolean_t
1357 1391 compare_x86_featureset(void *setA, void *setB)
1358 1392 {
1359 1393 /*
1360 1394 * We assume that the unused bits of the bitmap are always zero.
1361 1395 */
1362 1396 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1363 1397 return (B_TRUE);
1364 1398 } else {
1365 1399 return (B_FALSE);
1366 1400 }
1367 1401 }
1368 1402
1369 1403 void
1370 1404 print_x86_featureset(void *featureset)
1371 1405 {
1372 1406 uint_t i;
1373 1407
1374 1408 for (i = 0; i < NUM_X86_FEATURES; i++) {
1375 1409 if (is_x86_feature(featureset, i)) {
1376 1410 cmn_err(CE_CONT, "?x86_feature: %s\n",
1377 1411 x86_feature_names[i]);
1378 1412 }
1379 1413 }
1380 1414 }
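
/*
 * For example, a purely illustrative consumer (this is not a quote of actual
 * kernel code) that wants to know whether it may rely on the PCID and INVPCID
 * instructions could do:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_PCID) &&
 *	    is_x86_feature(x86_featureset, X86FSET_INVPCID)) {
 *		... both instructions were enumerated by cpuid ...
 *	}
 */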
1381 1415
1382 1416 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1383 1417 static size_t xsave_state_size = 0;
1384 1418 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1385 1419 boolean_t xsave_force_disable = B_FALSE;
1386 1420 extern int disable_smap;
1387 1421
1388 1422 /*
1389 1423 * This is set to platform type we are running on.
1390 1424 */
1391 1425 static int platform_type = -1;
1392 1426
1393 1427 #if !defined(__xpv)
1394 1428 /*
1395 1429 * Variable to patch if hypervisor platform detection needs to be
1396 1430 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1397 1431 */
1398 1432 int enable_platform_detection = 1;
1399 1433 #endif
1400 1434
1401 1435 /*
1402 1436 * monitor/mwait info.
1403 1437 *
1404 1438 * size_actual and buf_actual are the real address and size allocated to get
1405 1439 * proper mwait_buf alignment. buf_actual and size_actual should be passed
1406 1440 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use
1407 1441 * processor cache-line alignment, but this is not guaranteed in the future.
1408 1442 */
1409 1443 struct mwait_info {
1410 1444 size_t mon_min; /* min size to avoid missed wakeups */
1411 1445 size_t mon_max; /* size to avoid false wakeups */
1412 1446 size_t size_actual; /* size actually allocated */
1413 1447 void *buf_actual; /* memory actually allocated */
1414 1448 uint32_t support; /* processor support of monitor/mwait */
1415 1449 };
1416 1450
1417 1451 /*
1418 1452 * xsave/xrestor info.
1419 1453 *
1420 1454 * This structure contains HW feature bits and the size of the xsave save area.
1421 1455 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1422 1456 * (xsave_state) to describe the xsave layout. However, at runtime the
1423 1457 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1424 1458 * xsave_state structure simply represents the legacy layout of the beginning
1425 1459 * of the xsave area.
1426 1460 */
1427 1461 struct xsave_info {
1428 1462 uint32_t xsav_hw_features_low; /* Supported HW features */
1429 1463 uint32_t xsav_hw_features_high; /* Supported HW features */
1430 1464 size_t xsav_max_size; /* max size save area for HW features */
1431 1465 size_t ymm_size; /* AVX: size of ymm save area */
1432 1466 size_t ymm_offset; /* AVX: offset for ymm save area */
1433 1467 size_t bndregs_size; /* MPX: size of bndregs save area */
1434 1468 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1435 1469 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1436 1470 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1437 1471 size_t opmask_size; /* AVX512: size of opmask save */
1438 1472 size_t opmask_offset; /* AVX512: offset for opmask save */
1439 1473 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1440 1474 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1441 1475 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1442 1476 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1443 1477 };
1444 1478
1445 1479
1446 1480 /*
1447 1481 * These constants determine how many of the elements of the
1448 1482 * cpuid we cache in the cpuid_info data structure; the
1449 1483 * remaining elements are accessible via the cpuid instruction.
1450 1484 */
1451 1485
1452 1486 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1453 1487 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */
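
/*
 * As an illustration of how the cached leaves are indexed (see struct
 * cpuid_info below), the standard leaf 7 feature words and the AMD topology
 * leaf are reached as, e.g.:
 *
 *	cpi->cpi_std[7].cp_ebx		CPUID.(EAX=7,ECX=0):%ebx
 *	cpi->cpi_extd[0x1e].cp_eax	CPUID.(EAX=0x8000001e):%eax
 *
 * Leaves at or beyond these limits are not cached and must be re-read with
 * the cpuid instruction directly.
 */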
1454 1488
1455 1489 /*
1456 1490 * See the big theory statement for a more detailed explanation of what some of
1457 1491 * these members mean.
1458 1492 */
1459 1493 struct cpuid_info {
1460 1494 uint_t cpi_pass; /* last pass completed */
1461 1495 /*
1462 1496 * standard function information
1463 1497 */
1464 1498 uint_t cpi_maxeax; /* fn 0: %eax */
1465 1499 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1466 1500 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1467 1501
1468 1502 uint_t cpi_family; /* fn 1: extended family */
1469 1503 uint_t cpi_model; /* fn 1: extended model */
1470 1504 uint_t cpi_step; /* fn 1: stepping */
1471 1505 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1472 1506 /* AMD: package/socket # */
1473 1507 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1474 1508 int cpi_clogid; /* fn 1: %ebx: thread # */
1475 1509 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1476 1510 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1477 1511 uint_t cpi_ncache; /* fn 2: number of elements */
1478 1512 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1479 1513 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1480 1514 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1481 1515 /* Intel fn: 4, AMD fn: 8000001d */
1482 1516 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
1483 1517 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1484 1518 /*
1485 1519 * extended function information
1486 1520 */
1487 1521 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1488 1522 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1489 1523 uint8_t cpi_pabits; /* fn 0x80000008: %eax */
1490 1524 uint8_t cpi_vabits; /* fn 0x80000008: %eax */
1491 1525 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1492 1526 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1493 1527
1494 1528 id_t cpi_coreid; /* same coreid => strands share core */
1495 1529 int cpi_pkgcoreid; /* core number within single package */
1496 1530 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1497 1531 /* Intel: fn 4: %eax[31-26] */
1498 1532
1499 1533 /*
1500 1534 * These values represent the number of bits that are required to store
1501 1535 * information about the number of cores and threads.
1502 1536 */
1503 1537 uint_t cpi_ncore_bits;
1504 1538 uint_t cpi_nthread_bits;
1505 1539 /*
1506 1540 * supported feature information
1507 1541 */
1508 1542 uint32_t cpi_support[6];
1509 1543 #define STD_EDX_FEATURES 0
1510 1544 #define AMD_EDX_FEATURES 1
1511 1545 #define TM_EDX_FEATURES 2
1512 1546 #define STD_ECX_FEATURES 3
1513 1547 #define AMD_ECX_FEATURES 4
1514 1548 #define STD_EBX_FEATURES 5
1515 1549 /*
1516 1550 * Synthesized information, where known.
1517 1551 */
1518 1552 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1519 1553 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1520 1554 uint32_t cpi_socket; /* Chip package/socket type */
1521 1555
1522 1556 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1523 1557 uint32_t cpi_apicid;
1524 1558 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1525 1559 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1526 1560 /* Intel: 1 */
1527 1561 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1528 1562 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1529 1563
1530 1564 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */
1531 1565 };
1532 1566
1533 1567
1534 1568 static struct cpuid_info cpuid_info0;
1535 1569
1536 1570 /*
1537 1571 * These bit fields are defined by the Intel Application Note AP-485
1538 1572 * "Intel Processor Identification and the CPUID Instruction"
1539 1573 */
1540 1574 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1541 1575 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1542 1576 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1543 1577 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1544 1578 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1545 1579 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1546 1580
1547 1581 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1548 1582 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1549 1583 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1550 1584 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1551 1585 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1552 1586 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1553 1587 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1554 1588
1555 1589 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1556 1590 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1557 1591 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1558 1592 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1559 1593
1560 1594 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1561 1595 #define CPI_XMAXEAX_MAX 0x80000100
1562 1596 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1563 1597 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1564 1598
1565 1599 /*
1566 1600 * Function 4 (Deterministic Cache Parameters) macros
1567 1601 * Defined by Intel Application Note AP-485
1568 1602 */
1569 1603 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1570 1604 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1571 1605 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1572 1606 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1573 1607 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1574 1608 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1575 1609 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1576 1610
1577 1611 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1578 1612 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1579 1613 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1580 1614
1581 1615 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1582 1616
1583 1617 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
1584 1618
1585 1619
1586 1620 /*
1587 1621 * A couple of shorthand macros to identify "later" P6-family chips
1588 1622 * like the Pentium M and Core. First, the "older" P6-based stuff
1589 1623 * (loosely defined as "pre-Pentium-4"):
1590 1624 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1591 1625 */
1592 1626 #define IS_LEGACY_P6(cpi) ( \
1593 1627 cpi->cpi_family == 6 && \
1594 1628 (cpi->cpi_model == 1 || \
1595 1629 cpi->cpi_model == 3 || \
1596 1630 cpi->cpi_model == 5 || \
1597 1631 cpi->cpi_model == 6 || \
1598 1632 cpi->cpi_model == 7 || \
1599 1633 cpi->cpi_model == 8 || \
1600 1634 cpi->cpi_model == 0xA || \
1601 1635 cpi->cpi_model == 0xB) \
1602 1636 )
1603 1637
1604 1638 /* A "new F6" is everything with family 6 that's not the above */
1605 1639 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1606 1640
1607 1641 /* Extended family/model support */
1608 1642 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1609 1643 cpi->cpi_family >= 0xf)
1610 1644
1611 1645 /*
1612 1646 * Info for monitor/mwait idle loop.
1613 1647 *
1614 1648 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1615 1649 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1616 1650 * 2006.
1617 1651 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1618 1652 * Documentation Updates" #33633, Rev 2.05, December 2006.
1619 1653 */
1620 1654 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
1621 1655 #define MWAIT_EXTENSIONS (0x00000002) /* extension supported */
1622 1656 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
1623 1657 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1624 1658 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
1625 1659 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
1626 1660 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1627 1661 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1628 1662 /*
1629 1663 * Number of sub-cstates for a given c-state.
1630 1664 */
1631 1665 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
1632 1666 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1633 1667
1634 1668 /*
1635 1669 * XSAVE leaf 0xD enumeration
1636 1670 */
1637 1671 #define CPUID_LEAFD_2_YMM_OFFSET 576
1638 1672 #define CPUID_LEAFD_2_YMM_SIZE 256
1639 1673
1640 1674 /*
1641 1675 * Common extended leaf names to cut down on typos.
1642 1676 */
1643 1677 #define CPUID_LEAF_EXT_0 0x80000000
1644 1678 #define CPUID_LEAF_EXT_8 0x80000008
1645 1679 #define CPUID_LEAF_EXT_1d 0x8000001d
1646 1680 #define CPUID_LEAF_EXT_1e 0x8000001e
1647 1681
1648 1682 /*
1649 1683 * Functions we consume from cpuid_subr.c; don't publish these in a header
1650 1684 * file to try and keep people using the expected cpuid_* interfaces.
1651 1685 */
1652 1686 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1653 1687 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1654 1688 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1655 1689 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1656 1690 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1657 1691
1658 1692 /*
1659 1693 * Apply various platform-dependent restrictions where the
1660 1694 * underlying platform restrictions mean the CPU can be marked
1661 1695 * as less capable than its cpuid instruction would imply.
1662 1696 */
1663 1697 #if defined(__xpv)
1664 1698 static void
1665 1699 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1666 1700 {
1667 1701 switch (eax) {
1668 1702 case 1: {
1669 1703 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1670 1704 0 : CPUID_INTC_EDX_MCA;
1671 1705 cp->cp_edx &=
1672 1706 ~(mcamask |
1673 1707 CPUID_INTC_EDX_PSE |
1674 1708 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1675 1709 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1676 1710 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1677 1711 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1678 1712 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1679 1713 break;
1680 1714 }
1681 1715
1682 1716 case 0x80000001:
1683 1717 cp->cp_edx &=
1684 1718 ~(CPUID_AMD_EDX_PSE |
1685 1719 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1686 1720 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1687 1721 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1688 1722 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1689 1723 CPUID_AMD_EDX_TSCP);
1690 1724 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1691 1725 break;
1692 1726 default:
1693 1727 break;
1694 1728 }
1695 1729
1696 1730 switch (vendor) {
1697 1731 case X86_VENDOR_Intel:
1698 1732 switch (eax) {
1699 1733 case 4:
1700 1734 /*
1701 1735 * Zero out the (ncores-per-chip - 1) field
1702 1736 */
1703 1737 cp->cp_eax &= 0x03fffffff;
1704 1738 break;
1705 1739 default:
1706 1740 break;
1707 1741 }
1708 1742 break;
1709 1743 case X86_VENDOR_AMD:
1710 1744 switch (eax) {
1711 1745
1712 1746 case 0x80000001:
1713 1747 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1714 1748 break;
1715 1749
1716 1750 case CPUID_LEAF_EXT_8:
1717 1751 /*
1718 1752 * Zero out the (ncores-per-chip - 1) field
1719 1753 */
1720 1754 cp->cp_ecx &= 0xffffff00;
1721 1755 break;
1722 1756 default:
1723 1757 break;
1724 1758 }
1725 1759 break;
1726 1760 default:
1727 1761 break;
1728 1762 }
1729 1763 }
1730 1764 #else
1731 1765 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
1732 1766 #endif
1733 1767
1734 1768 /*
1735 1769 * Some undocumented ways of patching the results of the cpuid
1736 1770 * instruction to permit running Solaris 10 on future cpus that
1737 1771 * we don't currently support. Could be set to non-zero values
1738 1772 * via settings in eeprom.
1739 1773 */
1740 1774
1741 1775 uint32_t cpuid_feature_ecx_include;
1742 1776 uint32_t cpuid_feature_ecx_exclude;
1743 1777 uint32_t cpuid_feature_edx_include;
1744 1778 uint32_t cpuid_feature_edx_exclude;
1745 1779
1746 1780 /*
1747 1781 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1748 1782 */
1749 1783 void
1750 1784 cpuid_alloc_space(cpu_t *cpu)
1751 1785 {
1752 1786 /*
1753 1787 * By convention, cpu0 is the boot cpu, which is set up
1754 1788 * before memory allocation is available. All other cpus get
1755 1789 * their cpuid_info struct allocated here.
1756 1790 */
1757 1791 ASSERT(cpu->cpu_id != 0);
1758 1792 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1759 1793 cpu->cpu_m.mcpu_cpi =
1760 1794 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1761 1795 }
1762 1796
1763 1797 void
1764 1798 cpuid_free_space(cpu_t *cpu)
1765 1799 {
1766 1800 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1767 1801 int i;
1768 1802
1769 1803 ASSERT(cpi != NULL);
1770 1804 ASSERT(cpi != &cpuid_info0);
1771 1805
1772 1806 /*
1773 1807 * Free up any cache leaf related dynamic storage. The first entry was
1774 1808 * cached from the standard cpuid storage, so we should not free it.
1775 1809 */
1776 1810 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1777 1811 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1778 1812 if (cpi->cpi_cache_leaf_size > 0)
1779 1813 kmem_free(cpi->cpi_cache_leaves,
1780 1814 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1781 1815
1782 1816 kmem_free(cpi, sizeof (*cpi));
1783 1817 cpu->cpu_m.mcpu_cpi = NULL;
1784 1818 }
1785 1819
1786 1820 #if !defined(__xpv)
1787 1821 /*
1788 1822 * Determine the type of the underlying platform. This is used to customize
1789 1823 * initialization of various subsystems (e.g. TSC). determine_platform() must
1790 1824 * only ever be called once to prevent two processors from seeing different
1791 1825 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1792 1826 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1793 1827 */
1794 1828 void
1795 1829 determine_platform(void)
1796 1830 {
1797 1831 struct cpuid_regs cp;
1798 1832 uint32_t base;
1799 1833 uint32_t regs[4];
1800 1834 char *hvstr = (char *)regs;
1801 1835
1802 1836 ASSERT(platform_type == -1);
1803 1837
1804 1838 platform_type = HW_NATIVE;
1805 1839
1806 1840 if (!enable_platform_detection)
1807 1841 return;
1808 1842
1809 1843 /*
1810 1844 * If Hypervisor CPUID bit is set, try to determine hypervisor
1811 1845 * vendor signature, and set platform type accordingly.
1812 1846 *
1813 1847 * References:
1814 1848 * http://lkml.org/lkml/2008/10/1/246
1815 1849 * http://kb.vmware.com/kb/1009458
1816 1850 */
1817 1851 cp.cp_eax = 0x1;
1818 1852 (void) __cpuid_insn(&cp);
1819 1853 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1820 1854 cp.cp_eax = 0x40000000;
1821 1855 (void) __cpuid_insn(&cp);
1822 1856 regs[0] = cp.cp_ebx;
1823 1857 regs[1] = cp.cp_ecx;
1824 1858 regs[2] = cp.cp_edx;
1825 1859 regs[3] = 0;
1826 1860 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1827 1861 platform_type = HW_XEN_HVM;
1828 1862 return;
1829 1863 }
1830 1864 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1831 1865 platform_type = HW_VMWARE;
1832 1866 return;
1833 1867 }
1834 1868 if (strcmp(hvstr, HVSIG_KVM) == 0) {
1835 1869 platform_type = HW_KVM;
1836 1870 return;
1837 1871 }
1838 1872 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1839 1873 platform_type = HW_BHYVE;
1840 1874 return;
1841 1875 }
1842 1876 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1843 1877 platform_type = HW_MICROSOFT;
1844 1878 } else {
1845 1879 /*
1846 1880 * Check older VMware hardware versions. VMware hypervisor is
1847 1881 * detected by performing an IN operation to VMware hypervisor
1848 1882 * port and checking that value returned in %ebx is VMware
1849 1883 * hypervisor magic value.
1850 1884 *
1851 1885 * References: http://kb.vmware.com/kb/1009458
1852 1886 */
1853 1887 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1854 1888 if (regs[1] == VMWARE_HVMAGIC) {
1855 1889 platform_type = HW_VMWARE;
1856 1890 return;
1857 1891 }
1858 1892 }
1859 1893
1860 1894 /*
1861 1895 * Check Xen hypervisor. In a fully virtualized domain,
1862 1896 * Xen's pseudo-cpuid function returns a string representing the
1863 1897 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1864 1898 * supported cpuid function. We need at least a (base + 2) leaf value
1865 1899 * to do what we want to do. Try different base values, since the
1866 1900 * hypervisor might use a different one depending on whether Hyper-V
1867 1901 * emulation is switched on by default or not.
1868 1902 */
1869 1903 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1870 1904 cp.cp_eax = base;
1871 1905 (void) __cpuid_insn(&cp);
1872 1906 regs[0] = cp.cp_ebx;
1873 1907 regs[1] = cp.cp_ecx;
1874 1908 regs[2] = cp.cp_edx;
1875 1909 regs[3] = 0;
1876 1910 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1877 1911 cp.cp_eax >= (base + 2)) {
1878 1912 platform_type &= ~HW_NATIVE;
1879 1913 platform_type |= HW_XEN_HVM;
1880 1914 return;
1881 1915 }
1882 1916 }
1883 1917 }
1884 1918
1885 1919 int
1886 1920 get_hwenv(void)
1887 1921 {
1888 1922 ASSERT(platform_type != -1);
1889 1923 return (platform_type);
1890 1924 }
1891 1925
1892 1926 int
1893 1927 is_controldom(void)
1894 1928 {
1895 1929 return (0);
1896 1930 }
1897 1931
1898 1932 #else
1899 1933
1900 1934 int
1901 1935 get_hwenv(void)
1902 1936 {
1903 1937 return (HW_XEN_PV);
1904 1938 }
1905 1939
1906 1940 int
1907 1941 is_controldom(void)
1908 1942 {
1909 1943 return (DOMAIN_IS_INITDOMAIN(xen_info));
1910 1944 }
1911 1945
1912 1946 #endif /* __xpv */
1913 1947
1914 1948 /*
1915 1949 * Make sure that we have gathered all of the CPUID leaves that we might need to
1916 1950 * determine topology. We assume that the standard leaf 1 has already been done
1917 1951 * and that xmaxeax has already been calculated.
1918 1952 */
1919 1953 static void
1920 1954 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1921 1955 {
1922 1956 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1923 1957
1924 1958 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1925 1959 struct cpuid_regs *cp;
1926 1960
1927 1961 cp = &cpi->cpi_extd[8];
1928 1962 cp->cp_eax = CPUID_LEAF_EXT_8;
1929 1963 (void) __cpuid_insn(cp);
1930 1964 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1931 1965 }
1932 1966
1933 1967 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1934 1968 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1935 1969 struct cpuid_regs *cp;
1936 1970
1937 1971 cp = &cpi->cpi_extd[0x1e];
1938 1972 cp->cp_eax = CPUID_LEAF_EXT_1e;
1939 1973 (void) __cpuid_insn(cp);
1940 1974 }
1941 1975 }
1942 1976
1943 1977 /*
1944 1978 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1945 1979 * it to everything else. If not, and we're on an AMD system where 8000001e is
1946 1980 * valid, then we use that. Otherwise, we fall back to the default value for the
1947 1981 * APIC ID in leaf 1.
1948 1982 */
1949 1983 static uint32_t
1950 1984 cpuid_gather_apicid(struct cpuid_info *cpi)
1951 1985 {
1952 1986 /*
1953 1987 * Leaf B changes based on the arguments to it. Because we don't cache
1954 1988 * it, we need to gather it again.
1955 1989 */
1956 1990 if (cpi->cpi_maxeax >= 0xB) {
1957 1991 struct cpuid_regs regs;
1958 1992 struct cpuid_regs *cp;
1959 1993
1960 1994 cp = &regs;
1961 1995 cp->cp_eax = 0xB;
1962 1996 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1963 1997 (void) __cpuid_insn(cp);
1964 1998
1965 1999 if (cp->cp_ebx != 0) {
1966 2000 return (cp->cp_edx);
1967 2001 }
1968 2002 }
1969 2003
1970 2004 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
1971 2005 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1972 2006 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1973 2007 return (cpi->cpi_extd[0x1e].cp_eax);
1974 2008 }
1975 2009
1976 2010 return (CPI_APIC_ID(cpi));
1977 2011 }
1978 2012
1979 2013 /*
1980 2014 * For AMD processors, attempt to calculate the number of chips and cores that
1981 2015 * exist. The way that we do this varies based on the generation, because the
1982 2016 * generations themselves have changed dramatically.
1983 2017 *
1984 2018 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
1985 2019 * However, with the advent of family 17h (Zen) it actually tells us the number
1986 2020 * of threads, so we need to look at leaf 0x8000001e if available to determine
1987 2021 * its value. Otherwise, for all prior families, the number of enabled cores is
1988 2022 * the same as threads.
1989 2023 *
1990 2024 * If we do not have leaf 0x80000008, then we assume that this processor does
1991 2025 * not have anything. AMD's older CPUID specification says there's no reason to
1992 2026 * fall back to leaf 1.
1993 2027 *
1994 2028 * In some virtualization cases we will not have leaf 8000001e or it will be
1995 2029 * zero. When that happens we assume the number of threads is one.
1996 2030 */
1997 2031 static void
1998 2032 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
1999 2033 {
2000 2034 uint_t nthreads, nthread_per_core;
2001 2035
2002 2036 nthreads = nthread_per_core = 1;
2003 2037
2004 2038 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2005 2039 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2006 2040 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2007 2041 nthreads = CPI_CPU_COUNT(cpi);
2008 2042 }
2009 2043
2010 2044 /*
2011 2045 * For us to have threads, and know about it, we have to be at least at
2012 2046 * family 17h and have the cpuid bit that says we have extended
2013 2047 * topology.
2014 2048 */
2015 2049 if (cpi->cpi_family >= 0x17 &&
2016 2050 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2017 2051 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2018 2052 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2019 2053 }
2020 2054
2021 2055 *ncpus = nthreads;
2022 2056 *ncores = nthreads / nthread_per_core;
2023 2057 }
2024 2058
2025 2059 /*
2026 2060 * Seed the initial values for the cores and threads for an Intel based
2027 2061 * processor. These values will be overwritten if we detect that the processor
2028 2062 * supports CPUID leaf 0xb.
2029 2063 */
2030 2064 static void
2031 2065 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2032 2066 {
2033 2067 /*
2034 2068 * Only seed the number of physical cores from the first level leaf 4
2035 2069 * information. The number of threads there indicates how many share the
2036 2070 * L1 cache, which may or may not have anything to do with the number of
2037 2071 * logical CPUs per core.
2038 2072 */
2039 2073 if (cpi->cpi_maxeax >= 4) {
2040 2074 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2041 2075 } else {
2042 2076 *ncores = 1;
2043 2077 }
2044 2078
2045 2079 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2046 2080 *ncpus = CPI_CPU_COUNT(cpi);
2047 2081 } else {
2048 2082 *ncpus = *ncores;
2049 2083 }
2050 2084 }
2051 2085
2052 2086 static boolean_t
2053 2087 cpuid_leafB_getids(cpu_t *cpu)
2054 2088 {
2055 2089 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2056 2090 struct cpuid_regs regs;
2057 2091 struct cpuid_regs *cp;
2058 2092
2059 2093 if (cpi->cpi_maxeax < 0xB)
2060 2094 return (B_FALSE);
2061 2095
2062 2096 cp = &regs;
2063 2097 cp->cp_eax = 0xB;
2064 2098 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2065 2099
2066 2100 (void) __cpuid_insn(cp);
2067 2101
2068 2102 /*
2069 2103 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2070 2104 * indicates that the extended topology enumeration leaf is
2071 2105 * available.
2072 2106 */
2073 2107 if (cp->cp_ebx != 0) {
2074 2108 uint32_t x2apic_id = 0;
2075 2109 uint_t coreid_shift = 0;
2076 2110 uint_t ncpu_per_core = 1;
2077 2111 uint_t chipid_shift = 0;
2078 2112 uint_t ncpu_per_chip = 1;
2079 2113 uint_t i;
2080 2114 uint_t level;
2081 2115
2082 2116 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2083 2117 cp->cp_eax = 0xB;
2084 2118 cp->cp_ecx = i;
2085 2119
2086 2120 (void) __cpuid_insn(cp);
2087 2121 level = CPI_CPU_LEVEL_TYPE(cp);
2088 2122
2089 2123 if (level == 1) {
2090 2124 x2apic_id = cp->cp_edx;
2091 2125 coreid_shift = BITX(cp->cp_eax, 4, 0);
2092 2126 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2093 2127 } else if (level == 2) {
2094 2128 x2apic_id = cp->cp_edx;
2095 2129 chipid_shift = BITX(cp->cp_eax, 4, 0);
2096 2130 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2097 2131 }
2098 2132 }
2099 2133
2100 2134 /*
2101 2135 * cpi_apicid is taken care of in cpuid_gather_apicid.
2102 2136 */
2103 2137 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2104 2138 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2105 2139 ncpu_per_core;
2106 2140 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2107 2141 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2108 2142 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2109 2143 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2110 2144 cpi->cpi_procnodeid = cpi->cpi_chipid;
2111 2145 cpi->cpi_compunitid = cpi->cpi_coreid;
2112 2146
2113 2147 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2114 2148 cpi->cpi_nthread_bits = coreid_shift;
2115 2149 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2116 2150 }
2117 2151
2118 2152 return (B_TRUE);
2119 2153 } else {
2120 2154 return (B_FALSE);
2121 2155 }
2122 2156 }
2123 2157
2124 2158 static void
2125 2159 cpuid_intel_getids(cpu_t *cpu, void *feature)
2126 2160 {
2127 2161 uint_t i;
2128 2162 uint_t chipid_shift = 0;
2129 2163 uint_t coreid_shift = 0;
2130 2164 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2131 2165
2132 2166 /*
2133 2167 * There are no compute units or processor nodes currently on Intel.
2134 2168 * Always set these to one.
2135 2169 */
2136 2170 cpi->cpi_procnodes_per_pkg = 1;
2137 2171 cpi->cpi_cores_per_compunit = 1;
2138 2172
2139 2173 /*
2140 2174 * If cpuid Leaf B is present, use that to try and get this information.
2141 2175 * It will be the most accurate for Intel CPUs.
2142 2176 */
2143 2177 if (cpuid_leafB_getids(cpu))
2144 2178 return;
2145 2179
2146 2180 /*
2147 2181 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2148 2182 * and ncore_per_chip. These represent the largest power of two values
2149 2183 * that we need to cover all of the IDs in the system. Therefore, we use
2150 2184 * those values to seed the number of bits needed to cover information
2151 2185 * in the case when leaf B is not available. These values will probably
2152 2186 * be larger than required, but that's OK.
2153 2187 */
2154 2188 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2155 2189 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2156 2190
2157 2191 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2158 2192 chipid_shift++;
2159 2193
2160 2194 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2161 2195 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2162 2196
2163 2197 if (is_x86_feature(feature, X86FSET_CMP)) {
2164 2198 /*
2165 2199 * Multi-core (and possibly multi-threaded)
2166 2200 * processors.
2167 2201 */
2168 2202 uint_t ncpu_per_core;
2169 2203 if (cpi->cpi_ncore_per_chip == 1)
2170 2204 ncpu_per_core = cpi->cpi_ncpu_per_chip;
2171 2205 else if (cpi->cpi_ncore_per_chip > 1)
2172 2206 ncpu_per_core = cpi->cpi_ncpu_per_chip /
2173 2207 cpi->cpi_ncore_per_chip;
2174 2208 /*
2175 2209 * 8bit APIC IDs on dual core Pentiums
2176 2210 * look like this:
2177 2211 *
2178 2212 * +-----------------------+------+------+
2179 2213 * | Physical Package ID | MC | HT |
2180 2214 * +-----------------------+------+------+
2181 2215 * <------- chipid -------->
2182 2216 * <------- coreid --------------->
2183 2217 * <--- clogid -->
2184 2218 * <------>
2185 2219 * pkgcoreid
2186 2220 *
2187 2221 * Where the number of bits necessary to
2188 2222 * represent MC and HT fields together equals
2189 2223 * to the minimum number of bits necessary to
2190 2224 * store the value of cpi->cpi_ncpu_per_chip.
2191 2225 * Of those bits, the MC part uses the number
2192 2226 * of bits necessary to store the value of
2193 2227 * cpi->cpi_ncore_per_chip.
2194 2228 */
2195 2229 for (i = 1; i < ncpu_per_core; i <<= 1)
2196 2230 coreid_shift++;
2197 2231 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2198 2232 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2199 2233 } else if (is_x86_feature(feature, X86FSET_HTT)) {
2200 2234 /*
2201 2235 * Single-core multi-threaded processors.
2202 2236 */
2203 2237 cpi->cpi_coreid = cpi->cpi_chipid;
2204 2238 cpi->cpi_pkgcoreid = 0;
2205 2239 } else {
2206 2240 /*
2207 2241 * Single-core single-thread processors.
2208 2242 */
2209 2243 cpi->cpi_coreid = cpu->cpu_id;
2210 2244 cpi->cpi_pkgcoreid = 0;
2211 2245 }
2212 2246 cpi->cpi_procnodeid = cpi->cpi_chipid;
2213 2247 cpi->cpi_compunitid = cpi->cpi_coreid;
2214 2248 }
2215 2249
2216 2250 /*
2217 2251 * Historically, AMD has had CMP chips with only a single thread per core.
2218 2252 * However, starting in family 17h (Zen), this has changed and they now have
2219 2253 * multiple threads. Our internal core id needs to be a unique value.
2220 2254 *
2221 2255 * To determine the core id of an AMD system, if we're from a family before 17h,
2222 2256 * then we just use the cpu id, as that gives us a good value that will be
2223 2257 * unique for each core. If instead, we're on family 17h or later, then we need
2224 2258 * to do something more complicated. CPUID leaf 0x8000001e can tell us
2225 2259 * how many threads are in the system. Based on that, we'll shift the APIC ID.
2226 2260 * We can't use the normal core id in that leaf as it's only unique within the
2227 2261 * socket, which is perfect for cpi_pkgcoreid, but not for us.
2228 2262 */
2229 2263 static id_t
2230 2264 cpuid_amd_get_coreid(cpu_t *cpu)
2231 2265 {
2232 2266 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2233 2267
2234 2268 if (cpi->cpi_family >= 0x17 &&
2235 2269 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2236 2270 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2237 2271 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2238 2272 if (nthreads > 1) {
2239 2273 VERIFY3U(nthreads, ==, 2);
2240 2274 return (cpi->cpi_apicid >> 1);
2241 2275 }
2242 2276 }
2243 2277
2244 2278 return (cpu->cpu_id);
2245 2279 }
2246 2280
2247 2281 /*
2248 2282 * Determining IDs on AMD is a more challenging task. This is notable because of the
2249 2283 * following two facts:
2250 2284 *
2251 2285 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
2252 2286 * also no way to get an actual unique core id from the system. As such, we
2253 2287 * synthesize this case by using cpu->cpu_id. This scheme does not,
2254 2288 * however, guarantee that sibling cores of a chip will have sequential
2255 2289 * coreids starting at a multiple of the number of cores per chip - that is
2256 2290 * usually the case, but if the ACPI MADT table is presented in a different
2257 2291 * order then we need to perform a few more gymnastics for the pkgcoreid.
2258 2292 *
2259 2293 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2260 2294 * called compute units. These compute units share the L1I cache, L2 cache,
2261 2295 * and the FPU. To deal with this, a new topology leaf was added in
2262 2296 * 0x8000001e. However, parts of this leaf have different meanings
2263 2297 * once we get to family 0x17.
2264 2298 */
2265 2299
2266 2300 static void
2267 2301 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2268 2302 {
2269 2303 int i, first_half, coreidsz;
2270 2304 uint32_t nb_caps_reg;
2271 2305 uint_t node2_1;
2272 2306 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2273 2307 struct cpuid_regs *cp;
2274 2308
2275 2309 /*
2276 2310 * Calculate the core id (this comes from hardware in family 0x17 if it
2277 2311 * hasn't been stripped by virtualization). We always set the compute
2278 2312 * unit id to the same value. Also, initialize the default number of
2279 2313 * cores per compute unit and nodes per package. This will be
2280 2314 * overwritten when we know information about a particular family.
2281 2315 */
2282 2316 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2283 2317 cpi->cpi_compunitid = cpi->cpi_coreid;
2284 2318 cpi->cpi_cores_per_compunit = 1;
2285 2319 cpi->cpi_procnodes_per_pkg = 1;
2286 2320
2287 2321 /*
2288 2322 * To construct the logical ID, we need to determine how many APIC IDs
2289 2323 * are dedicated to the cores and threads. This is provided for us in
2290 2324 * 0x80000008. However, if it's not present (say due to virtualization),
2291 2325 * then we assume it's one. This should be present on all 64-bit AMD
2292 2326 * processors. It was added in family 0xf (Hammer).
2293 2327 */
2294 2328 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2295 2329 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2296 2330
2297 2331 /*
2298 2332 * In AMD parlance chip is really a node while illumos
2299 2333 * uses chip as equivalent to socket/package.
2300 2334 */
2301 2335 if (coreidsz == 0) {
2302 2336 /* Use legacy method */
2303 2337 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2304 2338 coreidsz++;
2305 2339 if (coreidsz == 0)
2306 2340 coreidsz = 1;
2307 2341 }
2308 2342 } else {
2309 2343 /* Assume single-core part */
2310 2344 coreidsz = 1;
2311 2345 }
2312 2346 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2313 2347
2314 2348 /*
2315 2349 * The package core ID varies depending on the family. While it may be
2316 2350 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2317 2351 * this value is the core id in the given node. For non-virtualized
2318 2352 * family 17h, we need to take the logical core id and shift off the
2319 2353 * threads like we do when getting the core id. Otherwise, we can use
2320 2354 * the clogid as is. When family 17h is virtualized, the clogid should
2321 2355 * be sufficient as if we don't have valid data in the leaf, then we
2322 2356 * won't think we have SMT, in which case the cpi_clogid should be
2323 2357 * sufficient.
2324 2358 */
2325 2359 if (cpi->cpi_family >= 0x17 &&
2326 2360 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2327 2361 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2328 2362 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2329 2363 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2330 2364 if (nthreads > 1) {
2331 2365 VERIFY3U(nthreads, ==, 2);
2332 2366 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2333 2367 } else {
2334 2368 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2335 2369 }
2336 2370 } else {
2337 2371 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2338 2372 }
2339 2373
2340 2374 /*
2341 2375 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2342 2376 * (bulldozer) or newer, then we can derive all of this from leaf
2343 2377 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2344 2378 */
2345 2379 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2346 2380 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2347 2381 cp = &cpi->cpi_extd[0x1e];
2348 2382
2349 2383 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2350 2384 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2351 2385
2352 2386 /*
2353 2387 * For Bulldozer-era CPUs, recalculate the compute unit
2354 2388 * information.
2355 2389 */
2356 2390 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2357 2391 cpi->cpi_cores_per_compunit =
2358 2392 BITX(cp->cp_ebx, 15, 8) + 1;
2359 2393 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2360 2394 (cpi->cpi_ncore_per_chip /
2361 2395 cpi->cpi_cores_per_compunit) *
2362 2396 (cpi->cpi_procnodeid /
2363 2397 cpi->cpi_procnodes_per_pkg);
2364 2398 }
2365 2399 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2366 2400 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2367 2401 } else if (cpi->cpi_family == 0x10) {
2368 2402 /*
2369 2403 * See if we are a multi-node processor.
2370 2404 * All processors in the system have the same number of nodes
2371 2405 */
2372 2406 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2373 2407 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2374 2408 /* Single-node */
2375 2409 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2376 2410 coreidsz);
2377 2411 } else {
2378 2412
2379 2413 /*
2380 2414 * Multi-node revision D (2 nodes per package
2381 2415 * are supported)
2382 2416 */
2383 2417 cpi->cpi_procnodes_per_pkg = 2;
2384 2418
2385 2419 first_half = (cpi->cpi_pkgcoreid <=
2386 2420 (cpi->cpi_ncore_per_chip/2 - 1));
2387 2421
2388 2422 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2389 2423 /* We are BSP */
2390 2424 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2391 2425 } else {
2392 2426
2393 2427 /* We are AP */
2394 2428 /* NodeId[2:1] bits to use for reading F3xe8 */
2395 2429 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2396 2430
2397 2431 nb_caps_reg =
2398 2432 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2399 2433
2400 2434 /*
2401 2435 * Check IntNodeNum bit (31:30, but bit 31 is
2402 2436 * always 0 on dual-node processors)
2403 2437 */
2404 2438 if (BITX(nb_caps_reg, 30, 30) == 0)
2405 2439 cpi->cpi_procnodeid = node2_1 +
2406 2440 !first_half;
2407 2441 else
2408 2442 cpi->cpi_procnodeid = node2_1 +
2409 2443 first_half;
2410 2444 }
2411 2445 }
2412 2446 } else {
2413 2447 cpi->cpi_procnodeid = 0;
2414 2448 }
2415 2449
2416 2450 cpi->cpi_chipid =
2417 2451 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2418 2452
2419 2453 cpi->cpi_ncore_bits = coreidsz;
2420 2454 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2421 2455 cpi->cpi_ncore_per_chip);
2422 2456 }
2423 2457
2424 2458 static void
2425 2459 spec_uarch_flush_noop(void)
2426 2460 {
2427 2461 }
2428 2462
2429 2463 /*
2430 2464 * When microcode is present that mitigates MDS, this wrmsr will also flush the
2431 2465 * MDS-related micro-architectural state that would normally happen by calling
2432 2466 * x86_md_clear().
2433 2467 */
2434 2468 static void
2435 2469 spec_uarch_flush_msr(void)
2436 2470 {
2437 2471 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2438 2472 }
2439 2473
2440 2474 /*
2441 2475 * This function points to a function that will flush certain
2442 2476 * micro-architectural state on the processor. This flush is used to mitigate
2443 2477 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2444 2478 * function can point to one of three functions:
2445 2479 *
2446 2480 * - A noop, used either because we are vulnerable but do not have the
2447 2481 * microcode available to help deal with the issue, or because we are
2448 2482 * not vulnerable at all.
2449 2483 *
2450 2484 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2451 2485 * mitigate MDS is present, also perform the equivalent of the MDS flush;
2452 2486 * however, it only flushes the MDS related micro-architectural state on the
2453 2487 * current hyperthread; it does not do anything for the twin.
2454 2488 *
2455 2489 * - x86_md_clear which will flush the MDS related state. This is done when we
2456 2490 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2457 2491 * (RDCL_NO is set).
2458 2492 */
2459 2493 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
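/*
 * Illustrative usage (not an actual call site): code on privilege
 * transition or idle paths simply invokes the pointer,
 *
 *	spec_uarch_flush();
 *
 * and thereby picks up whichever of the three implementations
 * cpuid_update_l1d_flush() selected for this machine.
 */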
2460 2494
2461 2495 static void
2462 2496 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2463 2497 {
2464 2498 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2465 2499
2466 2500 /*
2467 2501 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2468 2502 * has been fixed in hardware, it doesn't cover everything related to
2469 2503 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2470 2504 * need to mitigate this.
2471 2505 */
2472 2506 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2473 2507 is_x86_feature(featureset, X86FSET_MDS_NO)) {
2474 2508 return;
2475 2509 }
2476 2510
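/*
 * x86_md_clear is expected to begin with a ret so that it is a no-op by
 * default; overwriting that first byte with a nop lets execution fall
 * through into the verw-based flush that follows.
 */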
2477 2511 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2478 2512 const uint8_t nop = NOP_INSTR;
2479 2513 uint8_t *md = (uint8_t *)x86_md_clear;
2480 2514
2481 2515 *md = nop;
2482 2516 }
2483 2517
2484 2518 membar_producer();
2485 2519 }
2486 2520
2487 2521 static void
2488 2522 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2489 2523 {
2490 2524 boolean_t need_l1d, need_mds;
2491 2525 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2492 2526
2493 2527 /*
2494 2528 * If we're not on Intel or we've mitigated both RDCL and MDS in
2495 2529 * hardware, then there's nothing left for us to do for enabling the
2496 2530 * flush. We can also go ahead and say that SMT exclusion is
2497 2531 * unnecessary.
2498 2532 */
2499 2533 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2500 2534 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2501 2535 is_x86_feature(featureset, X86FSET_MDS_NO))) {
2502 2536 extern int smt_exclusion;
2503 2537 smt_exclusion = 0;
2504 2538 spec_uarch_flush = spec_uarch_flush_noop;
2505 2539 membar_producer();
2506 2540 return;
2507 2541 }
2508 2542
2509 2543 /*
2510 2544 * The locations where we need to perform an L1D flush are required for
2511 2545 * mitigating both L1TF and MDS. When verw support is present in
2512 2546 * microcode, then the L1D flush will take care of doing that as well.
2513 2547 * However, if we have a system where RDCL_NO is present, but we don't
2514 2548 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2515 2549 * L1D flush.
2516 2550 */
2517 2551 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2518 2552 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2519 2553 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2520 2554 need_l1d = B_TRUE;
2521 2555 } else {
2522 2556 need_l1d = B_FALSE;
2523 2557 }
2524 2558
2525 2559 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2526 2560 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2527 2561 need_mds = B_TRUE;
2528 2562 } else {
2529 2563 need_mds = B_FALSE;
2530 2564 }
2531 2565
2532 2566 if (need_l1d) {
2533 2567 spec_uarch_flush = spec_uarch_flush_msr;
2534 2568 } else if (need_mds) {
2535 2569 spec_uarch_flush = x86_md_clear;
2536 2570 } else {
2537 2571 /*
2538 2572 * We have no hardware mitigations available to us.
2539 2573 */
2540 2574 spec_uarch_flush = spec_uarch_flush_noop;
2541 2575 }
2542 2576 membar_producer();
2543 2577 }
2544 2578
2545 2579 /*
2546 2580 * We default to enabling RSB mitigations.
2547 2581 */
2548 2582 static void
2549 2583 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2550 2584 {
2551 2585 const uint8_t ret = RET_INSTR;
2552 2586 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2553 2587
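/*
 * Conversely to x86_md_clear above, x86_rsb_stuff is expected to begin
 * with its stuffing sequence; writing a ret over its first byte turns
 * it into an immediate return, disabling RSB stuffing when enhanced
 * IBRS is in use or mitigations are disabled entirely.
 */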
2554 2588 switch (mit) {
2555 2589 case X86_SPECTREV2_ENHANCED_IBRS:
2556 2590 case X86_SPECTREV2_DISABLED:
2557 2591 *stuff = ret;
2558 2592 break;
2559 2593 default:
2560 2594 break;
2561 2595 }
2562 2596 }
2563 2597
2564 2598 static void
2565 2599 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2566 2600 {
2567 2601 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2568 2602 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2569 2603 "_r14", "_r15" };
2570 2604 const uint_t nthunks = ARRAY_SIZE(thunks);
2571 2605 const char *type;
2572 2606 uint_t i;
2573 2607
2574 2608 if (mit == x86_spectrev2_mitigation)
2575 2609 return;
2576 2610
2577 2611 switch (mit) {
2578 2612 case X86_SPECTREV2_RETPOLINE:
2579 2613 type = "gen";
2580 2614 break;
2581 2615 case X86_SPECTREV2_RETPOLINE_AMD:
2582 2616 type = "amd";
2583 2617 break;
2584 2618 case X86_SPECTREV2_ENHANCED_IBRS:
2585 2619 case X86_SPECTREV2_DISABLED:
2586 2620 type = "jmp";
2587 2621 break;
2588 2622 default:
2589 2623 panic("asked to updated retpoline state with unknown state!");
2590 2624 }
2591 2625
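/*
 * For example, with type "amd" and the "_rax" thunk, the bytes of
 * __x86_indirect_thunk_amd_rax are copied over __x86_indirect_thunk_rax,
 * so indirect branches through the generic thunk name pick up the
 * lfence-based AMD variant.
 */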
2592 2626 for (i = 0; i < nthunks; i++) {
2593 2627 uintptr_t source, dest;
2594 2628 int ssize, dsize;
2595 2629 char sourcebuf[64], destbuf[64];
2596 2630 size_t len;
2597 2631
2598 2632 (void) snprintf(destbuf, sizeof (destbuf),
2599 2633 "__x86_indirect_thunk%s", thunks[i]);
2600 2634 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2601 2635 "__x86_indirect_thunk_%s%s", type, thunks[i]);
2602 2636
2603 2637 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2604 2638 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2605 2639 VERIFY3U(source, !=, 0);
2606 2640 VERIFY3U(dest, !=, 0);
2607 2641 VERIFY3S(dsize, >=, ssize);
2608 2642 bcopy((void *)source, (void *)dest, ssize);
2609 2643 }
2610 2644 }
2611 2645
2612 2646 static void
2613 2647 cpuid_enable_enhanced_ibrs(void)
2614 2648 {
2615 2649 uint64_t val;
2616 2650
2617 2651 val = rdmsr(MSR_IA32_SPEC_CTRL);
2618 2652 val |= IA32_SPEC_CTRL_IBRS;
2619 2653 wrmsr(MSR_IA32_SPEC_CTRL, val);
2620 2654 }
2621 2655
2622 2656 #ifndef __xpv
2623 2657 /*
2624 2658 * Determine whether or not we can use the AMD optimized retpoline
2625 2659 * functionality. We use this when we know we're on an AMD system and we can
2626 2660 * successfully verify that lfence is dispatch serializing.
2627 2661 */
2628 2662 static boolean_t
2629 2663 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2630 2664 {
2631 2665 uint64_t val;
2632 2666 on_trap_data_t otd;
2633 2667
2634 2668 if (cpi->cpi_vendor != X86_VENDOR_AMD)
2635 2669 return (B_FALSE);
2636 2670
2637 2671 /*
2638 2672 * We need to determine whether or not lfence is serializing. It always
2639 2673 * is on families 0xf and 0x11. On others, it's controlled by
2640 2674 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2641 2675 * crazy old family, don't try and do anything.
2642 2676 */
2643 2677 if (cpi->cpi_family < 0xf)
2644 2678 return (B_FALSE);
2645 2679 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2646 2680 return (B_TRUE);
2647 2681
2648 2682 /*
2649 2683 * While it may be tempting to use get_hwenv(), there are no promises
2650 2684 * that a hypervisor will actually declare itself as such in a
2651 2685 * friendly way. As such, try to read and set the MSR. If we can then
2652 2686 * read back the value we set (it wasn't just set to zero), then we go
2653 2687 * for it.
2654 2688 */
2655 2689 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2656 2690 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2657 2691 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2658 2692 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2659 2693 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2660 2694 } else {
2661 2695 val = 0;
2662 2696 }
2663 2697 no_trap();
2664 2698
2665 2699 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2666 2700 return (B_TRUE);
2667 2701 return (B_FALSE);
2668 2702 }
2669 2703 #endif /* !__xpv */
2670 2704
2671 2705 static void
2672 2706 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2673 2707 {
2674 2708 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2675 2709 x86_spectrev2_mitigation_t v2mit;
2676 2710
2677 2711 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2678 2712 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2679 2713 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2680 2714 add_x86_feature(featureset, X86FSET_IBPB);
2681 2715 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2682 2716 add_x86_feature(featureset, X86FSET_IBRS);
2683 2717 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2684 2718 add_x86_feature(featureset, X86FSET_STIBP);
2685 2719 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2686 2720 add_x86_feature(featureset, X86FSET_STIBP_ALL);
2687 2721 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2688 2722 add_x86_feature(featureset, X86FSET_SSBD);
2689 2723 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2690 2724 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2691 2725 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2692 2726 add_x86_feature(featureset, X86FSET_SSB_NO);
2693 2727 /*
2694 2728 * Don't enable enhanced IBRS unless we're told that we should
2695 2729 * prefer it and it has the same semantics as Intel. This is
2696 2730 * split into two bits rather than a single one.
2697 2731 */
2698 2732 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2699 2733 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2700 2734 add_x86_feature(featureset, X86FSET_IBRS_ALL);
2701 2735 }
2702 2736
2703 2737 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2704 2738 cpi->cpi_maxeax >= 7) {
2705 2739 struct cpuid_regs *ecp;
2706 2740 ecp = &cpi->cpi_std[7];
2707 2741
2708 2742 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2709 2743 add_x86_feature(featureset, X86FSET_MD_CLEAR);
2710 2744 }
2711 2745
2712 2746 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2713 2747 add_x86_feature(featureset, X86FSET_IBRS);
2714 2748 add_x86_feature(featureset, X86FSET_IBPB);
2715 2749 }
2716 2750
2717 2751 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2718 2752 add_x86_feature(featureset, X86FSET_STIBP);
2719 2753 }
2720 2754
2721 2755 /*
2722 2756 * Don't read the arch caps MSR on xpv where we lack the
2723 2757 * on_trap().
2724 2758 */
2725 2759 #ifndef __xpv
2726 2760 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2727 2761 on_trap_data_t otd;
2728 2762
2729 2763 /*
2730 2764 * Be paranoid and assume we'll get a #GP.
2731 2765 */
2732 2766 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2733 2767 uint64_t reg;
2734 2768
2735 2769 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2736 2770 if (reg & IA32_ARCH_CAP_RDCL_NO) {
2737 2771 add_x86_feature(featureset,
2738 2772 X86FSET_RDCL_NO);
2739 2773 }
2740 2774 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2741 2775 add_x86_feature(featureset,
2742 2776 X86FSET_IBRS_ALL);
2743 2777 }
2744 2778 if (reg & IA32_ARCH_CAP_RSBA) {
2745 2779 add_x86_feature(featureset,
2746 2780 X86FSET_RSBA);
2747 2781 }
2748 2782 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2749 2783 add_x86_feature(featureset,
2750 2784 X86FSET_L1D_VM_NO);
2751 2785 }
2752 2786 if (reg & IA32_ARCH_CAP_SSB_NO) {
2753 2787 add_x86_feature(featureset,
2754 2788 X86FSET_SSB_NO);
2755 2789 }
2756 2790 if (reg & IA32_ARCH_CAP_MDS_NO) {
2757 2791 add_x86_feature(featureset,
2758 2792 X86FSET_MDS_NO);
2759 2793 }
2760 2794 }
2761 2795 no_trap();
2762 2796 }
2763 2797 #endif /* !__xpv */
2764 2798
2765 2799 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2766 2800 add_x86_feature(featureset, X86FSET_SSBD);
2767 2801
2768 2802 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2769 2803 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2770 2804 }
2771 2805
2772 2806 if (cpu->cpu_id != 0) {
2773 2807 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2774 2808 cpuid_enable_enhanced_ibrs();
2775 2809 }
2776 2810 return;
2777 2811 }
2778 2812
2779 2813 /*
2780 2814 * Go through and initialize various security mechanisms that we should
2781 2815 * only set up on a single CPU. This includes Spectre V2, L1TF, and MDS.
2782 2816 */
2783 2817
2784 2818 /*
2785 2819 * By default we've come in with retpolines enabled. Check whether we
2786 2820 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2787 2821 * by default, but disabled if we are using enhanced IBRS.
2788 2822 */
2789 2823 if (x86_disable_spectrev2 != 0) {
2790 2824 v2mit = X86_SPECTREV2_DISABLED;
2791 2825 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2792 2826 cpuid_enable_enhanced_ibrs();
2793 2827 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2794 2828 #ifndef __xpv
2795 2829 } else if (cpuid_use_amd_retpoline(cpi)) {
2796 2830 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2797 2831 #endif /* !__xpv */
2798 2832 } else {
2799 2833 v2mit = X86_SPECTREV2_RETPOLINE;
2800 2834 }
2801 2835
2802 2836 cpuid_patch_retpolines(v2mit);
2803 2837 cpuid_patch_rsb(v2mit);
2804 2838 x86_spectrev2_mitigation = v2mit;
2805 2839 membar_producer();
2806 2840
2807 2841 /*
2808 2842 * We need to determine what changes are required for mitigating L1TF
2809 2843 * and MDS. If the CPU suffers from either of them, then SMT exclusion
2810 2844 * is required.
2811 2845 *
2812 2846 * If any of these are present, then we need to flush u-arch state at
2813 2847 * various points. For MDS, we need to do so whenever we change to a
2814 2848 * lesser privilege level or we are halting the CPU. For L1TF we need to
2815 2849 * flush the L1D cache at VM entry. When we have microcode that handles
2816 2850 * MDS, the L1D flush also clears the other u-arch state that the
2817 2851 * md_clear does.
2818 2852 */
2819 2853
2820 2854 /*
2821 2855 * Update whether or not we need to be taking explicit action against
2822 2856 * MDS.
2823 2857 */
2824 2858 cpuid_update_md_clear(cpu, featureset);
2825 2859
2826 2860 /*
2827 2861 * Determine whether SMT exclusion is required and whether or not we
2828 2862 * need to perform an l1d flush.
2829 2863 */
2830 2864 cpuid_update_l1d_flush(cpu, featureset);
2831 2865 }
2832 2866
2833 2867 /*
2834 2868 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
2835 2869 */
2836 2870 void
2837 2871 setup_xfem(void)
2838 2872 {
2839 2873 uint64_t flags = XFEATURE_LEGACY_FP;
2840 2874
2841 2875 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2842 2876
2843 2877 if (is_x86_feature(x86_featureset, X86FSET_SSE))
2844 2878 flags |= XFEATURE_SSE;
2845 2879
2846 2880 if (is_x86_feature(x86_featureset, X86FSET_AVX))
2847 2881 flags |= XFEATURE_AVX;
2848 2882
2849 2883 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2850 2884 flags |= XFEATURE_AVX512;
2851 2885
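/*
 * For example, a CPU with SSE, AVX, and AVX512F ends up with
 * flags == 0xe7: x87 (bit 0), SSE (bit 1), AVX (bit 2), and the three
 * AVX-512 state components (bits 5-7), per the architectural XCR0
 * layout.
 */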
2852 2886 set_xcr(XFEATURE_ENABLED_MASK, flags);
2853 2887
2854 2888 xsave_bv_all = flags;
2855 2889 }
2856 2890
2857 2891 static void
2858 2892 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2859 2893 {
2860 2894 struct cpuid_info *cpi;
2861 2895
2862 2896 cpi = cpu->cpu_m.mcpu_cpi;
2863 2897
2864 2898 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2865 2899 cpuid_gather_amd_topology_leaves(cpu);
2866 2900 }
2867 2901
2868 2902 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2869 2903
2870 2904 /*
2871 2905 * Before we can calculate the IDs that we should assign to this
2872 2906 * processor, we need to understand how many cores and threads it has.
2873 2907 */
2874 2908 switch (cpi->cpi_vendor) {
2875 2909 case X86_VENDOR_Intel:
2876 2910 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2877 2911 &cpi->cpi_ncore_per_chip);
2878 2912 break;
2879 2913 case X86_VENDOR_AMD:
2880 2914 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2881 2915 &cpi->cpi_ncore_per_chip);
2882 2916 break;
2883 2917 default:
2884 2918 /*
2885 2919 * If we have some other x86 compatible chip, it's not clear how
2886 2920 * it would behave. The most common case is virtualization
2887 2921 * today, though there are also 64-bit VIA chips. Assume that
2888 2922 * all we can get is the basic Leaf 1 HTT information.
2889 2923 */
2890 2924 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2891 2925 cpi->cpi_ncore_per_chip = 1;
2892 2926 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2893 2927 }
2894 2928 break;
2895 2929 }
2896 2930
2897 2931 /*
2898 2932 * Based on the calculated number of threads and cores, potentially
2899 2933 * assign the HTT and CMT features.
2900 2934 */
2901 2935 if (cpi->cpi_ncore_per_chip > 1) {
2902 2936 add_x86_feature(featureset, X86FSET_CMP);
2903 2937 }
2904 2938
2905 2939 if (cpi->cpi_ncpu_per_chip > 1 &&
2906 2940 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2907 2941 add_x86_feature(featureset, X86FSET_HTT);
2908 2942 }
2909 2943
2910 2944 /*
2911 2945 * Now that this has been set up, we need to go through and calculate all of
2912 2946 * the rest of the parameters that exist. If we think the CPU doesn't
2913 2947 * have either SMT (HTT) or CMP, then we basically go through and fake
2914 2948 * up information in some way. The most likely case for this is
2915 2949 * virtualization where we have a lot of partial topology information.
2916 2950 */
2917 2951 if (!is_x86_feature(featureset, X86FSET_HTT) &&
2918 2952 !is_x86_feature(featureset, X86FSET_CMP)) {
2919 2953 /*
2920 2954 * This is a single core, single-threaded processor.
2921 2955 */
2922 2956 cpi->cpi_procnodes_per_pkg = 1;
2923 2957 cpi->cpi_cores_per_compunit = 1;
2924 2958 cpi->cpi_compunitid = 0;
2925 2959 cpi->cpi_chipid = -1;
2926 2960 cpi->cpi_clogid = 0;
2927 2961 cpi->cpi_coreid = cpu->cpu_id;
2928 2962 cpi->cpi_pkgcoreid = 0;
2929 2963 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2930 2964 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2931 2965 } else {
2932 2966 cpi->cpi_procnodeid = cpi->cpi_chipid;
2933 2967 }
2934 2968 } else {
2935 2969 switch (cpi->cpi_vendor) {
2936 2970 case X86_VENDOR_Intel:
2937 2971 cpuid_intel_getids(cpu, featureset);
2938 2972 break;
2939 2973 case X86_VENDOR_AMD:
2940 2974 cpuid_amd_getids(cpu, featureset);
2941 2975 break;
2942 2976 default:
2943 2977 /*
2944 2978 * In this case, it's hard to say what we should do.
2945 2979 * We're going to model them to the OS as single core
2946 2980 * threads. We don't have a good identifier for them, so
2947 2981 * we're just going to use the cpu id all on a single
2948 2982 * chip.
2949 2983 *
2950 2984 * This case has historically been different from the
2951 2985 * case above where we don't have HTT or CMP. While they
2952 2986 * could be combined, we've opted to keep it separate to
2953 2987 * minimize the risk of topology changes in weird cases.
2954 2988 */
2955 2989 cpi->cpi_procnodes_per_pkg = 1;
2956 2990 cpi->cpi_cores_per_compunit = 1;
2957 2991 cpi->cpi_chipid = 0;
2958 2992 cpi->cpi_coreid = cpu->cpu_id;
2959 2993 cpi->cpi_clogid = cpu->cpu_id;
2960 2994 cpi->cpi_pkgcoreid = cpu->cpu_id;
2961 2995 cpi->cpi_procnodeid = cpi->cpi_chipid;
2962 2996 cpi->cpi_compunitid = cpi->cpi_coreid;
2963 2997 break;
2964 2998 }
2965 2999 }
2966 3000 }
2967 3001
2968 3002 /*
2969 3003 * Gather relevant CPU features from leaf 6 which covers thermal information. We
2970 3004 * always gather leaf 6 if it's supported; however, we only look for features on
2971 3005 * Intel systems as AMD does not currently define any of the features we look
2972 3006 * for below.
2973 3007 */
2974 3008 static void
2975 3009 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
2976 3010 {
2977 3011 struct cpuid_regs *cp;
2978 3012 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2979 3013
2980 3014 if (cpi->cpi_maxeax < 6) {
2981 3015 return;
2982 3016 }
2983 3017
2984 3018 cp = &cpi->cpi_std[6];
2985 3019 cp->cp_eax = 6;
2986 3020 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
2987 3021 (void) __cpuid_insn(cp);
2988 3022 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
2989 3023
2990 3024 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2991 3025 return;
2992 3026 }
2993 3027
2994 3028 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
2995 3029 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
2996 3030 }
2997 3031
2998 3032 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
2999 3033 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3000 3034 }
3001 3035 }
3002 3036
3003 3037 void
3004 3038 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3005 3039 {
3006 3040 uint32_t mask_ecx, mask_edx;
3007 3041 struct cpuid_info *cpi;
3008 3042 struct cpuid_regs *cp;
3009 3043 int xcpuid;
3010 3044 #if !defined(__xpv)
3011 3045 extern int idle_cpu_prefer_mwait;
3012 3046 #endif
3013 3047
3014 3048 /*
3015 3049 * Space statically allocated for BSP, ensure pointer is set
3016 3050 */
3017 3051 if (cpu->cpu_id == 0) {
3018 3052 if (cpu->cpu_m.mcpu_cpi == NULL)
3019 3053 cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3020 3054 }
3021 3055
3022 3056 add_x86_feature(featureset, X86FSET_CPUID);
3023 3057
3024 3058 cpi = cpu->cpu_m.mcpu_cpi;
3025 3059 ASSERT(cpi != NULL);
3026 3060 cp = &cpi->cpi_std[0];
3027 3061 cp->cp_eax = 0;
3028 3062 cpi->cpi_maxeax = __cpuid_insn(cp);
3029 3063 {
3030 3064 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3031 3065 *iptr++ = cp->cp_ebx;
3032 3066 *iptr++ = cp->cp_edx;
3033 3067 *iptr++ = cp->cp_ecx;
3034 3068 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3035 3069 }
3036 3070
3037 3071 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3038 3072 x86_vendor = cpi->cpi_vendor; /* for compatibility */
3039 3073
3040 3074 /*
3041 3075 * Limit the range in case of weird hardware
3042 3076 */
3043 3077 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3044 3078 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3045 3079 if (cpi->cpi_maxeax < 1)
3046 3080 goto pass1_done;
3047 3081
3048 3082 cp = &cpi->cpi_std[1];
3049 3083 cp->cp_eax = 1;
3050 3084 (void) __cpuid_insn(cp);
3051 3085
3052 3086 /*
3053 3087 * Extract identifying constants for easy access.
3054 3088 */
3055 3089 cpi->cpi_model = CPI_MODEL(cpi);
3056 3090 cpi->cpi_family = CPI_FAMILY(cpi);
3057 3091
3058 3092 if (cpi->cpi_family == 0xf)
3059 3093 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3060 3094
3061 3095 /*
3062 3096 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3063 3097 * Intel, and presumably everyone else, uses model == 0xf, as
3064 3098 * one would expect (max value means possible overflow). Sigh.
3065 3099 */
3066 3100
3067 3101 switch (cpi->cpi_vendor) {
3068 3102 case X86_VENDOR_Intel:
3069 3103 if (IS_EXTENDED_MODEL_INTEL(cpi))
3070 3104 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3071 3105 break;
3072 3106 case X86_VENDOR_AMD:
3073 3107 if (CPI_FAMILY(cpi) == 0xf)
3074 3108 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3075 3109 break;
3076 3110 default:
3077 3111 if (cpi->cpi_model == 0xf)
3078 3112 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3079 3113 break;
3080 3114 }
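/*
 * For example, an Intel part with base family 0x6, base model 0xe, and
 * extended model 0x5 ends up with cpi_model = 0xe + (0x5 << 4) = 0x5e.
 */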
3081 3115
3082 3116 cpi->cpi_step = CPI_STEP(cpi);
3083 3117 cpi->cpi_brandid = CPI_BRANDID(cpi);
3084 3118
3085 3119 /*
3086 3120 * *default* assumptions:
3087 3121 * - believe %edx feature word
3088 3122 * - ignore %ecx feature word
3089 3123 * - 32-bit virtual and physical addressing
3090 3124 */
3091 3125 mask_edx = 0xffffffff;
3092 3126 mask_ecx = 0;
3093 3127
3094 3128 cpi->cpi_pabits = cpi->cpi_vabits = 32;
3095 3129
3096 3130 switch (cpi->cpi_vendor) {
3097 3131 case X86_VENDOR_Intel:
3098 3132 if (cpi->cpi_family == 5)
3099 3133 x86_type = X86_TYPE_P5;
3100 3134 else if (IS_LEGACY_P6(cpi)) {
3101 3135 x86_type = X86_TYPE_P6;
3102 3136 pentiumpro_bug4046376 = 1;
3103 3137 /*
3104 3138 * Clear the SEP bit when it was set erroneously
3105 3139 */
3106 3140 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3107 3141 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3108 3142 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3109 3143 x86_type = X86_TYPE_P4;
3110 3144 /*
3111 3145 * We don't currently depend on any of the %ecx
3112 3146 * features until Prescott, so we'll only check
3113 3147 * this from P4 onwards. We might want to revisit
3114 3148 * that idea later.
3115 3149 */
3116 3150 mask_ecx = 0xffffffff;
3117 3151 } else if (cpi->cpi_family > 0xf)
3118 3152 mask_ecx = 0xffffffff;
3119 3153 /*
3120 3154 * We don't support MONITOR/MWAIT if leaf 5 is not available
3121 3155 * to obtain the monitor linesize.
3122 3156 */
3123 3157 if (cpi->cpi_maxeax < 5)
3124 3158 mask_ecx &= ~CPUID_INTC_ECX_MON;
3125 3159 break;
3126 3160 case X86_VENDOR_IntelClone:
3127 3161 default:
3128 3162 break;
3129 3163 case X86_VENDOR_AMD:
3130 3164 #if defined(OPTERON_ERRATUM_108)
3131 3165 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3132 3166 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3133 3167 cpi->cpi_model = 0xc;
3134 3168 } else
3135 3169 #endif
3136 3170 if (cpi->cpi_family == 5) {
3137 3171 /*
3138 3172 * AMD K5 and K6
3139 3173 *
3140 3174 * These CPUs have an incomplete implementation
3141 3175 * of MCA/MCE which we mask away.
3142 3176 */
3143 3177 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3144 3178
3145 3179 /*
3146 3180 * Model 0 uses the wrong (APIC) bit
3147 3181 * to indicate PGE. Fix it here.
3148 3182 */
3149 3183 if (cpi->cpi_model == 0) {
3150 3184 if (cp->cp_edx & 0x200) {
3151 3185 cp->cp_edx &= ~0x200;
3152 3186 cp->cp_edx |= CPUID_INTC_EDX_PGE;
3153 3187 }
3154 3188 }
3155 3189
3156 3190 /*
3157 3191 * Early models had problems w/ MMX; disable.
3158 3192 */
3159 3193 if (cpi->cpi_model < 6)
3160 3194 mask_edx &= ~CPUID_INTC_EDX_MMX;
3161 3195 }
3162 3196
3163 3197 /*
3164 3198 * For newer families, SSE3 and CX16, at least, are valid;
3165 3199 * enable all
3166 3200 */
3167 3201 if (cpi->cpi_family >= 0xf)
3168 3202 mask_ecx = 0xffffffff;
3169 3203 /*
3170 3204 * We don't support MONITOR/MWAIT if leaf 5 is not available
3171 3205 * to obtain the monitor linesize.
3172 3206 */
3173 3207 if (cpi->cpi_maxeax < 5)
3174 3208 mask_ecx &= ~CPUID_INTC_ECX_MON;
3175 3209
3176 3210 #if !defined(__xpv)
3177 3211 /*
3178 3212 * AMD has not historically used MWAIT in the CPU's idle loop.
3179 3213 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3180 3214 * know for certain that in at least family 17h, per AMD, mwait
3181 3215 * is preferred. Families in-between are less certain.
3182 3216 */
3183 3217 if (cpi->cpi_family < 0x17) {
3184 3218 idle_cpu_prefer_mwait = 0;
3185 3219 }
3186 3220 #endif
3187 3221
3188 3222 break;
3189 3223 case X86_VENDOR_TM:
3190 3224 /*
3191 3225 * workaround the NT workaround in CMS 4.1
3192 3226 */
3193 3227 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3194 3228 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3195 3229 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3196 3230 break;
3197 3231 case X86_VENDOR_Centaur:
3198 3232 /*
3199 3233 * workaround the NT workarounds again
3200 3234 */
3201 3235 if (cpi->cpi_family == 6)
3202 3236 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3203 3237 break;
3204 3238 case X86_VENDOR_Cyrix:
3205 3239 /*
3206 3240 * We rely heavily on the probing in locore
3207 3241 * to actually figure out what parts, if any,
3208 3242 * of the Cyrix cpuid instruction to believe.
3209 3243 */
3210 3244 switch (x86_type) {
3211 3245 case X86_TYPE_CYRIX_486:
3212 3246 mask_edx = 0;
3213 3247 break;
3214 3248 case X86_TYPE_CYRIX_6x86:
3215 3249 mask_edx = 0;
3216 3250 break;
3217 3251 case X86_TYPE_CYRIX_6x86L:
3218 3252 mask_edx =
3219 3253 CPUID_INTC_EDX_DE |
3220 3254 CPUID_INTC_EDX_CX8;
3221 3255 break;
3222 3256 case X86_TYPE_CYRIX_6x86MX:
3223 3257 mask_edx =
3224 3258 CPUID_INTC_EDX_DE |
3225 3259 CPUID_INTC_EDX_MSR |
3226 3260 CPUID_INTC_EDX_CX8 |
3227 3261 CPUID_INTC_EDX_PGE |
3228 3262 CPUID_INTC_EDX_CMOV |
3229 3263 CPUID_INTC_EDX_MMX;
3230 3264 break;
3231 3265 case X86_TYPE_CYRIX_GXm:
3232 3266 mask_edx =
3233 3267 CPUID_INTC_EDX_MSR |
3234 3268 CPUID_INTC_EDX_CX8 |
3235 3269 CPUID_INTC_EDX_CMOV |
3236 3270 CPUID_INTC_EDX_MMX;
3237 3271 break;
3238 3272 case X86_TYPE_CYRIX_MediaGX:
3239 3273 break;
3240 3274 case X86_TYPE_CYRIX_MII:
3241 3275 case X86_TYPE_VIA_CYRIX_III:
3242 3276 mask_edx =
3243 3277 CPUID_INTC_EDX_DE |
3244 3278 CPUID_INTC_EDX_TSC |
3245 3279 CPUID_INTC_EDX_MSR |
3246 3280 CPUID_INTC_EDX_CX8 |
3247 3281 CPUID_INTC_EDX_PGE |
3248 3282 CPUID_INTC_EDX_CMOV |
3249 3283 CPUID_INTC_EDX_MMX;
3250 3284 break;
3251 3285 default:
3252 3286 break;
3253 3287 }
3254 3288 break;
3255 3289 }
3256 3290
3257 3291 #if defined(__xpv)
3258 3292 /*
3259 3293 * Do not support MONITOR/MWAIT under a hypervisor
3260 3294 */
3261 3295 mask_ecx &= ~CPUID_INTC_ECX_MON;
3262 3296 /*
3263 3297 * Do not support XSAVE under a hypervisor for now
3264 3298 */
3265 3299 xsave_force_disable = B_TRUE;
3266 3300
3267 3301 #endif /* __xpv */
3268 3302
3269 3303 if (xsave_force_disable) {
3270 3304 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3271 3305 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3272 3306 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3273 3307 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3274 3308 }
3275 3309
3276 3310 /*
3277 3311 * Now we've figured out the masks that determine
3278 3312 * which bits we choose to believe, apply the masks
3279 3313 * to the feature words, then map the kernel's view
3280 3314 * of these feature words into its feature word.
3281 3315 */
3282 3316 cp->cp_edx &= mask_edx;
3283 3317 cp->cp_ecx &= mask_ecx;
3284 3318
3285 3319 /*
3286 3320 * apply any platform restrictions (we don't call this
3287 3321 * immediately after __cpuid_insn here, because we need the
3288 3322 * workarounds applied above first)
3289 3323 */
3290 3324 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3291 3325
3292 3326 /*
3293 3327 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3294 3328 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3295 3329 */
3296 3330 if (cpi->cpi_maxeax >= 7) {
3297 3331 struct cpuid_regs *ecp;
3298 3332 ecp = &cpi->cpi_std[7];
3299 3333 ecp->cp_eax = 7;
3300 3334 ecp->cp_ecx = 0;
3301 3335 (void) __cpuid_insn(ecp);
3302 3336
3303 3337 /*
3304 3338 * If XSAVE has been disabled, just ignore all of the
3305 3339 * extended-save-area dependent flags here.
3306 3340 */
3307 3341 if (xsave_force_disable) {
3308 3342 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3309 3343 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3310 3344 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3311 3345 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3312 3346 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3313 3347 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3314 3348 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3315 3349 }
3316 3350
3317 3351 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3318 3352 add_x86_feature(featureset, X86FSET_SMEP);
3319 3353
3320 3354 /*
3321 3355 * We check disable_smap here in addition to in startup_smap()
3322 3356 * to ensure CPUs that aren't the boot CPU don't accidentally
3323 3357 * include it in the feature set and thus generate a mismatched
3324 3358 * x86 feature set across CPUs.
3325 3359 */
3326 3360 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3327 3361 disable_smap == 0)
3328 3362 add_x86_feature(featureset, X86FSET_SMAP);
3329 3363
3330 3364 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3331 3365 add_x86_feature(featureset, X86FSET_RDSEED);
3332 3366
3333 3367 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3334 3368 add_x86_feature(featureset, X86FSET_ADX);
3335 3369
3336 3370 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3337 3371 add_x86_feature(featureset, X86FSET_FSGSBASE);
3338 3372
3339 3373 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3340 3374 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3341 3375
3342 3376 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3343 3377 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3344 3378 add_x86_feature(featureset, X86FSET_INVPCID);
3345 3379
3346 3380 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3347 3381 add_x86_feature(featureset, X86FSET_MPX);
3348 3382
3349 3383 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3350 3384 add_x86_feature(featureset, X86FSET_CLWB);
3351 3385 }
3352 3386 }
3353 3387
3354 3388 /*
3355 3389 * fold in overrides from the "eeprom" mechanism
3356 3390 */
3357 3391 cp->cp_edx |= cpuid_feature_edx_include;
3358 3392 cp->cp_edx &= ~cpuid_feature_edx_exclude;
3359 3393
3360 3394 cp->cp_ecx |= cpuid_feature_ecx_include;
3361 3395 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3362 3396
3363 3397 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3364 3398 add_x86_feature(featureset, X86FSET_LARGEPAGE);
3365 3399 }
3366 3400 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3367 3401 add_x86_feature(featureset, X86FSET_TSC);
3368 3402 }
3369 3403 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3370 3404 add_x86_feature(featureset, X86FSET_MSR);
3371 3405 }
3372 3406 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3373 3407 add_x86_feature(featureset, X86FSET_MTRR);
3374 3408 }
3375 3409 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3376 3410 add_x86_feature(featureset, X86FSET_PGE);
3377 3411 }
3378 3412 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3379 3413 add_x86_feature(featureset, X86FSET_CMOV);
3380 3414 }
3381 3415 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3382 3416 add_x86_feature(featureset, X86FSET_MMX);
3383 3417 }
3384 3418 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3385 3419 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3386 3420 add_x86_feature(featureset, X86FSET_MCA);
3387 3421 }
3388 3422 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3389 3423 add_x86_feature(featureset, X86FSET_PAE);
3390 3424 }
3391 3425 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3392 3426 add_x86_feature(featureset, X86FSET_CX8);
3393 3427 }
3394 3428 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3395 3429 add_x86_feature(featureset, X86FSET_CX16);
3396 3430 }
3397 3431 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3398 3432 add_x86_feature(featureset, X86FSET_PAT);
3399 3433 }
3400 3434 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3401 3435 add_x86_feature(featureset, X86FSET_SEP);
3402 3436 }
3403 3437 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3404 3438 /*
3405 3439 * In our implementation, fxsave/fxrstor
3406 3440 * are prerequisites before we'll even
3407 3441 * try and do SSE things.
3408 3442 */
3409 3443 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3410 3444 add_x86_feature(featureset, X86FSET_SSE);
3411 3445 }
3412 3446 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3413 3447 add_x86_feature(featureset, X86FSET_SSE2);
3414 3448 }
3415 3449 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3416 3450 add_x86_feature(featureset, X86FSET_SSE3);
3417 3451 }
3418 3452 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3419 3453 add_x86_feature(featureset, X86FSET_SSSE3);
3420 3454 }
3421 3455 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3422 3456 add_x86_feature(featureset, X86FSET_SSE4_1);
3423 3457 }
3424 3458 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3425 3459 add_x86_feature(featureset, X86FSET_SSE4_2);
3426 3460 }
3427 3461 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3428 3462 add_x86_feature(featureset, X86FSET_AES);
3429 3463 }
3430 3464 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3431 3465 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3432 3466 }
3433 3467
3434 3468 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3435 3469 add_x86_feature(featureset, X86FSET_SHA);
3436 3470
3437 3471 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3438 3472 add_x86_feature(featureset, X86FSET_UMIP);
3439 3473 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3440 3474 add_x86_feature(featureset, X86FSET_PKU);
3441 3475 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3442 3476 add_x86_feature(featureset, X86FSET_OSPKE);
3443 3477
3444 3478 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3445 3479 add_x86_feature(featureset, X86FSET_XSAVE);
3446 3480
3447 3481 /* We only test AVX & AVX512 when there is XSAVE */
3448 3482
3449 3483 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3450 3484 add_x86_feature(featureset,
3451 3485 X86FSET_AVX);
3452 3486
3453 3487 /*
3454 3488 * Intel says we can't check these without also
3455 3489 * checking AVX.
3456 3490 */
3457 3491 if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3458 3492 add_x86_feature(featureset,
3459 3493 X86FSET_F16C);
3460 3494
3461 3495 if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3462 3496 add_x86_feature(featureset,
3463 3497 X86FSET_FMA);
3464 3498
3465 3499 if (cpi->cpi_std[7].cp_ebx &
3466 3500 CPUID_INTC_EBX_7_0_BMI1)
3467 3501 add_x86_feature(featureset,
3468 3502 X86FSET_BMI1);
3469 3503
3470 3504 if (cpi->cpi_std[7].cp_ebx &
3471 3505 CPUID_INTC_EBX_7_0_BMI2)
3472 3506 add_x86_feature(featureset,
3473 3507 X86FSET_BMI2);
3474 3508
3475 3509 if (cpi->cpi_std[7].cp_ebx &
3476 3510 CPUID_INTC_EBX_7_0_AVX2)
3477 3511 add_x86_feature(featureset,
3478 3512 X86FSET_AVX2);
3479 3513 }
3480 3514
3481 3515 if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3482 3516 (cpi->cpi_std[7].cp_ebx &
3483 3517 CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3484 3518 add_x86_feature(featureset, X86FSET_AVX512F);
3485 3519
3486 3520 if (cpi->cpi_std[7].cp_ebx &
3487 3521 CPUID_INTC_EBX_7_0_AVX512DQ)
3488 3522 add_x86_feature(featureset,
3489 3523 X86FSET_AVX512DQ);
3490 3524 if (cpi->cpi_std[7].cp_ebx &
3491 3525 CPUID_INTC_EBX_7_0_AVX512IFMA)
3492 3526 add_x86_feature(featureset,
3493 3527 X86FSET_AVX512FMA);
3494 3528 if (cpi->cpi_std[7].cp_ebx &
3495 3529 CPUID_INTC_EBX_7_0_AVX512PF)
3496 3530 add_x86_feature(featureset,
3497 3531 X86FSET_AVX512PF);
3498 3532 if (cpi->cpi_std[7].cp_ebx &
3499 3533 CPUID_INTC_EBX_7_0_AVX512ER)
3500 3534 add_x86_feature(featureset,
3501 3535 X86FSET_AVX512ER);
3502 3536 if (cpi->cpi_std[7].cp_ebx &
3503 3537 CPUID_INTC_EBX_7_0_AVX512CD)
3504 3538 add_x86_feature(featureset,
3505 3539 X86FSET_AVX512CD);
3506 3540 if (cpi->cpi_std[7].cp_ebx &
3507 3541 CPUID_INTC_EBX_7_0_AVX512BW)
3508 3542 add_x86_feature(featureset,
3509 3543 X86FSET_AVX512BW);
3510 3544 if (cpi->cpi_std[7].cp_ebx &
3511 3545 CPUID_INTC_EBX_7_0_AVX512VL)
3512 3546 add_x86_feature(featureset,
3513 3547 X86FSET_AVX512VL);
3514 3548
3515 3549 if (cpi->cpi_std[7].cp_ecx &
3516 3550 CPUID_INTC_ECX_7_0_AVX512VBMI)
3517 3551 add_x86_feature(featureset,
3518 3552 X86FSET_AVX512VBMI);
3519 3553 if (cpi->cpi_std[7].cp_ecx &
3520 3554 CPUID_INTC_ECX_7_0_AVX512VNNI)
3521 3555 add_x86_feature(featureset,
3522 3556 X86FSET_AVX512VNNI);
3523 3557 if (cpi->cpi_std[7].cp_ecx &
3524 3558 CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3525 3559 add_x86_feature(featureset,
3526 3560 X86FSET_AVX512VPOPCDQ);
3527 3561
3528 3562 if (cpi->cpi_std[7].cp_edx &
3529 3563 CPUID_INTC_EDX_7_0_AVX5124NNIW)
3530 3564 add_x86_feature(featureset,
3531 3565 X86FSET_AVX512NNIW);
3532 3566 if (cpi->cpi_std[7].cp_edx &
3533 3567 CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3534 3568 add_x86_feature(featureset,
3535 3569 X86FSET_AVX512FMAPS);
3536 3570 }
3537 3571 }
3538 3572 }
3539 3573
3540 3574 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3541 3575 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3542 3576 add_x86_feature(featureset, X86FSET_PCID);
3543 3577 }
3544 3578 }
3545 3579
3546 3580 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3547 3581 add_x86_feature(featureset, X86FSET_X2APIC);
3548 3582 }
3549 3583 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3550 3584 add_x86_feature(featureset, X86FSET_DE);
3551 3585 }
3552 3586 #if !defined(__xpv)
3553 3587 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3554 3588
3555 3589 /*
3556 3590 * We require the CLFLUSH instruction for the erratum workaround
3557 3591 * needed to use MONITOR/MWAIT.
3558 3592 */
3559 3593 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3560 3594 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3561 3595 add_x86_feature(featureset, X86FSET_MWAIT);
3562 3596 } else {
3563 3597 extern int idle_cpu_assert_cflush_monitor;
3564 3598
3565 3599 /*
3566 3600 * All processors we are aware of which have
3567 3601 * MONITOR/MWAIT also have CLFLUSH.
3568 3602 */
3569 3603 if (idle_cpu_assert_cflush_monitor) {
3570 3604 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3571 3605 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3572 3606 }
3573 3607 }
3574 3608 }
3575 3609 #endif /* __xpv */
3576 3610
3577 3611 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3578 3612 add_x86_feature(featureset, X86FSET_VMX);
3579 3613 }
3580 3614
3581 3615 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3582 3616 add_x86_feature(featureset, X86FSET_RDRAND);
3583 3617
3584 3618 /*
3585 3619 * Only needed the first time; the rest of the cpus would follow suit.
3586 3620 * We only capture this for the boot cpu.
3587 3621 */
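/*
 * CPUID.1:%ebx[15:8] reports the CLFLUSH line size in 8-byte units, so
 * the common value of 8 yields x86_clflush_size = 64 bytes.
 */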
3588 3622 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3589 3623 add_x86_feature(featureset, X86FSET_CLFSH);
3590 3624 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3591 3625 }
3592 3626 if (is_x86_feature(featureset, X86FSET_PAE))
3593 3627 cpi->cpi_pabits = 36;
3594 3628
3595 3629 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3596 3630 struct cpuid_regs r, *ecp;
3597 3631
3598 3632 ecp = &r;
3599 3633 ecp->cp_eax = 0xD;
3600 3634 ecp->cp_ecx = 1;
3601 3635 ecp->cp_edx = ecp->cp_ebx = 0;
3602 3636 (void) __cpuid_insn(ecp);
3603 3637
3604 3638 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3605 3639 add_x86_feature(featureset, X86FSET_XSAVEOPT);
3606 3640 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3607 3641 add_x86_feature(featureset, X86FSET_XSAVEC);
3608 3642 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3609 3643 add_x86_feature(featureset, X86FSET_XSAVES);
3610 3644 }
3611 3645
3612 3646 /*
3613 3647 * Work on the "extended" feature information, doing
3614 3648 * some basic initialization for cpuid_pass2()
3615 3649 */
3616 3650 xcpuid = 0;
3617 3651 switch (cpi->cpi_vendor) {
3618 3652 case X86_VENDOR_Intel:
3619 3653 /*
3620 3654 * On KVM we know we will have proper support for extended
3621 3655 * cpuid.
3622 3656 */
3623 3657 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3624 3658 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3625 3659 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3626 3660 xcpuid++;
3627 3661 break;
3628 3662 case X86_VENDOR_AMD:
3629 3663 if (cpi->cpi_family > 5 ||
3630 3664 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3631 3665 xcpuid++;
3632 3666 break;
3633 3667 case X86_VENDOR_Cyrix:
3634 3668 /*
3635 3669 * Only these Cyrix CPUs are -known- to support
3636 3670 * extended cpuid operations.
3637 3671 */
3638 3672 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3639 3673 x86_type == X86_TYPE_CYRIX_GXm)
3640 3674 xcpuid++;
3641 3675 break;
3642 3676 case X86_VENDOR_Centaur:
3643 3677 case X86_VENDOR_TM:
3644 3678 default:
3645 3679 xcpuid++;
3646 3680 break;
3647 3681 }
3648 3682
3649 3683 if (xcpuid) {
3650 3684 cp = &cpi->cpi_extd[0];
3651 3685 cp->cp_eax = CPUID_LEAF_EXT_0;
3652 3686 cpi->cpi_xmaxeax = __cpuid_insn(cp);
3653 3687 }
3654 3688
3655 3689 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3656 3690
3657 3691 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3658 3692 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3659 3693
3660 3694 switch (cpi->cpi_vendor) {
3661 3695 case X86_VENDOR_Intel:
3662 3696 case X86_VENDOR_AMD:
3663 3697 if (cpi->cpi_xmaxeax < 0x80000001)
3664 3698 break;
3665 3699 cp = &cpi->cpi_extd[1];
3666 3700 cp->cp_eax = 0x80000001;
3667 3701 (void) __cpuid_insn(cp);
3668 3702
3669 3703 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3670 3704 cpi->cpi_family == 5 &&
3671 3705 cpi->cpi_model == 6 &&
3672 3706 cpi->cpi_step == 6) {
3673 3707 /*
3674 3708 * K6 model 6 uses bit 10 to indicate SYSC.
3675 3709 * Later models use bit 11. Fix it here.
3676 3710 */
3677 3711 if (cp->cp_edx & 0x400) {
3678 3712 cp->cp_edx &= ~0x400;
3679 3713 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3680 3714 }
3681 3715 }
3682 3716
3683 3717 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3684 3718
3685 3719 /*
3686 3720 * Compute the additions to the kernel's feature word.
3687 3721 */
3688 3722 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3689 3723 add_x86_feature(featureset, X86FSET_NX);
3690 3724 }
3691 3725
3692 3726 /*
3693 3727 * Regardless of whether or not we boot 64-bit,
3694 3728 * we should have a way to identify whether
3695 3729 * the CPU is capable of running 64-bit.
3696 3730 */
3697 3731 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3698 3732 add_x86_feature(featureset, X86FSET_64);
3699 3733 }
3700 3734
3701 3735 /* 1 GB large page - enable only for 64 bit kernel */
3702 3736 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3703 3737 add_x86_feature(featureset, X86FSET_1GPG);
3704 3738 }
3705 3739
3706 3740 if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3707 3741 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3708 3742 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3709 3743 add_x86_feature(featureset, X86FSET_SSE4A);
3710 3744 }
3711 3745
3712 3746 /*
3713 3747 * It's really tricky to support syscall/sysret in
3714 3748 * the i386 kernel; we rely on sysenter/sysexit
3715 3749 * instead. In the amd64 kernel, things are -way-
3716 3750 * better.
3717 3751 */
3718 3752 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3719 3753 add_x86_feature(featureset, X86FSET_ASYSC);
3720 3754 }
3721 3755
3722 3756 /*
3723 3757 * While we're thinking about system calls, note
3724 3758 * that AMD processors don't support sysenter
3725 3759 * in long mode at all, so don't try to program them.
3726 3760 */
3727 3761 if (x86_vendor == X86_VENDOR_AMD) {
3728 3762 remove_x86_feature(featureset, X86FSET_SEP);
3729 3763 }
3730 3764
3731 3765 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3732 3766 add_x86_feature(featureset, X86FSET_TSCP);
3733 3767 }
3734 3768
3735 3769 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3736 3770 add_x86_feature(featureset, X86FSET_SVM);
3737 3771 }
3738 3772
3739 3773 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3740 3774 add_x86_feature(featureset, X86FSET_TOPOEXT);
3741 3775 }
3742 3776
3743 3777 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3744 3778 add_x86_feature(featureset, X86FSET_AMD_PCEC);
3745 3779 }
3746 3780
3747 3781 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3748 3782 add_x86_feature(featureset, X86FSET_XOP);
3749 3783 }
3750 3784
3751 3785 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3752 3786 add_x86_feature(featureset, X86FSET_FMA4);
3753 3787 }
3754 3788
3755 3789 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3756 3790 add_x86_feature(featureset, X86FSET_TBM);
3757 3791 }
3758 3792
3759 3793 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3760 3794 add_x86_feature(featureset, X86FSET_MONITORX);
3761 3795 }
3762 3796 break;
3763 3797 default:
3764 3798 break;
3765 3799 }
3766 3800
3767 3801 /*
3768 3802 * Get CPUID data about processor cores and hyperthreads.
3769 3803 */
3770 3804 switch (cpi->cpi_vendor) {
3771 3805 case X86_VENDOR_Intel:
3772 3806 if (cpi->cpi_maxeax >= 4) {
3773 3807 cp = &cpi->cpi_std[4];
3774 3808 cp->cp_eax = 4;
3775 3809 cp->cp_ecx = 0;
3776 3810 (void) __cpuid_insn(cp);
3777 3811 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3778 3812 }
3779 3813 /*FALLTHROUGH*/
3780 3814 case X86_VENDOR_AMD:
3781 3815 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3782 3816 break;
3783 3817 cp = &cpi->cpi_extd[8];
3784 3818 cp->cp_eax = CPUID_LEAF_EXT_8;
3785 3819 (void) __cpuid_insn(cp);
3786 3820 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3787 3821 cp);
3788 3822
3789 3823 /*
3790 3824 * AMD uses ebx for some extended functions.
3791 3825 */
3792 3826 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3793 3827 /*
3794 3828 * While we're here, check for the AMD "Error
3795 3829 * Pointer Zero/Restore" feature. This can be
3796 3830 * used to set up the FP save handlers
3797 3831 * appropriately.
3798 3832 */
3799 3833 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3800 3834 cpi->cpi_fp_amd_save = 0;
3801 3835 } else {
3802 3836 cpi->cpi_fp_amd_save = 1;
3803 3837 }
3804 3838
3805 3839 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3806 3840 add_x86_feature(featureset,
3807 3841 X86FSET_CLZERO);
3808 3842 }
3809 3843 }
3810 3844
3811 3845 /*
3812 3846 * Virtual and physical address limits from
3813 3847 * cpuid override previously guessed values.
3814 3848 */
3815 3849 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3816 3850 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3817 3851 break;
3818 3852 default:
3819 3853 break;
3820 3854 }
3821 3855
3822 3856 /*
3823 3857 * Get CPUID data about TSC Invariance in Deep C-State.
3824 3858 */
3825 3859 switch (cpi->cpi_vendor) {
3826 3860 case X86_VENDOR_Intel:
3827 3861 case X86_VENDOR_AMD:
3828 3862 if (cpi->cpi_maxeax >= 7) {
3829 3863 cp = &cpi->cpi_extd[7];
3830 3864 cp->cp_eax = 0x80000007;
3831 3865 cp->cp_ecx = 0;
3832 3866 (void) __cpuid_insn(cp);
3833 3867 }
3834 3868 break;
3835 3869 default:
3836 3870 break;
3837 3871 }
3838 3872 }
3839 3873
3840 3874 cpuid_pass1_topology(cpu, featureset);
3841 3875 cpuid_pass1_thermal(cpu, featureset);
3842 3876
3843 3877 /*
3844 3878 * Synthesize chip "revision" and socket type
3845 3879 */
3846 3880 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3847 3881 cpi->cpi_model, cpi->cpi_step);
3848 3882 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3849 3883 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3850 3884 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3851 3885 cpi->cpi_model, cpi->cpi_step);
3852 3886
3853 3887 if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3854 3888 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3855 3889 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3856 3890 /* Special handling for AMD FP not necessary. */
3857 3891 cpi->cpi_fp_amd_save = 0;
3858 3892 } else {
3859 3893 cpi->cpi_fp_amd_save = 1;
3860 3894 }
3861 3895 }
3862 3896
3863 3897 /*
3864 3898 * Check the processor leaves that are used for security features.
3865 3899 */
3866 3900 cpuid_scan_security(cpu, featureset);
3867 3901
3868 3902 pass1_done:
3869 3903 cpi->cpi_pass = 1;
3870 3904 }
3871 3905
3872 3906 /*
3873 3907 * Make copies of the cpuid table entries we depend on, in
3874 3908 * part for ease of parsing now, in part so that we have only
3875 3909 * one place to correct any of it, in part for ease of
3876 3910 * later export to userland, and in part so we can look at
3877 3911 * this stuff in a crash dump.
3878 3912 */
3879 3913
3880 3914 /*ARGSUSED*/
3881 3915 void
3882 3916 cpuid_pass2(cpu_t *cpu)
3883 3917 {
3884 3918 uint_t n, nmax;
3885 3919 int i;
3886 3920 struct cpuid_regs *cp;
3887 3921 uint8_t *dp;
3888 3922 uint32_t *iptr;
3889 3923 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3890 3924
3891 3925 ASSERT(cpi->cpi_pass == 1);
3892 3926
3893 3927 if (cpi->cpi_maxeax < 1)
3894 3928 goto pass2_done;
3895 3929
3896 3930 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3897 3931 nmax = NMAX_CPI_STD;
3898 3932 /*
3899 3933 * (We already handled n == 0 and n == 1 in pass 1)
3900 3934 */
3901 3935 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3902 3936 /*
3903 3937 * leaves 6 and 7 were handled in pass 1
3904 3938 */
3905 3939 if (n == 6 || n == 7)
3906 3940 continue;
3907 3941
3908 3942 cp->cp_eax = n;
3909 3943
3910 3944 /*
3911 3945 * CPUID function 4 expects %ecx to be initialized
3912 3946 * with an index which indicates which cache to return
3913 3947 * information about. The OS is expected to call function 4
3914 3948 * with %ecx set to 0, 1, 2, ... until it returns with
3915 3949 * EAX[4:0] set to 0, which indicates there are no more
3916 3950 * caches.
3917 3951 *
3918 3952 * Here, populate cpi_std[4] with the information returned by
3919 3953 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3920 3954 * when dynamic memory allocation becomes available.
3921 3955 *
3922 3956 * Note: we need to explicitly initialize %ecx here, since
3923 3957 * function 4 may have been previously invoked.
3924 3958 */
3925 3959 if (n == 4)
3926 3960 cp->cp_ecx = 0;
3927 3961
3928 3962 (void) __cpuid_insn(cp);
3929 3963 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3930 3964 switch (n) {
3931 3965 case 2:
3932 3966 /*
3933 3967 * "the lower 8 bits of the %eax register
3934 3968 * contain a value that identifies the number
3935 3969 * of times the cpuid [instruction] has to be
3936 3970 * executed to obtain a complete image of the
3937 3971 * processor's caching systems."
3938 3972 *
3939 3973 * How *do* they make this stuff up?
3940 3974 */
3941 3975 cpi->cpi_ncache = sizeof (*cp) *
3942 3976 BITX(cp->cp_eax, 7, 0);
3943 3977 if (cpi->cpi_ncache == 0)
3944 3978 break;
3945 3979 cpi->cpi_ncache--; /* skip count byte */
3946 3980
3947 3981 /*
3948 3982 * Well, for now, rather than attempt to implement
3949 3983 * this slightly dubious algorithm, we just look
3950 3984 * at the first 15 ..
3951 3985 */
3952 3986 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3953 3987 cpi->cpi_ncache = sizeof (*cp) - 1;
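/*
 * For example, %al == 1 (the usual case) gives sizeof (*cp) * 1 = 16
 * descriptor bytes; dropping the count byte and capping at 15 leaves
 * exactly the room available in the four 32-bit registers below.
 */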
3954 3988
3955 3989 dp = cpi->cpi_cacheinfo;
3956 3990 if (BITX(cp->cp_eax, 31, 31) == 0) {
3957 3991 uint8_t *p = (void *)&cp->cp_eax;
3958 3992 for (i = 1; i < 4; i++)
3959 3993 if (p[i] != 0)
3960 3994 *dp++ = p[i];
3961 3995 }
3962 3996 if (BITX(cp->cp_ebx, 31, 31) == 0) {
3963 3997 uint8_t *p = (void *)&cp->cp_ebx;
3964 3998 for (i = 0; i < 4; i++)
3965 3999 if (p[i] != 0)
3966 4000 *dp++ = p[i];
3967 4001 }
3968 4002 if (BITX(cp->cp_ecx, 31, 31) == 0) {
3969 4003 uint8_t *p = (void *)&cp->cp_ecx;
3970 4004 for (i = 0; i < 4; i++)
3971 4005 if (p[i] != 0)
3972 4006 *dp++ = p[i];
3973 4007 }
3974 4008 if (BITX(cp->cp_edx, 31, 31) == 0) {
3975 4009 uint8_t *p = (void *)&cp->cp_edx;
3976 4010 for (i = 0; i < 4; i++)
3977 4011 if (p[i] != 0)
3978 4012 *dp++ = p[i];
3979 4013 }
3980 4014 break;
3981 4015
3982 4016 case 3: /* Processor serial number, if PSN supported */
3983 4017 break;
3984 4018
3985 4019 case 4: /* Deterministic cache parameters */
3986 4020 break;
3987 4021
3988 4022 case 5: /* Monitor/Mwait parameters */
3989 4023 {
3990 4024 size_t mwait_size;
3991 4025
3992 4026 /*
3993 4027 * check cpi_mwait.support which was set in cpuid_pass1
3994 4028 */
3995 4029 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
3996 4030 break;
3997 4031
3998 4032 /*
3999 4033 * Protect ourselves from an insane mwait line size.
4000 4034 * Workaround for incomplete hardware emulator(s).
4001 4035 */
4002 4036 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4003 4037 if (mwait_size < sizeof (uint32_t) ||
4004 4038 !ISP2(mwait_size)) {
4005 4039 #if DEBUG
4006 4040 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4007 4041 "size %ld", cpu->cpu_id, (long)mwait_size);
4008 4042 #endif
4009 4043 break;
4010 4044 }
4011 4045
4012 4046 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4013 4047 cpi->cpi_mwait.mon_max = mwait_size;
4014 4048 if (MWAIT_EXTENSION(cpi)) {
4015 4049 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4016 4050 if (MWAIT_INT_ENABLE(cpi))
4017 4051 cpi->cpi_mwait.support |=
4018 4052 MWAIT_ECX_INT_ENABLE;
4019 4053 }
4020 4054 break;
4021 4055 }
4022 4056 default:
4023 4057 break;
4024 4058 }
4025 4059 }
4026 4060
4027 4061 /*
4028 4062 * XSAVE enumeration
4029 4063 */
4030 4064 if (cpi->cpi_maxeax >= 0xD) {
4031 4065 struct cpuid_regs regs;
4032 4066 boolean_t cpuid_d_valid = B_TRUE;
4033 4067
4034 4068 cp = &regs;
4035 4069 cp->cp_eax = 0xD;
4036 4070 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4037 4071
4038 4072 (void) __cpuid_insn(cp);
4039 4073
4040 4074 /*
4041 4075 * Sanity checks for debug
4042 4076 */
4043 4077 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4044 4078 (cp->cp_eax & XFEATURE_SSE) == 0) {
4045 4079 cpuid_d_valid = B_FALSE;
4046 4080 }
4047 4081
4048 4082 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4049 4083 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4050 4084 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4051 4085
4052 4086 /*
4053 4087 * If the hw supports AVX, get the size and offset in the save
4054 4088 * area for the ymm state.
4055 4089 */
4056 4090 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4057 4091 cp->cp_eax = 0xD;
4058 4092 cp->cp_ecx = 2;
4059 4093 cp->cp_edx = cp->cp_ebx = 0;
4060 4094
4061 4095 (void) __cpuid_insn(cp);
4062 4096
4063 4097 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4064 4098 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4065 4099 cpuid_d_valid = B_FALSE;
4066 4100 }
4067 4101
4068 4102 cpi->cpi_xsave.ymm_size = cp->cp_eax;
4069 4103 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4070 4104 }
4071 4105
4072 4106 /*
4073 4107 * If the hw supports MPX, get the size and offset in the
4074 4108 * save area for BNDREGS and BNDCSR.
4075 4109 */
4076 4110 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4077 4111 cp->cp_eax = 0xD;
4078 4112 cp->cp_ecx = 3;
4079 4113 cp->cp_edx = cp->cp_ebx = 0;
4080 4114
4081 4115 (void) __cpuid_insn(cp);
4082 4116
4083 4117 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4084 4118 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4085 4119
4086 4120 cp->cp_eax = 0xD;
4087 4121 cp->cp_ecx = 4;
4088 4122 cp->cp_edx = cp->cp_ebx = 0;
4089 4123
4090 4124 (void) __cpuid_insn(cp);
4091 4125
4092 4126 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4093 4127 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4094 4128 }
4095 4129
4096 4130 /*
4097 4131 * If the hw supports AVX512, get the size and offset in the
4098 4132 * save area for the opmask registers and zmm state.
4099 4133 */
4100 4134 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4101 4135 cp->cp_eax = 0xD;
4102 4136 cp->cp_ecx = 5;
4103 4137 cp->cp_edx = cp->cp_ebx = 0;
4104 4138
4105 4139 (void) __cpuid_insn(cp);
4106 4140
4107 4141 cpi->cpi_xsave.opmask_size = cp->cp_eax;
4108 4142 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4109 4143
4110 4144 cp->cp_eax = 0xD;
4111 4145 cp->cp_ecx = 6;
4112 4146 cp->cp_edx = cp->cp_ebx = 0;
4113 4147
4114 4148 (void) __cpuid_insn(cp);
4115 4149
4116 4150 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4117 4151 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4118 4152
4119 4153 cp->cp_eax = 0xD;
4120 4154 cp->cp_ecx = 7;
4121 4155 cp->cp_edx = cp->cp_ebx = 0;
4122 4156
4123 4157 (void) __cpuid_insn(cp);
4124 4158
4125 4159 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4126 4160 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4127 4161 }
4128 4162
4129 4163 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4130 4164 xsave_state_size = 0;
4131 4165 } else if (cpuid_d_valid) {
4132 4166 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4133 4167 } else {
4134 4168 /* Broken CPUID 0xD, probably in HVM */
4135 4169 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4136 4170 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4137 4171 ", ymm_size = %d, ymm_offset = %d\n",
4138 4172 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4139 4173 cpi->cpi_xsave.xsav_hw_features_high,
4140 4174 (int)cpi->cpi_xsave.xsav_max_size,
4141 4175 (int)cpi->cpi_xsave.ymm_size,
4142 4176 (int)cpi->cpi_xsave.ymm_offset);
4143 4177
4144 4178 if (xsave_state_size != 0) {
4145 4179 /*
4146 4180 * This must be a non-boot CPU. We cannot
4147 4181 * continue, because the boot cpu has already
4148 4182 * enabled XSAVE.
4149 4183 */
4150 4184 ASSERT(cpu->cpu_id != 0);
4151 4185 cmn_err(CE_PANIC, "cpu%d: we have already "
4152 4186 "enabled XSAVE on boot cpu, cannot "
4153 4187 "continue.", cpu->cpu_id);
4154 4188 } else {
4155 4189 /*
4156 4190 * If we reached here on the boot CPU, it's also
4157 4191 * almost certain that we'll reach here on the
4158 4192 * non-boot CPUs. When we're here on a boot CPU
4159 4193 * we should disable the feature, on a non-boot
4160 4194 * CPU we need to confirm that we have.
4161 4195 */
4162 4196 if (cpu->cpu_id == 0) {
4163 4197 remove_x86_feature(x86_featureset,
4164 4198 X86FSET_XSAVE);
4165 4199 remove_x86_feature(x86_featureset,
4166 4200 X86FSET_AVX);
4167 4201 remove_x86_feature(x86_featureset,
4168 4202 X86FSET_F16C);
4169 4203 remove_x86_feature(x86_featureset,
4170 4204 X86FSET_BMI1);
4171 4205 remove_x86_feature(x86_featureset,
4172 4206 X86FSET_BMI2);
4173 4207 remove_x86_feature(x86_featureset,
4174 4208 X86FSET_FMA);
4175 4209 remove_x86_feature(x86_featureset,
4176 4210 X86FSET_AVX2);
4177 4211 remove_x86_feature(x86_featureset,
4178 4212 X86FSET_MPX);
4179 4213 remove_x86_feature(x86_featureset,
4180 4214 X86FSET_AVX512F);
4181 4215 remove_x86_feature(x86_featureset,
4182 4216 X86FSET_AVX512DQ);
4183 4217 remove_x86_feature(x86_featureset,
4184 4218 X86FSET_AVX512PF);
4185 4219 remove_x86_feature(x86_featureset,
4186 4220 X86FSET_AVX512ER);
4187 4221 remove_x86_feature(x86_featureset,
4188 4222 X86FSET_AVX512CD);
4189 4223 remove_x86_feature(x86_featureset,
4190 4224 X86FSET_AVX512BW);
4191 4225 remove_x86_feature(x86_featureset,
4192 4226 X86FSET_AVX512VL);
4193 4227 remove_x86_feature(x86_featureset,
4194 4228 X86FSET_AVX512FMA);
4195 4229 remove_x86_feature(x86_featureset,
4196 4230 X86FSET_AVX512VBMI);
4197 4231 remove_x86_feature(x86_featureset,
4198 4232 X86FSET_AVX512VNNI);
4199 4233 remove_x86_feature(x86_featureset,
4200 4234 X86FSET_AVX512VPOPCDQ);
4201 4235 remove_x86_feature(x86_featureset,
4202 4236 X86FSET_AVX512NNIW);
4203 4237 remove_x86_feature(x86_featureset,
4204 4238 X86FSET_AVX512FMAPS);
4205 4239
4206 4240 CPI_FEATURES_ECX(cpi) &=
4207 4241 ~CPUID_INTC_ECX_XSAVE;
4208 4242 CPI_FEATURES_ECX(cpi) &=
4209 4243 ~CPUID_INTC_ECX_AVX;
4210 4244 CPI_FEATURES_ECX(cpi) &=
4211 4245 ~CPUID_INTC_ECX_F16C;
4212 4246 CPI_FEATURES_ECX(cpi) &=
4213 4247 ~CPUID_INTC_ECX_FMA;
4214 4248 CPI_FEATURES_7_0_EBX(cpi) &=
4215 4249 ~CPUID_INTC_EBX_7_0_BMI1;
4216 4250 CPI_FEATURES_7_0_EBX(cpi) &=
4217 4251 ~CPUID_INTC_EBX_7_0_BMI2;
4218 4252 CPI_FEATURES_7_0_EBX(cpi) &=
4219 4253 ~CPUID_INTC_EBX_7_0_AVX2;
4220 4254 CPI_FEATURES_7_0_EBX(cpi) &=
4221 4255 ~CPUID_INTC_EBX_7_0_MPX;
4222 4256 CPI_FEATURES_7_0_EBX(cpi) &=
4223 4257 ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4224 4258
4225 4259 CPI_FEATURES_7_0_ECX(cpi) &=
4226 4260 ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4227 4261
4228 4262 CPI_FEATURES_7_0_EDX(cpi) &=
4229 4263 ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4230 4264
4231 4265 xsave_force_disable = B_TRUE;
4232 4266 } else {
4233 4267 VERIFY(is_x86_feature(x86_featureset,
4234 4268 X86FSET_XSAVE) == B_FALSE);
4235 4269 }
4236 4270 }
4237 4271 }
4238 4272 }
4239 4273
4240 4274
4241 4275 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4242 4276 goto pass2_done;
4243 4277
4244 4278 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4245 4279 nmax = NMAX_CPI_EXTD;
4246 4280 /*
4247 4281 * Copy the extended properties, fixing them as we go.
4248 4282 * (We already handled n == 0 and n == 1 in pass 1)
4249 4283 */
4250 4284 iptr = (void *)cpi->cpi_brandstr;
4251 4285 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4252 4286 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4253 4287 (void) __cpuid_insn(cp);
4254 4288 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4255 4289 cp);
4256 4290 switch (n) {
4257 4291 case 2:
4258 4292 case 3:
4259 4293 case 4:
4260 4294 /*
4261 4295 * Extract the brand string
4262 4296 */
4263 4297 *iptr++ = cp->cp_eax;
4264 4298 *iptr++ = cp->cp_ebx;
4265 4299 *iptr++ = cp->cp_ecx;
4266 4300 *iptr++ = cp->cp_edx;
4267 4301 break;
4268 4302 case 5:
4269 4303 switch (cpi->cpi_vendor) {
4270 4304 case X86_VENDOR_AMD:
4271 4305 /*
4272 4306 * The Athlon and Duron were the first
4273 4307 * parts to report the sizes of the
4274 4308 * TLB for large pages. Before then,
4275 4309 * we don't trust the data.
4276 4310 */
4277 4311 if (cpi->cpi_family < 6 ||
4278 4312 (cpi->cpi_family == 6 &&
4279 4313 cpi->cpi_model < 1))
4280 4314 cp->cp_eax = 0;
4281 4315 break;
4282 4316 default:
4283 4317 break;
4284 4318 }
4285 4319 break;
4286 4320 case 6:
4287 4321 switch (cpi->cpi_vendor) {
4288 4322 case X86_VENDOR_AMD:
4289 4323 /*
4290 4324 * The Athlon and Duron were the first
4291 4325 * AMD parts with L2 TLB's.
4292 4326 * Before then, don't trust the data.
4293 4327 */
4294 4328 if (cpi->cpi_family < 6 ||
4295 4329 cpi->cpi_family == 6 &&
4296 4330 cpi->cpi_model < 1)
4297 4331 cp->cp_eax = cp->cp_ebx = 0;
4298 4332 /*
4299 4333 * AMD Duron rev A0 reports L2
4300 4334 * cache size incorrectly as 1K
4301 4335 * when it is really 64K
4302 4336 */
4303 4337 if (cpi->cpi_family == 6 &&
4304 4338 cpi->cpi_model == 3 &&
4305 4339 cpi->cpi_step == 0) {
4306 4340 cp->cp_ecx &= 0xffff;
4307 4341 cp->cp_ecx |= 0x400000;
4308 4342 }
4309 4343 break;
4310 4344 case X86_VENDOR_Cyrix: /* VIA C3 */
4311 4345 /*
4312 4346 * VIA C3 processors are a bit messed
4313 4347 * up w.r.t. encoding cache sizes in %ecx
4314 4348 */
4315 4349 if (cpi->cpi_family != 6)
4316 4350 break;
4317 4351 /*
4318 4352 * model 7 and 8 were incorrectly encoded
4319 4353 *
4320 4354 * xxx is model 8 really broken?
4321 4355 */
4322 4356 if (cpi->cpi_model == 7 ||
4323 4357 cpi->cpi_model == 8)
4324 4358 cp->cp_ecx =
4325 4359 BITX(cp->cp_ecx, 31, 24) << 16 |
4326 4360 BITX(cp->cp_ecx, 23, 16) << 12 |
4327 4361 BITX(cp->cp_ecx, 15, 8) << 8 |
4328 4362 BITX(cp->cp_ecx, 7, 0);
4329 4363 /*
4330 4364 * model 9 stepping 1 has wrong associativity
4331 4365 */
4332 4366 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4333 4367 cp->cp_ecx |= 8 << 12;
4334 4368 break;
4335 4369 case X86_VENDOR_Intel:
4336 4370 /*
4337 4371 * Extended L2 Cache features function.
4338 4372 * First appeared on Prescott.
4339 4373 */
4340 4374 default:
4341 4375 break;
4342 4376 }
4343 4377 break;
4344 4378 default:
4345 4379 break;
4346 4380 }
4347 4381 }
4348 4382
4349 4383 pass2_done:
4350 4384 cpi->cpi_pass = 2;
4351 4385 }
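
Editor's note: the leaf 2 handling in cpuid_pass2() above harvests the non-zero cache-descriptor bytes from %eax/%ebx/%ecx/%edx whenever bit 31 of the corresponding register is clear, skipping the iteration-count byte in %al and capping the result at 15 entries. The userland sketch below mirrors that walk for illustration only; it assumes GCC/Clang's <cpuid.h> helper and is not part of this change.

/*
 * Illustrative sketch: harvest CPUID leaf 2 cache descriptors the same
 * way cpuid_pass2() does, but from userland via <cpuid.h>.
 */
#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
	unsigned int r[4];		/* eax, ebx, ecx, edx */
	unsigned char desc[15];
	int i, j, n = 0;

	if (!__get_cpuid(2, &r[0], &r[1], &r[2], &r[3]))
		return (1);

	for (i = 0; i < 4; i++) {
		const unsigned char *p = (const unsigned char *)&r[i];

		/* Bit 31 set means the register carries no descriptors. */
		if (r[i] & 0x80000000U)
			continue;
		/* Byte 0 of %eax is the iteration count, not a descriptor. */
		for (j = (i == 0) ? 1 : 0; j < 4; j++) {
			if (p[j] != 0 && n < 15)
				desc[n++] = p[j];
		}
	}

	for (i = 0; i < n; i++)
		printf("descriptor 0x%02x\n", desc[i]);
	return (0);
}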
4352 4386
4353 4387 static const char *
4354 4388 intel_cpubrand(const struct cpuid_info *cpi)
4355 4389 {
4356 4390 int i;
4357 4391
4358 4392 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4359 4393 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4360 4394 return ("i486");
4361 4395
4362 4396 switch (cpi->cpi_family) {
4363 4397 case 5:
4364 4398 return ("Intel Pentium(r)");
4365 4399 case 6:
4366 4400 switch (cpi->cpi_model) {
4367 4401 uint_t celeron, xeon;
4368 4402 const struct cpuid_regs *cp;
4369 4403 case 0:
4370 4404 case 1:
4371 4405 case 2:
4372 4406 return ("Intel Pentium(r) Pro");
4373 4407 case 3:
4374 4408 case 4:
4375 4409 return ("Intel Pentium(r) II");
4376 4410 case 6:
4377 4411 return ("Intel Celeron(r)");
4378 4412 case 5:
4379 4413 case 7:
4380 4414 celeron = xeon = 0;
4381 4415 cp = &cpi->cpi_std[2]; /* cache info */
4382 4416
4383 4417 for (i = 1; i < 4; i++) {
4384 4418 uint_t tmp;
4385 4419
4386 4420 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4387 4421 if (tmp == 0x40)
4388 4422 celeron++;
4389 4423 if (tmp >= 0x44 && tmp <= 0x45)
4390 4424 xeon++;
4391 4425 }
4392 4426
4393 4427 for (i = 0; i < 2; i++) {
4394 4428 uint_t tmp;
4395 4429
4396 4430 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4397 4431 if (tmp == 0x40)
4398 4432 celeron++;
4399 4433 else if (tmp >= 0x44 && tmp <= 0x45)
4400 4434 xeon++;
4401 4435 }
4402 4436
4403 4437 for (i = 0; i < 4; i++) {
4404 4438 uint_t tmp;
4405 4439
4406 4440 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4407 4441 if (tmp == 0x40)
4408 4442 celeron++;
4409 4443 else if (tmp >= 0x44 && tmp <= 0x45)
4410 4444 xeon++;
4411 4445 }
4412 4446
4413 4447 for (i = 0; i < 4; i++) {
4414 4448 uint_t tmp;
4415 4449
4416 4450 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4417 4451 if (tmp == 0x40)
4418 4452 celeron++;
4419 4453 else if (tmp >= 0x44 && tmp <= 0x45)
4420 4454 xeon++;
4421 4455 }
4422 4456
4423 4457 if (celeron)
4424 4458 return ("Intel Celeron(r)");
4425 4459 if (xeon)
4426 4460 return (cpi->cpi_model == 5 ?
4427 4461 "Intel Pentium(r) II Xeon(tm)" :
4428 4462 "Intel Pentium(r) III Xeon(tm)");
4429 4463 return (cpi->cpi_model == 5 ?
4430 4464 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4431 4465 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4432 4466 default:
4433 4467 break;
4434 4468 }
4435 4469 default:
4436 4470 break;
4437 4471 }
4438 4472
4439 4473 /* BrandID is present if the field is nonzero */
4440 4474 if (cpi->cpi_brandid != 0) {
4441 4475 static const struct {
4442 4476 uint_t bt_bid;
4443 4477 const char *bt_str;
4444 4478 } brand_tbl[] = {
4445 4479 { 0x1, "Intel(r) Celeron(r)" },
4446 4480 { 0x2, "Intel(r) Pentium(r) III" },
4447 4481 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
4448 4482 { 0x4, "Intel(r) Pentium(r) III" },
4449 4483 { 0x6, "Mobile Intel(r) Pentium(r) III" },
4450 4484 { 0x7, "Mobile Intel(r) Celeron(r)" },
4451 4485 { 0x8, "Intel(r) Pentium(r) 4" },
4452 4486 { 0x9, "Intel(r) Pentium(r) 4" },
4453 4487 { 0xa, "Intel(r) Celeron(r)" },
4454 4488 { 0xb, "Intel(r) Xeon(tm)" },
4455 4489 { 0xc, "Intel(r) Xeon(tm) MP" },
4456 4490 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
4457 4491 { 0xf, "Mobile Intel(r) Celeron(r)" },
4458 4492 { 0x11, "Mobile Genuine Intel(r)" },
4459 4493 { 0x12, "Intel(r) Celeron(r) M" },
4460 4494 { 0x13, "Mobile Intel(r) Celeron(r)" },
4461 4495 { 0x14, "Intel(r) Celeron(r)" },
4462 4496 { 0x15, "Mobile Genuine Intel(r)" },
4463 4497 { 0x16, "Intel(r) Pentium(r) M" },
4464 4498 { 0x17, "Mobile Intel(r) Celeron(r)" }
4465 4499 };
4466 4500 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4467 4501 uint_t sgn;
4468 4502
4469 4503 sgn = (cpi->cpi_family << 8) |
4470 4504 (cpi->cpi_model << 4) | cpi->cpi_step;
4471 4505
4472 4506 for (i = 0; i < btblmax; i++)
4473 4507 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4474 4508 break;
4475 4509 if (i < btblmax) {
4476 4510 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4477 4511 return ("Intel(r) Celeron(r)");
4478 4512 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4479 4513 return ("Intel(r) Xeon(tm) MP");
4480 4514 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4481 4515 return ("Intel(r) Xeon(tm)");
4482 4516 return (brand_tbl[i].bt_str);
4483 4517 }
4484 4518 }
4485 4519
4486 4520 return (NULL);
4487 4521 }
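
Editor's note: the brand-ID special cases in intel_cpubrand() above key off a packed signature of (family << 8) | (model << 4) | step. A small worked example of that packing is below; the values are hypothetical and chosen only to show that family 6, model 0xb, stepping 1 packs to 0x6b1, the signature tested against brand ID 3.

/* Worked example of the packed brand signature used above. */
#include <stdio.h>

static unsigned int
brand_sig(unsigned int family, unsigned int model, unsigned int step)
{
	return ((family << 8) | (model << 4) | step);
}

int
main(void)
{
	/* Family 6, model 0xb, stepping 1 -> 0x6b1 (the Celeron special case). */
	printf("0x%x\n", brand_sig(6, 0xb, 1));
	return (0);
}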
4488 4522
4489 4523 static const char *
4490 4524 amd_cpubrand(const struct cpuid_info *cpi)
4491 4525 {
4492 4526 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4493 4527 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4494 4528 return ("i486 compatible");
4495 4529
4496 4530 switch (cpi->cpi_family) {
4497 4531 case 5:
4498 4532 switch (cpi->cpi_model) {
4499 4533 case 0:
4500 4534 case 1:
4501 4535 case 2:
4502 4536 case 3:
4503 4537 case 4:
4504 4538 case 5:
4505 4539 return ("AMD-K5(r)");
4506 4540 case 6:
4507 4541 case 7:
4508 4542 return ("AMD-K6(r)");
4509 4543 case 8:
4510 4544 return ("AMD-K6(r)-2");
4511 4545 case 9:
4512 4546 return ("AMD-K6(r)-III");
4513 4547 default:
4514 4548 return ("AMD (family 5)");
4515 4549 }
4516 4550 case 6:
4517 4551 switch (cpi->cpi_model) {
4518 4552 case 1:
4519 4553 return ("AMD-K7(tm)");
4520 4554 case 0:
4521 4555 case 2:
4522 4556 case 4:
4523 4557 return ("AMD Athlon(tm)");
4524 4558 case 3:
4525 4559 case 7:
4526 4560 return ("AMD Duron(tm)");
4527 4561 case 6:
4528 4562 case 8:
4529 4563 case 10:
4530 4564 /*
4531 4565 * Use the L2 cache size to distinguish
4532 4566 */
4533 4567 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4534 4568 "AMD Athlon(tm)" : "AMD Duron(tm)");
4535 4569 default:
4536 4570 return ("AMD (family 6)");
4537 4571 }
4538 4572 default:
4539 4573 break;
4540 4574 }
4541 4575
4542 4576 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4543 4577 cpi->cpi_brandid != 0) {
4544 4578 switch (BITX(cpi->cpi_brandid, 7, 5)) {
4545 4579 case 3:
4546 4580 return ("AMD Opteron(tm) UP 1xx");
4547 4581 case 4:
4548 4582 return ("AMD Opteron(tm) DP 2xx");
4549 4583 case 5:
4550 4584 return ("AMD Opteron(tm) MP 8xx");
4551 4585 default:
4552 4586 return ("AMD Opteron(tm)");
4553 4587 }
4554 4588 }
4555 4589
4556 4590 return (NULL);
4557 4591 }
4558 4592
4559 4593 static const char *
4560 4594 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4561 4595 {
4562 4596 if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4563 4597 cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4564 4598 type == X86_TYPE_CYRIX_486)
4565 4599 return ("i486 compatible");
4566 4600
4567 4601 switch (type) {
4568 4602 case X86_TYPE_CYRIX_6x86:
4569 4603 return ("Cyrix 6x86");
4570 4604 case X86_TYPE_CYRIX_6x86L:
4571 4605 return ("Cyrix 6x86L");
4572 4606 case X86_TYPE_CYRIX_6x86MX:
4573 4607 return ("Cyrix 6x86MX");
4574 4608 case X86_TYPE_CYRIX_GXm:
4575 4609 return ("Cyrix GXm");
4576 4610 case X86_TYPE_CYRIX_MediaGX:
4577 4611 return ("Cyrix MediaGX");
4578 4612 case X86_TYPE_CYRIX_MII:
4579 4613 return ("Cyrix M2");
4580 4614 case X86_TYPE_VIA_CYRIX_III:
4581 4615 return ("VIA Cyrix M3");
4582 4616 default:
4583 4617 /*
4584 4618 * Have another wild guess ..
4585 4619 */
4586 4620 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4587 4621 return ("Cyrix 5x86");
4588 4622 else if (cpi->cpi_family == 5) {
4589 4623 switch (cpi->cpi_model) {
4590 4624 case 2:
4591 4625 return ("Cyrix 6x86"); /* Cyrix M1 */
4592 4626 case 4:
4593 4627 return ("Cyrix MediaGX");
4594 4628 default:
4595 4629 break;
4596 4630 }
4597 4631 } else if (cpi->cpi_family == 6) {
4598 4632 switch (cpi->cpi_model) {
4599 4633 case 0:
4600 4634 return ("Cyrix 6x86MX"); /* Cyrix M2? */
4601 4635 case 5:
4602 4636 case 6:
4603 4637 case 7:
4604 4638 case 8:
4605 4639 case 9:
4606 4640 return ("VIA C3");
4607 4641 default:
4608 4642 break;
4609 4643 }
4610 4644 }
4611 4645 break;
4612 4646 }
4613 4647 return (NULL);
4614 4648 }
4615 4649
4616 4650 /*
4617 4651 * This only gets called in the case that the CPU extended
4618 4652 * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
4619 4653 * aren't available, or contain null bytes for some reason.
4620 4654 */
4621 4655 static void
4622 4656 fabricate_brandstr(struct cpuid_info *cpi)
4623 4657 {
4624 4658 const char *brand = NULL;
4625 4659
4626 4660 switch (cpi->cpi_vendor) {
4627 4661 case X86_VENDOR_Intel:
4628 4662 brand = intel_cpubrand(cpi);
4629 4663 break;
4630 4664 case X86_VENDOR_AMD:
4631 4665 brand = amd_cpubrand(cpi);
4632 4666 break;
4633 4667 case X86_VENDOR_Cyrix:
4634 4668 brand = cyrix_cpubrand(cpi, x86_type);
4635 4669 break;
4636 4670 case X86_VENDOR_NexGen:
4637 4671 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4638 4672 brand = "NexGen Nx586";
4639 4673 break;
4640 4674 case X86_VENDOR_Centaur:
4641 4675 if (cpi->cpi_family == 5)
4642 4676 switch (cpi->cpi_model) {
4643 4677 case 4:
4644 4678 brand = "Centaur C6";
4645 4679 break;
4646 4680 case 8:
4647 4681 brand = "Centaur C2";
4648 4682 break;
4649 4683 case 9:
4650 4684 brand = "Centaur C3";
4651 4685 break;
4652 4686 default:
4653 4687 break;
4654 4688 }
4655 4689 break;
4656 4690 case X86_VENDOR_Rise:
4657 4691 if (cpi->cpi_family == 5 &&
4658 4692 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4659 4693 brand = "Rise mP6";
4660 4694 break;
4661 4695 case X86_VENDOR_SiS:
4662 4696 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4663 4697 brand = "SiS 55x";
4664 4698 break;
4665 4699 case X86_VENDOR_TM:
4666 4700 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4667 4701 brand = "Transmeta Crusoe TM3x00 or TM5x00";
4668 4702 break;
4669 4703 case X86_VENDOR_NSC:
4670 4704 case X86_VENDOR_UMC:
4671 4705 default:
4672 4706 break;
4673 4707 }
4674 4708 if (brand) {
4675 4709 (void) strcpy((char *)cpi->cpi_brandstr, brand);
4676 4710 return;
4677 4711 }
4678 4712
4679 4713 /*
4680 4714 * If all else fails ...
4681 4715 */
4682 4716 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4683 4717 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4684 4718 cpi->cpi_model, cpi->cpi_step);
4685 4719 }
4686 4720
4687 4721 /*
4688 4722 * This routine is called just after kernel memory allocation
4689 4723 * becomes available on cpu0, and as part of mp_startup() on
4690 4724 * the other cpus.
4691 4725 *
4692 4726 * Fixup the brand string, and collect any information from cpuid
4693 4727 * that requires dynamically allocated storage to represent.
4694 4728 */
4695 4729 /*ARGSUSED*/
4696 4730 void
4697 4731 cpuid_pass3(cpu_t *cpu)
4698 4732 {
4699 4733 int i, max, shft, level, size;
4700 4734 struct cpuid_regs regs;
4701 4735 struct cpuid_regs *cp;
4702 4736 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4703 4737
4704 4738 ASSERT(cpi->cpi_pass == 2);
4705 4739
4706 4740 /*
4707 4741 * Deterministic cache parameters
4708 4742 *
4709 4743 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4710 4744 * values that are present are currently defined to be the same. This
4711 4745 * means we can use the same logic to parse it as long as we use the
4712 4746 * appropriate leaf to get the data. If you're updating this, make sure
4713 4747 * you're careful about which vendor supports which aspect.
4714 4748 *
4715 4749 * Take this opportunity to detect the number of threads sharing the
4716 4750 * last level cache, and construct a corresponding cache id. The
4717 4751 * respective cpuid_info members are initialized to the default case of
4718 4752 * "no last level cache sharing".
4719 4753 */
4720 4754 cpi->cpi_ncpu_shr_last_cache = 1;
4721 4755 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4722 4756
4723 4757 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4724 4758 (cpi->cpi_vendor == X86_VENDOR_AMD &&
4725 4759 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4726 4760 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4727 4761 uint32_t leaf;
4728 4762
4729 4763 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4730 4764 leaf = 4;
4731 4765 } else {
4732 4766 leaf = CPUID_LEAF_EXT_1d;
4733 4767 }
4734 4768
4735 4769 /*
4736 4770 * Find the # of elements (size) returned by the leaf and along
4737 4771 * the way detect last level cache sharing details.
4738 4772 */
4739 4773 bzero(&regs, sizeof (regs));
4740 4774 cp = &regs;
4741 4775 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4742 4776 cp->cp_eax = leaf;
4743 4777 cp->cp_ecx = i;
4744 4778
4745 4779 (void) __cpuid_insn(cp);
4746 4780
4747 4781 if (CPI_CACHE_TYPE(cp) == 0)
4748 4782 break;
4749 4783 level = CPI_CACHE_LVL(cp);
4750 4784 if (level > max) {
4751 4785 max = level;
4752 4786 cpi->cpi_ncpu_shr_last_cache =
4753 4787 CPI_NTHR_SHR_CACHE(cp) + 1;
4754 4788 }
4755 4789 }
4756 4790 cpi->cpi_cache_leaf_size = size = i;
4757 4791
4758 4792 /*
4759 4793 * Allocate the cpi_cache_leaves array. The first element
4760 4794 * references the regs for the corresponding leaf with %ecx set
4761 4795 * to 0. This was gathered in cpuid_pass2().
4762 4796 */
4763 4797 if (size > 0) {
4764 4798 cpi->cpi_cache_leaves =
4765 4799 kmem_alloc(size * sizeof (cp), KM_SLEEP);
4766 4800 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4767 4801 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4768 4802 } else {
4769 4803 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4770 4804 }
4771 4805
4772 4806 /*
4773 4807 * Allocate storage to hold the additional regs
4774 4808 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4775 4809 *
4776 4810 * The regs for the leaf, %ecx == 0 has already
4777 4811 * been allocated as indicated above.
4778 4812 */
4779 4813 for (i = 1; i < size; i++) {
4780 4814 cp = cpi->cpi_cache_leaves[i] =
4781 4815 kmem_zalloc(sizeof (regs), KM_SLEEP);
4782 4816 cp->cp_eax = leaf;
4783 4817 cp->cp_ecx = i;
4784 4818
4785 4819 (void) __cpuid_insn(cp);
4786 4820 }
4787 4821 }
4788 4822 /*
4789 4823 * Determine the number of bits needed to represent
4790 4824 * the number of CPUs sharing the last level cache.
4791 4825 *
4792 4826 * Shift off that number of bits from the APIC id to
4793 4827 * derive the cache id.
4794 4828 */
4795 4829 shft = 0;
4796 4830 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4797 4831 shft++;
4798 4832 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4799 4833 }
4800 4834
4801 4835 /*
4802 4836 * Now fixup the brand string
4803 4837 */
4804 4838 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4805 4839 fabricate_brandstr(cpi);
4806 4840 } else {
4807 4841
4808 4842 /*
4809 4843 * If we successfully extracted a brand string from the cpuid
4810 4844 * instruction, clean it up by removing leading spaces and
4811 4845 * similar junk.
4812 4846 */
4813 4847 if (cpi->cpi_brandstr[0]) {
4814 4848 size_t maxlen = sizeof (cpi->cpi_brandstr);
4815 4849 char *src, *dst;
4816 4850
4817 4851 dst = src = (char *)cpi->cpi_brandstr;
4818 4852 src[maxlen - 1] = '\0';
4819 4853 /*
4820 4854 * strip leading spaces
4821 4855 */
4822 4856 while (*src == ' ')
4823 4857 src++;
4824 4858 /*
4825 4859 * Remove any 'Genuine' or "Authentic" prefixes
4826 4860 */
4827 4861 if (strncmp(src, "Genuine ", 8) == 0)
4828 4862 src += 8;
4829 4863 if (strncmp(src, "Authentic ", 10) == 0)
4830 4864 src += 10;
4831 4865
4832 4866 /*
4833 4867 * Now do an in-place copy.
4834 4868 * Map (R) to (r) and (TM) to (tm).
4835 4869 * The era of teletypes is long gone, and there's
4836 4870 * -really- no need to shout.
4837 4871 */
4838 4872 while (*src != '\0') {
4839 4873 if (src[0] == '(') {
4840 4874 if (strncmp(src + 1, "R)", 2) == 0) {
4841 4875 (void) strncpy(dst, "(r)", 3);
4842 4876 src += 3;
4843 4877 dst += 3;
4844 4878 continue;
4845 4879 }
4846 4880 if (strncmp(src + 1, "TM)", 3) == 0) {
4847 4881 (void) strncpy(dst, "(tm)", 4);
4848 4882 src += 4;
4849 4883 dst += 4;
4850 4884 continue;
4851 4885 }
4852 4886 }
4853 4887 *dst++ = *src++;
4854 4888 }
4855 4889 *dst = '\0';
4856 4890
4857 4891 /*
4858 4892 * Finally, remove any trailing spaces
4859 4893 */
4860 4894 while (--dst > cpi->cpi_brandstr)
4861 4895 if (*dst == ' ')
4862 4896 *dst = '\0';
4863 4897 else
4864 4898 break;
4865 4899 } else
4866 4900 fabricate_brandstr(cpi);
4867 4901 }
4868 4902 cpi->cpi_pass = 3;
4869 4903 }
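
Editor's note: cpuid_pass3() above derives the last-level cache id by counting how many bits are needed to cover cpi_ncpu_shr_last_cache and shifting the APIC id right by that amount. A minimal sketch of that derivation follows, using hypothetical input values purely for illustration.

/* Sketch of the last-level cache id derivation in cpuid_pass3(). */
#include <stdio.h>

int
main(void)
{
	unsigned int ncpu_shr = 6;	/* hypothetical: 6 CPUs share the LLC */
	unsigned int apicid = 0x1a;	/* hypothetical APIC id */
	unsigned int i, shft = 0;

	/* Count the bits needed to represent ncpu_shr, rounding up. */
	for (i = 1; i < ncpu_shr; i <<= 1)
		shft++;

	/* 6 sharers need 3 bits, so the cache id is the APIC id >> 3. */
	printf("shft=%u cacheid=0x%x\n", shft, apicid >> shft);
	return (0);
}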
4870 4904
4871 4905 /*
4872 4906 * This routine is called out of bind_hwcap() much later in the life
4873 4907 * of the kernel (post_startup()). The job of this routine is to resolve
4874 4908 * the hardware feature support and kernel support for those features into
4875 4909 * what we're actually going to tell applications via the aux vector.
4876 4910 */
4877 4911 void
4878 4912 cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4879 4913 {
4880 4914 struct cpuid_info *cpi;
4881 4915 uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4882 4916
4883 4917 if (cpu == NULL)
4884 4918 cpu = CPU;
4885 4919 cpi = cpu->cpu_m.mcpu_cpi;
4886 4920
4887 4921 ASSERT(cpi->cpi_pass == 3);
4888 4922
4889 4923 if (cpi->cpi_maxeax >= 1) {
4890 4924 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4891 4925 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4892 4926 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4893 4927
4894 4928 *edx = CPI_FEATURES_EDX(cpi);
4895 4929 *ecx = CPI_FEATURES_ECX(cpi);
4896 4930 *ebx = CPI_FEATURES_7_0_EBX(cpi);
4897 4931
4898 4932 /*
4899 4933 * [these require explicit kernel support]
4900 4934 */
4901 4935 if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4902 4936 *edx &= ~CPUID_INTC_EDX_SEP;
4903 4937
4904 4938 if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4905 4939 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4906 4940 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4907 4941 *edx &= ~CPUID_INTC_EDX_SSE2;
4908 4942
4909 4943 if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4910 4944 *edx &= ~CPUID_INTC_EDX_HTT;
4911 4945
4912 4946 if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4913 4947 *ecx &= ~CPUID_INTC_ECX_SSE3;
4914 4948
4915 4949 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4916 4950 *ecx &= ~CPUID_INTC_ECX_SSSE3;
4917 4951 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4918 4952 *ecx &= ~CPUID_INTC_ECX_SSE4_1;
4919 4953 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4920 4954 *ecx &= ~CPUID_INTC_ECX_SSE4_2;
4921 4955 if (!is_x86_feature(x86_featureset, X86FSET_AES))
4922 4956 *ecx &= ~CPUID_INTC_ECX_AES;
4923 4957 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4924 4958 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4925 4959 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4926 4960 *ecx &= ~(CPUID_INTC_ECX_XSAVE |
4927 4961 CPUID_INTC_ECX_OSXSAVE);
4928 4962 if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4929 4963 *ecx &= ~CPUID_INTC_ECX_AVX;
4930 4964 if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4931 4965 *ecx &= ~CPUID_INTC_ECX_F16C;
4932 4966 if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4933 4967 *ecx &= ~CPUID_INTC_ECX_FMA;
4934 4968 if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4935 4969 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4936 4970 if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4937 4971 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4938 4972 if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4939 4973 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4940 4974 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4941 4975 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4942 4976 if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4943 4977 *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4944 4978
4945 4979 /*
4946 4980 * [no explicit support required beyond x87 fp context]
4947 4981 */
4948 4982 if (!fpu_exists)
4949 4983 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4950 4984
4951 4985 /*
4952 4986 * Now map the supported feature vector to things that we
4953 4987 * think userland will care about.
4954 4988 */
4955 4989 if (*edx & CPUID_INTC_EDX_SEP)
4956 4990 hwcap_flags |= AV_386_SEP;
4957 4991 if (*edx & CPUID_INTC_EDX_SSE)
4958 4992 hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4959 4993 if (*edx & CPUID_INTC_EDX_SSE2)
4960 4994 hwcap_flags |= AV_386_SSE2;
4961 4995 if (*ecx & CPUID_INTC_ECX_SSE3)
4962 4996 hwcap_flags |= AV_386_SSE3;
4963 4997 if (*ecx & CPUID_INTC_ECX_SSSE3)
4964 4998 hwcap_flags |= AV_386_SSSE3;
4965 4999 if (*ecx & CPUID_INTC_ECX_SSE4_1)
4966 5000 hwcap_flags |= AV_386_SSE4_1;
4967 5001 if (*ecx & CPUID_INTC_ECX_SSE4_2)
4968 5002 hwcap_flags |= AV_386_SSE4_2;
4969 5003 if (*ecx & CPUID_INTC_ECX_MOVBE)
4970 5004 hwcap_flags |= AV_386_MOVBE;
4971 5005 if (*ecx & CPUID_INTC_ECX_AES)
4972 5006 hwcap_flags |= AV_386_AES;
4973 5007 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
4974 5008 hwcap_flags |= AV_386_PCLMULQDQ;
4975 5009 if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
4976 5010 (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
4977 5011 hwcap_flags |= AV_386_XSAVE;
4978 5012
4979 5013 if (*ecx & CPUID_INTC_ECX_AVX) {
4980 5014 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
4981 5015 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
4982 5016
4983 5017 hwcap_flags |= AV_386_AVX;
4984 5018 if (*ecx & CPUID_INTC_ECX_F16C)
4985 5019 hwcap_flags_2 |= AV_386_2_F16C;
4986 5020 if (*ecx & CPUID_INTC_ECX_FMA)
4987 5021 hwcap_flags_2 |= AV_386_2_FMA;
4988 5022
4989 5023 if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
4990 5024 hwcap_flags_2 |= AV_386_2_BMI1;
4991 5025 if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
4992 5026 hwcap_flags_2 |= AV_386_2_BMI2;
4993 5027 if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
4994 5028 hwcap_flags_2 |= AV_386_2_AVX2;
4995 5029 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
4996 5030 hwcap_flags_2 |= AV_386_2_AVX512F;
4997 5031 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
4998 5032 hwcap_flags_2 |= AV_386_2_AVX512DQ;
4999 5033 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5000 5034 hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5001 5035 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5002 5036 hwcap_flags_2 |= AV_386_2_AVX512PF;
5003 5037 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5004 5038 hwcap_flags_2 |= AV_386_2_AVX512ER;
5005 5039 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5006 5040 hwcap_flags_2 |= AV_386_2_AVX512CD;
5007 5041 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5008 5042 hwcap_flags_2 |= AV_386_2_AVX512BW;
5009 5043 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5010 5044 hwcap_flags_2 |= AV_386_2_AVX512VL;
5011 5045
5012 5046 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5013 5047 hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5014 5048 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5015 5049 hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5016 5050 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5017 5051 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5018 5052
5019 5053 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5020 5054 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5021 5055 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5022 5056 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5023 5057 }
5024 5058 }
5025 5059 if (*ecx & CPUID_INTC_ECX_VMX)
5026 5060 hwcap_flags |= AV_386_VMX;
5027 5061 if (*ecx & CPUID_INTC_ECX_POPCNT)
5028 5062 hwcap_flags |= AV_386_POPCNT;
5029 5063 if (*edx & CPUID_INTC_EDX_FPU)
5030 5064 hwcap_flags |= AV_386_FPU;
5031 5065 if (*edx & CPUID_INTC_EDX_MMX)
5032 5066 hwcap_flags |= AV_386_MMX;
5033 5067
5034 5068 if (*edx & CPUID_INTC_EDX_TSC)
5035 5069 hwcap_flags |= AV_386_TSC;
5036 5070 if (*edx & CPUID_INTC_EDX_CX8)
5037 5071 hwcap_flags |= AV_386_CX8;
5038 5072 if (*edx & CPUID_INTC_EDX_CMOV)
5039 5073 hwcap_flags |= AV_386_CMOV;
5040 5074 if (*ecx & CPUID_INTC_ECX_CX16)
5041 5075 hwcap_flags |= AV_386_CX16;
5042 5076
5043 5077 if (*ecx & CPUID_INTC_ECX_RDRAND)
5044 5078 hwcap_flags_2 |= AV_386_2_RDRAND;
5045 5079 if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5046 5080 hwcap_flags_2 |= AV_386_2_ADX;
5047 5081 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5048 5082 hwcap_flags_2 |= AV_386_2_RDSEED;
5049 5083 if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5050 5084 hwcap_flags_2 |= AV_386_2_SHA;
5051 5085 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5052 5086 hwcap_flags_2 |= AV_386_2_FSGSBASE;
5053 5087 if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5054 5088 hwcap_flags_2 |= AV_386_2_CLWB;
5055 5089 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5056 5090 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5057 5091
5058 5092 }
5059 5093 /*
5060 5094 * Check a few miscellaneous features.
5061 5095 */
5062 5096 if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5063 5097 hwcap_flags_2 |= AV_386_2_CLZERO;
5064 5098
5065 5099 if (cpi->cpi_xmaxeax < 0x80000001)
5066 5100 goto pass4_done;
5067 5101
5068 5102 switch (cpi->cpi_vendor) {
5069 5103 struct cpuid_regs cp;
5070 5104 uint32_t *edx, *ecx;
5071 5105
5072 5106 case X86_VENDOR_Intel:
5073 5107 /*
5074 5108 * Seems like Intel duplicated what was necessary
5075 5109 * here to make the initial crop of 64-bit OS's work.
5076 5110 * Hopefully, those are the only "extended" bits
5077 5111 * they'll add.
5078 5112 */
5079 5113 /*FALLTHROUGH*/
5080 5114
5081 5115 case X86_VENDOR_AMD:
5082 5116 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5083 5117 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5084 5118
5085 5119 *edx = CPI_FEATURES_XTD_EDX(cpi);
5086 5120 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5087 5121
5088 5122 /*
5089 5123 * [these features require explicit kernel support]
5090 5124 */
5091 5125 switch (cpi->cpi_vendor) {
5092 5126 case X86_VENDOR_Intel:
5093 5127 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5094 5128 *edx &= ~CPUID_AMD_EDX_TSCP;
5095 5129 break;
5096 5130
5097 5131 case X86_VENDOR_AMD:
5098 5132 if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5099 5133 *edx &= ~CPUID_AMD_EDX_TSCP;
5100 5134 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5101 5135 *ecx &= ~CPUID_AMD_ECX_SSE4A;
5102 5136 break;
5103 5137
5104 5138 default:
5105 5139 break;
5106 5140 }
5107 5141
5108 5142 /*
5109 5143 * [no explicit support required beyond
5110 5144 * x87 fp context and exception handlers]
5111 5145 */
5112 5146 if (!fpu_exists)
5113 5147 *edx &= ~(CPUID_AMD_EDX_MMXamd |
5114 5148 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5115 5149
5116 5150 if (!is_x86_feature(x86_featureset, X86FSET_NX))
5117 5151 *edx &= ~CPUID_AMD_EDX_NX;
5118 5152 #if !defined(__amd64)
5119 5153 *edx &= ~CPUID_AMD_EDX_LM;
5120 5154 #endif
5121 5155 /*
5122 5156 * Now map the supported feature vector to
5123 5157 * things that we think userland will care about.
5124 5158 */
5125 5159 #if defined(__amd64)
5126 5160 if (*edx & CPUID_AMD_EDX_SYSC)
5127 5161 hwcap_flags |= AV_386_AMD_SYSC;
5128 5162 #endif
5129 5163 if (*edx & CPUID_AMD_EDX_MMXamd)
5130 5164 hwcap_flags |= AV_386_AMD_MMX;
5131 5165 if (*edx & CPUID_AMD_EDX_3DNow)
5132 5166 hwcap_flags |= AV_386_AMD_3DNow;
5133 5167 if (*edx & CPUID_AMD_EDX_3DNowx)
5134 5168 hwcap_flags |= AV_386_AMD_3DNowx;
5135 5169 if (*ecx & CPUID_AMD_ECX_SVM)
5136 5170 hwcap_flags |= AV_386_AMD_SVM;
5137 5171
5138 5172 switch (cpi->cpi_vendor) {
5139 5173 case X86_VENDOR_AMD:
5140 5174 if (*edx & CPUID_AMD_EDX_TSCP)
5141 5175 hwcap_flags |= AV_386_TSCP;
5142 5176 if (*ecx & CPUID_AMD_ECX_AHF64)
5143 5177 hwcap_flags |= AV_386_AHF;
5144 5178 if (*ecx & CPUID_AMD_ECX_SSE4A)
5145 5179 hwcap_flags |= AV_386_AMD_SSE4A;
5146 5180 if (*ecx & CPUID_AMD_ECX_LZCNT)
5147 5181 hwcap_flags |= AV_386_AMD_LZCNT;
5148 5182 if (*ecx & CPUID_AMD_ECX_MONITORX)
5149 5183 hwcap_flags_2 |= AV_386_2_MONITORX;
5150 5184 break;
5151 5185
5152 5186 case X86_VENDOR_Intel:
5153 5187 if (*edx & CPUID_AMD_EDX_TSCP)
5154 5188 hwcap_flags |= AV_386_TSCP;
5155 5189 if (*ecx & CPUID_AMD_ECX_LZCNT)
5156 5190 hwcap_flags |= AV_386_AMD_LZCNT;
5157 5191 /*
5158 5192 * Aarrgh.
5159 5193 * Intel uses a different bit in the same word.
5160 5194 */
5161 5195 if (*ecx & CPUID_INTC_ECX_AHF64)
5162 5196 hwcap_flags |= AV_386_AHF;
5163 5197 break;
5164 5198
5165 5199 default:
5166 5200 break;
5167 5201 }
5168 5202 break;
5169 5203
5170 5204 case X86_VENDOR_TM:
5171 5205 cp.cp_eax = 0x80860001;
5172 5206 (void) __cpuid_insn(&cp);
5173 5207 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5174 5208 break;
5175 5209
5176 5210 default:
5177 5211 break;
5178 5212 }
5179 5213
5180 5214 pass4_done:
5181 5215 cpi->cpi_pass = 4;
5182 5216 if (hwcap_out != NULL) {
5183 5217 hwcap_out[0] = hwcap_flags;
5184 5218 hwcap_out[1] = hwcap_flags_2;
5185 5219 }
5186 5220 }
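
Editor's note: the two hwcap words assembled by cpuid_pass4() above are what userland ultimately sees through the auxiliary vector. As a hedged illustration (not part of this change), an illumos userland program can inspect them with getisax(3C); the exact header layout below is an assumption and may differ on other builds.

/* Hedged userland sketch: read the hwcap words that cpuid_pass4() feeds the aux vector. */
#include <sys/auxv.h>
#include <sys/auxv_386.h>
#include <stdio.h>

int
main(void)
{
	uint32_t hw[2] = { 0, 0 };

	(void) getisax(hw, 2);

	if (hw[0] & AV_386_SSE2)		/* first hwcap word */
		printf("SSE2 advertised\n");
	if (hw[1] & AV_386_2_RDSEED)		/* second hwcap word */
		printf("RDSEED advertised\n");
	return (0);
}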
5187 5221
5188 5222
5189 5223 /*
5190 5224 * Simulate the cpuid instruction using the data we previously
5191 5225 * captured about this CPU. We try our best to return the truth
5192 5226 * about the hardware, independently of kernel support.
5193 5227 */
5194 5228 uint32_t
5195 5229 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5196 5230 {
5197 5231 struct cpuid_info *cpi;
5198 5232 struct cpuid_regs *xcp;
5199 5233
5200 5234 if (cpu == NULL)
5201 5235 cpu = CPU;
5202 5236 cpi = cpu->cpu_m.mcpu_cpi;
5203 5237
5204 5238 ASSERT(cpuid_checkpass(cpu, 3));
5205 5239
5206 5240 /*
5207 5241 * CPUID data is cached in two separate places: cpi_std for standard
5208 5242 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5209 5243 */
5210 5244 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5211 5245 xcp = &cpi->cpi_std[cp->cp_eax];
5212 5246 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5213 5247 cp->cp_eax <= cpi->cpi_xmaxeax &&
5214 5248 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5215 5249 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5216 5250 } else {
5217 5251 /*
5218 5252 * The caller is asking for data from an input parameter which
5219 5253 * the kernel has not cached. In this case we go fetch from
5220 5254 * the hardware and return the data directly to the user.
5221 5255 */
5222 5256 return (__cpuid_insn(cp));
5223 5257 }
5224 5258
5225 5259 cp->cp_eax = xcp->cp_eax;
5226 5260 cp->cp_ebx = xcp->cp_ebx;
5227 5261 cp->cp_ecx = xcp->cp_ecx;
5228 5262 cp->cp_edx = xcp->cp_edx;
5229 5263 return (cp->cp_eax);
5230 5264 }
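
Editor's note: a caller that wants the cached view of a leaf simply fills in cp_eax and lets cpuid_insn() decide whether to serve it from cpi_std/cpi_extd or fall back to the hardware. The in-kernel caller below is hypothetical and assumes the same kernel headers as this file; it is a sketch, not code from this change.

/* Hypothetical in-kernel caller of cpuid_insn(), for illustration only. */
static uint32_t
example_read_leaf1(cpu_t *cpu)
{
	struct cpuid_regs cp = { 0 };

	cp.cp_eax = 1;			/* standard leaf 1: signature and features */
	(void) cpuid_insn(cpu, &cp);	/* served from cpi_std[1] once pass 3 is done */
	return (cp.cp_edx);		/* e.g. the EDX feature word */
}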
5231 5265
5232 5266 int
5233 5267 cpuid_checkpass(cpu_t *cpu, int pass)
5234 5268 {
5235 5269 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5236 5270 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5237 5271 }
5238 5272
5239 5273 int
5240 5274 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5241 5275 {
5242 5276 ASSERT(cpuid_checkpass(cpu, 3));
5243 5277
5244 5278 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5245 5279 }
5246 5280
5247 5281 int
5248 5282 cpuid_is_cmt(cpu_t *cpu)
5249 5283 {
5250 5284 if (cpu == NULL)
5251 5285 cpu = CPU;
5252 5286
5253 5287 ASSERT(cpuid_checkpass(cpu, 1));
5254 5288
5255 5289 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5256 5290 }
5257 5291
5258 5292 /*
5259 5293 * AMD and Intel both implement the 64-bit variant of the syscall
5260 5294 * instruction (syscallq), so if there's -any- support for syscall,
5261 5295 * cpuid currently says "yes, we support this".
5262 5296 *
5263 5297 * However, Intel decided to -not- implement the 32-bit variant of the
5264 5298 * syscall instruction, so we provide a predicate to allow our caller
5265 5299 * to test that subtlety here.
5266 5300 *
5267 5301 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5268 5302 * even in the case where the hardware would in fact support it.
5269 5303 */
5270 5304 /*ARGSUSED*/
5271 5305 int
5272 5306 cpuid_syscall32_insn(cpu_t *cpu)
5273 5307 {
5274 5308 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5275 5309
5276 5310 #if !defined(__xpv)
5277 5311 if (cpu == NULL)
5278 5312 cpu = CPU;
5279 5313
5280 5314 /*CSTYLED*/
5281 5315 {
5282 5316 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5283 5317
5284 5318 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5285 5319 cpi->cpi_xmaxeax >= 0x80000001 &&
5286 5320 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5287 5321 return (1);
5288 5322 }
5289 5323 #endif
5290 5324 return (0);
5291 5325 }
5292 5326
5293 5327 int
5294 5328 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5295 5329 {
5296 5330 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5297 5331
5298 5332 static const char fmt[] =
5299 5333 "x86 (%s %X family %d model %d step %d clock %d MHz)";
5300 5334 static const char fmt_ht[] =
5301 5335 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5302 5336
5303 5337 ASSERT(cpuid_checkpass(cpu, 1));
5304 5338
5305 5339 if (cpuid_is_cmt(cpu))
5306 5340 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5307 5341 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5308 5342 cpi->cpi_family, cpi->cpi_model,
5309 5343 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5310 5344 return (snprintf(s, n, fmt,
5311 5345 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5312 5346 cpi->cpi_family, cpi->cpi_model,
5313 5347 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5314 5348 }
5315 5349
5316 5350 const char *
5317 5351 cpuid_getvendorstr(cpu_t *cpu)
5318 5352 {
5319 5353 ASSERT(cpuid_checkpass(cpu, 1));
5320 5354 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5321 5355 }
5322 5356
5323 5357 uint_t
5324 5358 cpuid_getvendor(cpu_t *cpu)
5325 5359 {
5326 5360 ASSERT(cpuid_checkpass(cpu, 1));
5327 5361 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5328 5362 }
5329 5363
5330 5364 uint_t
5331 5365 cpuid_getfamily(cpu_t *cpu)
5332 5366 {
5333 5367 ASSERT(cpuid_checkpass(cpu, 1));
5334 5368 return (cpu->cpu_m.mcpu_cpi->cpi_family);
5335 5369 }
5336 5370
5337 5371 uint_t
5338 5372 cpuid_getmodel(cpu_t *cpu)
5339 5373 {
5340 5374 ASSERT(cpuid_checkpass(cpu, 1));
5341 5375 return (cpu->cpu_m.mcpu_cpi->cpi_model);
5342 5376 }
5343 5377
5344 5378 uint_t
5345 5379 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5346 5380 {
5347 5381 ASSERT(cpuid_checkpass(cpu, 1));
5348 5382 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5349 5383 }
5350 5384
5351 5385 uint_t
5352 5386 cpuid_get_ncore_per_chip(cpu_t *cpu)
5353 5387 {
5354 5388 ASSERT(cpuid_checkpass(cpu, 1));
5355 5389 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5356 5390 }
5357 5391
5358 5392 uint_t
5359 5393 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5360 5394 {
5361 5395 ASSERT(cpuid_checkpass(cpu, 2));
5362 5396 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5363 5397 }
5364 5398
5365 5399 id_t
5366 5400 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5367 5401 {
5368 5402 ASSERT(cpuid_checkpass(cpu, 2));
5369 5403 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5370 5404 }
5371 5405
5372 5406 uint_t
5373 5407 cpuid_getstep(cpu_t *cpu)
5374 5408 {
5375 5409 ASSERT(cpuid_checkpass(cpu, 1));
5376 5410 return (cpu->cpu_m.mcpu_cpi->cpi_step);
5377 5411 }
5378 5412
5379 5413 uint_t
5380 5414 cpuid_getsig(struct cpu *cpu)
5381 5415 {
5382 5416 ASSERT(cpuid_checkpass(cpu, 1));
5383 5417 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5384 5418 }
5385 5419
5386 5420 uint32_t
5387 5421 cpuid_getchiprev(struct cpu *cpu)
5388 5422 {
5389 5423 ASSERT(cpuid_checkpass(cpu, 1));
5390 5424 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5391 5425 }
5392 5426
5393 5427 const char *
5394 5428 cpuid_getchiprevstr(struct cpu *cpu)
5395 5429 {
5396 5430 ASSERT(cpuid_checkpass(cpu, 1));
5397 5431 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5398 5432 }
5399 5433
5400 5434 uint32_t
5401 5435 cpuid_getsockettype(struct cpu *cpu)
5402 5436 {
5403 5437 ASSERT(cpuid_checkpass(cpu, 1));
5404 5438 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5405 5439 }
5406 5440
5407 5441 const char *
5408 5442 cpuid_getsocketstr(cpu_t *cpu)
5409 5443 {
5410 5444 static const char *socketstr = NULL;
5411 5445 struct cpuid_info *cpi;
5412 5446
5413 5447 ASSERT(cpuid_checkpass(cpu, 1));
5414 5448 cpi = cpu->cpu_m.mcpu_cpi;
5415 5449
5416 5450 /* Assume that socket types are the same across the system */
5417 5451 if (socketstr == NULL)
5418 5452 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5419 5453 cpi->cpi_model, cpi->cpi_step);
5420 5454
5421 5455
5422 5456 return (socketstr);
5423 5457 }
5424 5458
5425 5459 int
5426 5460 cpuid_get_chipid(cpu_t *cpu)
5427 5461 {
5428 5462 ASSERT(cpuid_checkpass(cpu, 1));
5429 5463
5430 5464 if (cpuid_is_cmt(cpu))
5431 5465 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5432 5466 return (cpu->cpu_id);
5433 5467 }
5434 5468
5435 5469 id_t
5436 5470 cpuid_get_coreid(cpu_t *cpu)
5437 5471 {
5438 5472 ASSERT(cpuid_checkpass(cpu, 1));
5439 5473 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5440 5474 }
5441 5475
5442 5476 int
5443 5477 cpuid_get_pkgcoreid(cpu_t *cpu)
5444 5478 {
5445 5479 ASSERT(cpuid_checkpass(cpu, 1));
5446 5480 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5447 5481 }
5448 5482
5449 5483 int
5450 5484 cpuid_get_clogid(cpu_t *cpu)
5451 5485 {
5452 5486 ASSERT(cpuid_checkpass(cpu, 1));
5453 5487 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5454 5488 }
5455 5489
5456 5490 int
5457 5491 cpuid_get_cacheid(cpu_t *cpu)
5458 5492 {
5459 5493 ASSERT(cpuid_checkpass(cpu, 1));
5460 5494 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5461 5495 }
5462 5496
5463 5497 uint_t
5464 5498 cpuid_get_procnodeid(cpu_t *cpu)
5465 5499 {
5466 5500 ASSERT(cpuid_checkpass(cpu, 1));
5467 5501 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5468 5502 }
5469 5503
5470 5504 uint_t
5471 5505 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5472 5506 {
5473 5507 ASSERT(cpuid_checkpass(cpu, 1));
5474 5508 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5475 5509 }
5476 5510
5477 5511 uint_t
5478 5512 cpuid_get_compunitid(cpu_t *cpu)
5479 5513 {
5480 5514 ASSERT(cpuid_checkpass(cpu, 1));
5481 5515 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5482 5516 }
5483 5517
5484 5518 uint_t
5485 5519 cpuid_get_cores_per_compunit(cpu_t *cpu)
5486 5520 {
5487 5521 ASSERT(cpuid_checkpass(cpu, 1));
5488 5522 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5489 5523 }
5490 5524
5491 5525 /*ARGSUSED*/
5492 5526 int
5493 5527 cpuid_have_cr8access(cpu_t *cpu)
5494 5528 {
5495 5529 #if defined(__amd64)
5496 5530 return (1);
5497 5531 #else
5498 5532 struct cpuid_info *cpi;
5499 5533
5500 5534 ASSERT(cpu != NULL);
5501 5535 cpi = cpu->cpu_m.mcpu_cpi;
5502 5536 if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5503 5537 (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5504 5538 return (1);
5505 5539 return (0);
5506 5540 #endif
5507 5541 }
5508 5542
5509 5543 uint32_t
5510 5544 cpuid_get_apicid(cpu_t *cpu)
5511 5545 {
5512 5546 ASSERT(cpuid_checkpass(cpu, 1));
5513 5547 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5514 5548 return (UINT32_MAX);
5515 5549 } else {
5516 5550 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5517 5551 }
5518 5552 }
5519 5553
5520 5554 void
5521 5555 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5522 5556 {
5523 5557 struct cpuid_info *cpi;
5524 5558
5525 5559 if (cpu == NULL)
5526 5560 cpu = CPU;
5527 5561 cpi = cpu->cpu_m.mcpu_cpi;
5528 5562
5529 5563 ASSERT(cpuid_checkpass(cpu, 1));
5530 5564
5531 5565 if (pabits)
5532 5566 *pabits = cpi->cpi_pabits;
5533 5567 if (vabits)
5534 5568 *vabits = cpi->cpi_vabits;
5535 5569 }
5536 5570
5537 5571 size_t
5538 5572 cpuid_get_xsave_size()
5539 5573 {
5540 5574 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5541 5575 sizeof (struct xsave_state)));
5542 5576 }
5543 5577
5544 5578 /*
5545 5579 * Return true if the CPUs on this system require 'pointer clearing' for the
5546 5580 * floating point error pointer exception handling. In the past, this has been
5547 5581 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5548 5582 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5549 5583 * feature bit and is reflected in the cpi_fp_amd_save member.
5550 5584 */
5551 5585 boolean_t
5552 5586 cpuid_need_fp_excp_handling()
5553 5587 {
5554 5588 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5555 5589 cpuid_info0.cpi_fp_amd_save != 0);
5556 5590 }
5557 5591
5558 5592 /*
5559 5593 * Returns the number of data TLB entries for a corresponding
5560 5594 * pagesize. If it can't be computed, or isn't known, the
5561 5595 * routine returns zero. If you ask about an architecturally
5562 5596 * impossible pagesize, the routine will panic (so that the
5563 5597 * hat implementor knows that things are inconsistent.)
5564 5598 */
5565 5599 uint_t
5566 5600 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5567 5601 {
5568 5602 struct cpuid_info *cpi;
5569 5603 uint_t dtlb_nent = 0;
5570 5604
5571 5605 if (cpu == NULL)
5572 5606 cpu = CPU;
5573 5607 cpi = cpu->cpu_m.mcpu_cpi;
5574 5608
5575 5609 ASSERT(cpuid_checkpass(cpu, 1));
5576 5610
5577 5611 /*
5578 5612 * Check the L2 TLB info
5579 5613 */
5580 5614 if (cpi->cpi_xmaxeax >= 0x80000006) {
5581 5615 struct cpuid_regs *cp = &cpi->cpi_extd[6];
5582 5616
5583 5617 switch (pagesize) {
5584 5618
5585 5619 case 4 * 1024:
5586 5620 /*
5587 5621 * All zero in the top 16 bits of the register
5588 5622 * indicates a unified TLB. Size is in low 16 bits.
5589 5623 */
5590 5624 if ((cp->cp_ebx & 0xffff0000) == 0)
5591 5625 dtlb_nent = cp->cp_ebx & 0x0000ffff;
5592 5626 else
5593 5627 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5594 5628 break;
5595 5629
5596 5630 case 2 * 1024 * 1024:
5597 5631 if ((cp->cp_eax & 0xffff0000) == 0)
5598 5632 dtlb_nent = cp->cp_eax & 0x0000ffff;
5599 5633 else
5600 5634 dtlb_nent = BITX(cp->cp_eax, 27, 16);
5601 5635 break;
5602 5636
5603 5637 default:
5604 5638 panic("unknown L2 pagesize");
5605 5639 /*NOTREACHED*/
5606 5640 }
5607 5641 }
5608 5642
5609 5643 if (dtlb_nent != 0)
5610 5644 return (dtlb_nent);
5611 5645
5612 5646 /*
5613 5647 * No L2 TLB support for this size, try L1.
5614 5648 */
5615 5649 if (cpi->cpi_xmaxeax >= 0x80000005) {
5616 5650 struct cpuid_regs *cp = &cpi->cpi_extd[5];
5617 5651
5618 5652 switch (pagesize) {
5619 5653 case 4 * 1024:
5620 5654 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5621 5655 break;
5622 5656 case 2 * 1024 * 1024:
5623 5657 dtlb_nent = BITX(cp->cp_eax, 23, 16);
5624 5658 break;
5625 5659 default:
5626 5660 panic("unknown L1 d-TLB pagesize");
5627 5661 /*NOTREACHED*/
5628 5662 }
5629 5663 }
5630 5664
5631 5665 return (dtlb_nent);
5632 5666 }
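
Editor's note: the 0x80000006 decoding in cpuid_get_dtlb_nent() above distinguishes a unified L2 TLB (top 16 bits of the register all zero, count in the low 16 bits) from a split one (data-TLB count in bits 27:16). The userland sketch below repeats that decode for the 4K page size; it assumes GCC/Clang's <cpuid.h> and is illustrative only.

/* Illustrative decode of the L2 4K dTLB entry count from CPUID 0x80000006. */
#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx, nent;

	if (!__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx))
		return (1);

	if ((ebx & 0xffff0000U) == 0)		/* unified L2 TLB, size in low 16 bits */
		nent = ebx & 0x0000ffffU;
	else					/* split TLB: data entries in bits 27:16 */
		nent = (ebx >> 16) & 0x0fffU;

	printf("L2 4K dTLB entries: %u\n", nent);
	return (0);
}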
5633 5667
5634 5668 /*
5635 5669 * Return 0 if the erratum is not present or not applicable, positive
5636 5670 * if it is, and negative if the status of the erratum is unknown.
5637 5671 *
5638 5672 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5639 5673 * Processors" #25759, Rev 3.57, August 2005
5640 5674 */
5641 5675 int
5642 5676 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5643 5677 {
5644 5678 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5645 5679 uint_t eax;
5646 5680
5647 5681 /*
5648 5682 * Bail out if this CPU isn't an AMD CPU, or if it's
5649 5683 * a legacy (32-bit) AMD CPU.
5650 5684 */
5651 5685 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5652 5686 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5653 5687 cpi->cpi_family == 6) {
5654 5688 return (0);
5655 5689 }
5656 5690
5657 5691 eax = cpi->cpi_std[1].cp_eax;
5658 5692
5659 5693 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
5660 5694 #define SH_B3(eax) (eax == 0xf51)
5661 5695 #define B(eax) (SH_B0(eax) || SH_B3(eax))
5662 5696
5663 5697 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
5664 5698
5665 5699 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5666 5700 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5667 5701 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
5668 5702 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5669 5703
5670 5704 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5671 5705 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
5672 5706 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
5673 5707 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5674 5708
5675 5709 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5676 5710 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
5677 5711 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
5678 5712 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
5679 5713 #define BH_E4(eax) (eax == 0x20fb1)
5680 5714 #define SH_E5(eax) (eax == 0x20f42)
5681 5715 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
5682 5716 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
5683 5717 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5684 5718 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5685 5719 DH_E6(eax) || JH_E6(eax))
5686 5720
5687 5721 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5688 5722 #define DR_B0(eax) (eax == 0x100f20)
5689 5723 #define DR_B1(eax) (eax == 0x100f21)
5690 5724 #define DR_BA(eax) (eax == 0x100f2a)
5691 5725 #define DR_B2(eax) (eax == 0x100f22)
5692 5726 #define DR_B3(eax) (eax == 0x100f23)
5693 5727 #define RB_C0(eax) (eax == 0x100f40)
5694 5728
5695 5729 switch (erratum) {
5696 5730 case 1:
5697 5731 return (cpi->cpi_family < 0x10);
5698 5732 case 51: /* what does the asterisk mean? */
5699 5733 return (B(eax) || SH_C0(eax) || CG(eax));
5700 5734 case 52:
5701 5735 return (B(eax));
5702 5736 case 57:
5703 5737 return (cpi->cpi_family <= 0x11);
5704 5738 case 58:
5705 5739 return (B(eax));
5706 5740 case 60:
5707 5741 return (cpi->cpi_family <= 0x11);
5708 5742 case 61:
5709 5743 case 62:
5710 5744 case 63:
5711 5745 case 64:
5712 5746 case 65:
5713 5747 case 66:
5714 5748 case 68:
5715 5749 case 69:
5716 5750 case 70:
5717 5751 case 71:
5718 5752 return (B(eax));
5719 5753 case 72:
5720 5754 return (SH_B0(eax));
5721 5755 case 74:
5722 5756 return (B(eax));
5723 5757 case 75:
5724 5758 return (cpi->cpi_family < 0x10);
5725 5759 case 76:
5726 5760 return (B(eax));
5727 5761 case 77:
5728 5762 return (cpi->cpi_family <= 0x11);
5729 5763 case 78:
5730 5764 return (B(eax) || SH_C0(eax));
5731 5765 case 79:
5732 5766 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5733 5767 case 80:
5734 5768 case 81:
5735 5769 case 82:
5736 5770 return (B(eax));
5737 5771 case 83:
5738 5772 return (B(eax) || SH_C0(eax) || CG(eax));
5739 5773 case 85:
5740 5774 return (cpi->cpi_family < 0x10);
5741 5775 case 86:
5742 5776 return (SH_C0(eax) || CG(eax));
5743 5777 case 88:
5744 5778 #if !defined(__amd64)
5745 5779 return (0);
5746 5780 #else
5747 5781 return (B(eax) || SH_C0(eax));
5748 5782 #endif
5749 5783 case 89:
5750 5784 return (cpi->cpi_family < 0x10);
5751 5785 case 90:
5752 5786 return (B(eax) || SH_C0(eax) || CG(eax));
5753 5787 case 91:
5754 5788 case 92:
5755 5789 return (B(eax) || SH_C0(eax));
5756 5790 case 93:
5757 5791 return (SH_C0(eax));
5758 5792 case 94:
5759 5793 return (B(eax) || SH_C0(eax) || CG(eax));
5760 5794 case 95:
5761 5795 #if !defined(__amd64)
5762 5796 return (0);
5763 5797 #else
5764 5798 return (B(eax) || SH_C0(eax));
5765 5799 #endif
5766 5800 case 96:
5767 5801 return (B(eax) || SH_C0(eax) || CG(eax));
5768 5802 case 97:
5769 5803 case 98:
5770 5804 return (SH_C0(eax) || CG(eax));
5771 5805 case 99:
5772 5806 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5773 5807 case 100:
5774 5808 return (B(eax) || SH_C0(eax));
5775 5809 case 101:
5776 5810 case 103:
5777 5811 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5778 5812 case 104:
5779 5813 return (SH_C0(eax) || CG(eax) || D0(eax));
5780 5814 case 105:
5781 5815 case 106:
5782 5816 case 107:
5783 5817 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5784 5818 case 108:
5785 5819 return (DH_CG(eax));
5786 5820 case 109:
5787 5821 return (SH_C0(eax) || CG(eax) || D0(eax));
5788 5822 case 110:
5789 5823 return (D0(eax) || EX(eax));
5790 5824 case 111:
5791 5825 return (CG(eax));
5792 5826 case 112:
5793 5827 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5794 5828 case 113:
5795 5829 return (eax == 0x20fc0);
5796 5830 case 114:
5797 5831 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5798 5832 case 115:
5799 5833 return (SH_E0(eax) || JH_E1(eax));
5800 5834 case 116:
5801 5835 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5802 5836 case 117:
5803 5837 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5804 5838 case 118:
5805 5839 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5806 5840 JH_E6(eax));
5807 5841 case 121:
5808 5842 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5809 5843 case 122:
5810 5844 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5811 5845 case 123:
5812 5846 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5813 5847 case 131:
5814 5848 return (cpi->cpi_family < 0x10);
5815 5849 case 6336786:
5816 5850
5817 5851 /*
5818 5852 * Test for AdvPowerMgmtInfo.TscPStateInvariant
5819 5853 * if this is a K8 family or newer processor. We're testing for
5820 5854 * this 'erratum' to determine whether or not we have a constant
5821 5855 * TSC.
5822 5856 *
5823 5857 * Our current fix for this is to disable the C1-Clock ramping.
5824 5858 * However, this doesn't work on newer processor families nor
5825 5859 * does it work when virtualized as those devices don't exist.
5826 5860 */
5827 5861 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5828 5862 return (0);
5829 5863 }
5830 5864
5831 5865 if (CPI_FAMILY(cpi) == 0xf) {
5832 5866 struct cpuid_regs regs;
5833 5867 regs.cp_eax = 0x80000007;
5834 5868 (void) __cpuid_insn(&regs);
5835 5869 return (!(regs.cp_edx & 0x100));
5836 5870 }
5837 5871 return (0);
5838 5872 case 6323525:
5839 5873 /*
5840 5874 * This erratum (K8 #147) is not present on family 10 and newer.
5841 5875 */
5842 5876 if (cpi->cpi_family >= 0x10) {
5843 5877 return (0);
5844 5878 }
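/*
 * The expression below reconstructs a (family << 8) | model value from
 * the base and extended family/model fields of eax; having already
 * excluded family 0x10 and newer above, the < 0xf40 test covers parts
 * below family 0xf, model 0x40.
 */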
5845 5879 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5846 5880 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5847 5881
5848 5882 case 6671130:
5849 5883 /*
5850 5884 * check for processors (pre-Shanghai) that do not provide
5851 5885 * optimal management of 1gb ptes in their tlb.
5852 5886 */
5853 5887 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5854 5888
5855 5889 case 298:
5856 5890 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5857 5891 DR_B2(eax) || RB_C0(eax));
5858 5892
5859 5893 case 721:
5860 5894 #if defined(__amd64)
5861 5895 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5862 5896 #else
5863 5897 return (0);
5864 5898 #endif
5865 5899
5866 5900 default:
5867 5901 return (-1);
5868 5902
5869 5903 }
5870 5904 }
5871 5905
5872 5906 /*
5873 5907 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5874 5908 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5875 5909 */
5876 5910 int
5877 5911 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5878 5912 {
5879 5913 struct cpuid_info *cpi;
5880 5914 uint_t osvwid;
5881 5915 static int osvwfeature = -1;
5882 5916 uint64_t osvwlength;
5883 5917
5884 5918
5885 5919 cpi = cpu->cpu_m.mcpu_cpi;
5886 5920
5887 5921 /* confirm OSVW supported */
5888 5922 if (osvwfeature == -1) {
5889 5923 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5890 5924 } else {
5891 5925 /* assert that osvw feature setting is consistent on all cpus */
5892 5926 ASSERT(osvwfeature ==
5893 5927 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5894 5928 }
5895 5929 if (!osvwfeature)
5896 5930 return (-1);
5897 5931
5898 5932 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5899 5933
5900 5934 switch (erratum) {
5901 5935 case 298: /* osvwid is 0 */
5902 5936 osvwid = 0;
5903 5937 if (osvwlength <= (uint64_t)osvwid) {
5904 5938 /* osvwid 0 is unknown */
5905 5939 return (-1);
5906 5940 }
5907 5941
5908 5942 /*
5909 5943 * Check the OSVW STATUS MSR to determine the state
5910 5944 * of the erratum where:
5911 5945 * 0 - fixed by HW
5912 5946 * 1 - BIOS has applied the workaround when BIOS
5913 5947 * workaround is available. (Or for other errata,
5914 5948 * OS workaround is required.)
5915 5949 * For a value of 1, caller will confirm that the
5916 5950 * erratum 298 workaround has indeed been applied by BIOS.
5917 5951 *
5918 5952 * A 1 may be set in cpus that have a HW fix
5919 5953 * in a mixed cpu system. Regarding erratum 298:
5920 5954 * In a multiprocessor platform, the workaround above
5921 5955 * should be applied to all processors regardless of
5922 5956 * silicon revision when an affected processor is
5923 5957 * present.
5924 5958 */
5925 5959
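/*
 * Each OSVW status MSR carries OSVW_ID_CNT_PER_MSR status bits, so the
 * MSR offset is (osvwid / OSVW_ID_CNT_PER_MSR) and the bit within that
 * MSR is (osvwid % OSVW_ID_CNT_PER_MSR).
 */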
5926 5960 return (rdmsr(MSR_AMD_OSVW_STATUS +
5927 5961 (osvwid / OSVW_ID_CNT_PER_MSR)) &
5928 5962 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5929 5963
5930 5964 default:
5931 5965 return (-1);
5932 5966 }
5933 5967 }
5934 5968
5935 5969 static const char assoc_str[] = "associativity";
5936 5970 static const char line_str[] = "line-size";
5937 5971 static const char size_str[] = "size";
5938 5972
5939 5973 static void
5940 5974 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5941 5975 uint32_t val)
5942 5976 {
5943 5977 char buf[128];
5944 5978
5945 5979 /*
5946 5980 * ndi_prop_update_int() is used because it is desirable for
5947 5981 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
5948 5982 */
5949 5983 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5950 5984 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5951 5985 }
5952 5986
5953 5987 /*
5954 5988 * Intel-style cache/tlb description
5955 5989 *
5956 5990 * Standard cpuid level 2 gives a randomly ordered
5957 5991 * selection of tags that index into a table that describes
5958 5992 * cache and tlb properties.
5959 5993 */
5960 5994
5961 5995 static const char l1_icache_str[] = "l1-icache";
5962 5996 static const char l1_dcache_str[] = "l1-dcache";
5963 5997 static const char l2_cache_str[] = "l2-cache";
5964 5998 static const char l3_cache_str[] = "l3-cache";
5965 5999 static const char itlb4k_str[] = "itlb-4K";
5966 6000 static const char dtlb4k_str[] = "dtlb-4K";
5967 6001 static const char itlb2M_str[] = "itlb-2M";
5968 6002 static const char itlb4M_str[] = "itlb-4M";
5969 6003 static const char dtlb4M_str[] = "dtlb-4M";
5970 6004 static const char dtlb24_str[] = "dtlb0-2M-4M";
5971 6005 static const char itlb424_str[] = "itlb-4K-2M-4M";
5972 6006 static const char itlb24_str[] = "itlb-2M-4M";
5973 6007 static const char dtlb44_str[] = "dtlb-4K-4M";
5974 6008 static const char sl1_dcache_str[] = "sectored-l1-dcache";
5975 6009 static const char sl2_cache_str[] = "sectored-l2-cache";
5976 6010 static const char itrace_str[] = "itrace-cache";
5977 6011 static const char sl3_cache_str[] = "sectored-l3-cache";
5978 6012 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
5979 6013
5980 6014 static const struct cachetab {
5981 6015 uint8_t ct_code;
5982 6016 uint8_t ct_assoc;
5983 6017 uint16_t ct_line_size;
5984 6018 size_t ct_size;
5985 6019 const char *ct_label;
5986 6020 } intel_ctab[] = {
5987 6021 /*
5988 6022 * maintain descending order!
5989 6023 *
5990 6024 * Codes ignored - Reason
5991 6025 * ----------------------
5992 6026 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
5993 6027 * f0H/f1H - Currently we do not interpret prefetch size by design
5994 6028 */
5995 6029 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
5996 6030 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
5997 6031 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
5998 6032 { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
5999 6033 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6000 6034 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6001 6035 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6002 6036 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6003 6037 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6004 6038 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6005 6039 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6006 6040 { 0xd0, 4, 64, 512*1024, l3_cache_str},
6007 6041 { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6008 6042 { 0xc0, 4, 0, 8, dtlb44_str },
6009 6043 { 0xba, 4, 0, 64, dtlb4k_str },
6010 6044 { 0xb4, 4, 0, 256, dtlb4k_str },
6011 6045 { 0xb3, 4, 0, 128, dtlb4k_str },
6012 6046 { 0xb2, 4, 0, 64, itlb4k_str },
6013 6047 { 0xb0, 4, 0, 128, itlb4k_str },
6014 6048 { 0x87, 8, 64, 1024*1024, l2_cache_str},
6015 6049 { 0x86, 4, 64, 512*1024, l2_cache_str},
6016 6050 { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6017 6051 { 0x84, 8, 32, 1024*1024, l2_cache_str},
6018 6052 { 0x83, 8, 32, 512*1024, l2_cache_str},
6019 6053 { 0x82, 8, 32, 256*1024, l2_cache_str},
6020 6054 { 0x80, 8, 64, 512*1024, l2_cache_str},
6021 6055 { 0x7f, 2, 64, 512*1024, l2_cache_str},
6022 6056 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6023 6057 { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6024 6058 { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6025 6059 { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6026 6060 { 0x79, 8, 64, 128*1024, sl2_cache_str},
6027 6061 { 0x78, 8, 64, 1024*1024, l2_cache_str},
6028 6062 { 0x73, 8, 0, 64*1024, itrace_str},
6029 6063 { 0x72, 8, 0, 32*1024, itrace_str},
6030 6064 { 0x71, 8, 0, 16*1024, itrace_str},
6031 6065 { 0x70, 8, 0, 12*1024, itrace_str},
6032 6066 { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6033 6067 { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6034 6068 { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6035 6069 { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6036 6070 { 0x5d, 0, 0, 256, dtlb44_str},
6037 6071 { 0x5c, 0, 0, 128, dtlb44_str},
6038 6072 { 0x5b, 0, 0, 64, dtlb44_str},
6039 6073 { 0x5a, 4, 0, 32, dtlb24_str},
6040 6074 { 0x59, 0, 0, 16, dtlb4k_str},
6041 6075 { 0x57, 4, 0, 16, dtlb4k_str},
6042 6076 { 0x56, 4, 0, 16, dtlb4M_str},
6043 6077 { 0x55, 0, 0, 7, itlb24_str},
6044 6078 { 0x52, 0, 0, 256, itlb424_str},
6045 6079 { 0x51, 0, 0, 128, itlb424_str},
6046 6080 { 0x50, 0, 0, 64, itlb424_str},
6047 6081 { 0x4f, 0, 0, 32, itlb4k_str},
6048 6082 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6049 6083 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6050 6084 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6051 6085 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6052 6086 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6053 6087 { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6054 6088 { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6055 6089 { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6056 6090 { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6057 6091 { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6058 6092 { 0x44, 4, 32, 1024*1024, l2_cache_str},
6059 6093 { 0x43, 4, 32, 512*1024, l2_cache_str},
6060 6094 { 0x42, 4, 32, 256*1024, l2_cache_str},
6061 6095 { 0x41, 4, 32, 128*1024, l2_cache_str},
6062 6096 { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6063 6097 { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6064 6098 { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6065 6099 { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6066 6100 { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6067 6101 { 0x39, 4, 64, 128*1024, sl2_cache_str},
6068 6102 { 0x30, 8, 64, 32*1024, l1_icache_str},
6069 6103 { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6070 6104 { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6071 6105 { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6072 6106 { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6073 6107 { 0x22, 4, 64, 512*1024, sl3_cache_str},
6074 6108 { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6075 6109 { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6076 6110 { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6077 6111 { 0x0b, 4, 0, 4, itlb4M_str},
6078 6112 { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6079 6113 { 0x08, 4, 32, 16*1024, l1_icache_str},
6080 6114 { 0x06, 4, 32, 8*1024, l1_icache_str},
6081 6115 { 0x05, 4, 0, 32, dtlb4M_str},
6082 6116 { 0x04, 4, 0, 8, dtlb4M_str},
6083 6117 { 0x03, 4, 0, 64, dtlb4k_str},
6084 6118 { 0x02, 4, 0, 2, itlb4M_str},
6085 6119 { 0x01, 4, 0, 32, itlb4k_str},
6086 6120 { 0 }
6087 6121 };
6088 6122
6089 6123 static const struct cachetab cyrix_ctab[] = {
6090 6124 { 0x70, 4, 0, 32, "tlb-4K" },
6091 6125 { 0x80, 4, 16, 16*1024, "l1-cache" },
6092 6126 { 0 }
6093 6127 };
6094 6128
6095 6129 /*
6096 6130 * Search a cache table for a matching entry
6097 6131 */
6098 6132 static const struct cachetab *
6099 6133 find_cacheent(const struct cachetab *ct, uint_t code)
6100 6134 {
6101 6135 if (code != 0) {
6102 6136 for (; ct->ct_code != 0; ct++)
6103 6137 if (ct->ct_code <= code)
6104 6138 break;
6105 6139 if (ct->ct_code == code)
6106 6140 return (ct);
6107 6141 }
6108 6142 return (NULL);
6109 6143 }
6110 6144
6111 6145 /*
6112 6146 * Populate cachetab entry with L2 or L3 cache-information using
6113 6147 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6114 6148 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6115 6149 * information is found.
6116 6150 */
6117 6151 static int
6118 6152 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6119 6153 {
6120 6154 uint32_t level, i;
6121 6155 int ret = 0;
6122 6156
6123 6157 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6124 6158 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6125 6159
6126 6160 if (level == 2 || level == 3) {
6127 6161 ct->ct_assoc =
6128 6162 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6129 6163 ct->ct_line_size =
6130 6164 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
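/*
 * Leaf 4 reports each of these fields minus one, so the total size
 * computed below is ways * partitions * line size * sets, with
 * cp_ecx + 1 giving the number of sets.
 */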
6131 6165 ct->ct_size = ct->ct_assoc *
6132 6166 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6133 6167 ct->ct_line_size *
6134 6168 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6135 6169
6136 6170 if (level == 2) {
6137 6171 ct->ct_label = l2_cache_str;
6138 6172 } else if (level == 3) {
6139 6173 ct->ct_label = l3_cache_str;
6140 6174 }
6141 6175 ret = 1;
6142 6176 }
6143 6177 }
6144 6178
6145 6179 return (ret);
6146 6180 }
6147 6181
6148 6182 /*
6149 6183 * Walk the cacheinfo descriptor, applying 'func' to every valid element
6150 6184 * The walk is terminated if the walker returns non-zero.
6151 6185 */
6152 6186 static void
6153 6187 intel_walk_cacheinfo(struct cpuid_info *cpi,
6154 6188 void *arg, int (*func)(void *, const struct cachetab *))
6155 6189 {
6156 6190 const struct cachetab *ct;
6157 6191 struct cachetab des_49_ct, des_b1_ct;
6158 6192 uint8_t *dp;
6159 6193 int i;
6160 6194
6161 6195 if ((dp = cpi->cpi_cacheinfo) == NULL)
6162 6196 return;
6163 6197 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6164 6198 /*
6165 6199 * For overloaded descriptor 0x49 we use cpuid function 4
6166 6200 * if supported by the current processor, to create
6167 6201 * cache information.
6168 6202 * For overloaded descriptor 0xb1 we use X86_PAE flag
6169 6203 * to disambiguate the cache information.
6170 6204 */
6171 6205 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6172 6206 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6173 6207 ct = &des_49_ct;
6174 6208 } else if (*dp == 0xb1) {
6175 6209 des_b1_ct.ct_code = 0xb1;
6176 6210 des_b1_ct.ct_assoc = 4;
6177 6211 des_b1_ct.ct_line_size = 0;
6178 6212 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6179 6213 des_b1_ct.ct_size = 8;
6180 6214 des_b1_ct.ct_label = itlb2M_str;
6181 6215 } else {
6182 6216 des_b1_ct.ct_size = 4;
6183 6217 des_b1_ct.ct_label = itlb4M_str;
6184 6218 }
6185 6219 ct = &des_b1_ct;
6186 6220 } else {
6187 6221 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6188 6222 continue;
6189 6223 }
6190 6224 }
6191 6225
6192 6226 if (func(arg, ct) != 0) {
6193 6227 break;
6194 6228 }
6195 6229 }
6196 6230 }
6197 6231
6198 6232 /*
6199 6233 * (Like the Intel one, except for Cyrix CPUs)
6200 6234 */
6201 6235 static void
6202 6236 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6203 6237 void *arg, int (*func)(void *, const struct cachetab *))
6204 6238 {
6205 6239 const struct cachetab *ct;
6206 6240 uint8_t *dp;
6207 6241 int i;
6208 6242
6209 6243 if ((dp = cpi->cpi_cacheinfo) == NULL)
6210 6244 return;
6211 6245 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6212 6246 /*
6213 6247 * Search Cyrix-specific descriptor table first ..
6214 6248 */
6215 6249 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6216 6250 if (func(arg, ct) != 0)
6217 6251 break;
6218 6252 continue;
6219 6253 }
6220 6254 /*
6221 6255 * .. else fall back to the Intel one
6222 6256 */
6223 6257 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6224 6258 if (func(arg, ct) != 0)
6225 6259 break;
6226 6260 continue;
6227 6261 }
6228 6262 }
6229 6263 }
6230 6264
6231 6265 /*
6232 6266 * A cacheinfo walker that adds associativity, line-size, and size properties
6233 6267 * to the devinfo node it is passed as an argument.
6234 6268 */
6235 6269 static int
6236 6270 add_cacheent_props(void *arg, const struct cachetab *ct)
6237 6271 {
6238 6272 dev_info_t *devi = arg;
6239 6273
6240 6274 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6241 6275 if (ct->ct_line_size != 0)
6242 6276 add_cache_prop(devi, ct->ct_label, line_str,
6243 6277 ct->ct_line_size);
6244 6278 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6245 6279 return (0);
6246 6280 }
6247 6281
6248 6282
6249 6283 static const char fully_assoc[] = "fully-associative?";
6250 6284
6251 6285 /*
6252 6286 * AMD style cache/tlb description
6253 6287 *
6254 6288 * Extended functions 5 and 6 directly describe properties of
6255 6289 * tlbs and various cache levels.
6256 6290 */
6257 6291 static void
6258 6292 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6259 6293 {
6260 6294 switch (assoc) {
6261 6295 case 0: /* reserved; ignore */
6262 6296 break;
6263 6297 default:
6264 6298 add_cache_prop(devi, label, assoc_str, assoc);
6265 6299 break;
6266 6300 case 0xff:
6267 6301 add_cache_prop(devi, label, fully_assoc, 1);
6268 6302 break;
6269 6303 }
6270 6304 }
6271 6305
6272 6306 static void
6273 6307 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6274 6308 {
6275 6309 if (size == 0)
6276 6310 return;
6277 6311 add_cache_prop(devi, label, size_str, size);
6278 6312 add_amd_assoc(devi, label, assoc);
6279 6313 }
6280 6314
6281 6315 static void
6282 6316 add_amd_cache(dev_info_t *devi, const char *label,
6283 6317 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6284 6318 {
6285 6319 if (size == 0 || line_size == 0)
6286 6320 return;
6287 6321 add_amd_assoc(devi, label, assoc);
6288 6322 /*
6289 6323 * Most AMD parts have a sectored cache. Multiple cache lines are
6290 6324 * associated with each tag. A sector consists of all cache lines
6291 6325 * associated with a tag. For example, the AMD K6-III has a sector
6292 6326 * size of 2 cache lines per tag.
6293 6327 */
6294 6328 if (lines_per_tag != 0)
6295 6329 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6296 6330 add_cache_prop(devi, label, line_str, line_size);
6297 6331 add_cache_prop(devi, label, size_str, size * 1024);
6298 6332 }
6299 6333
6300 6334 static void
6301 6335 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6302 6336 {
6303 6337 switch (assoc) {
6304 6338 case 0: /* off */
6305 6339 break;
6306 6340 case 1:
6307 6341 case 2:
6308 6342 case 4:
6309 6343 add_cache_prop(devi, label, assoc_str, assoc);
6310 6344 break;
6311 6345 case 6:
6312 6346 add_cache_prop(devi, label, assoc_str, 8);
6313 6347 break;
6314 6348 case 8:
6315 6349 add_cache_prop(devi, label, assoc_str, 16);
6316 6350 break;
6317 6351 case 0xf:
6318 6352 add_cache_prop(devi, label, fully_assoc, 1);
6319 6353 break;
6320 6354 default: /* reserved; ignore */
6321 6355 break;
6322 6356 }
6323 6357 }
6324 6358
6325 6359 static void
6326 6360 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6327 6361 {
6328 6362 if (size == 0 || assoc == 0)
6329 6363 return;
6330 6364 add_amd_l2_assoc(devi, label, assoc);
6331 6365 add_cache_prop(devi, label, size_str, size);
6332 6366 }
6333 6367
6334 6368 static void
6335 6369 add_amd_l2_cache(dev_info_t *devi, const char *label,
6336 6370 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6337 6371 {
6338 6372 if (size == 0 || assoc == 0 || line_size == 0)
6339 6373 return;
6340 6374 add_amd_l2_assoc(devi, label, assoc);
6341 6375 if (lines_per_tag != 0)
6342 6376 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6343 6377 add_cache_prop(devi, label, line_str, line_size);
6344 6378 add_cache_prop(devi, label, size_str, size * 1024);
6345 6379 }
6346 6380
6347 6381 static void
6348 6382 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6349 6383 {
6350 6384 struct cpuid_regs *cp;
6351 6385
6352 6386 if (cpi->cpi_xmaxeax < 0x80000005)
6353 6387 return;
6354 6388 cp = &cpi->cpi_extd[5];
6355 6389
6356 6390 /*
6357 6391 * 4M/2M L1 TLB configuration
6358 6392 *
6359 6393 * We report the size for 2M pages because AMD uses two
6360 6394 * TLB entries for one 4M page.
6361 6395 */
6362 6396 add_amd_tlb(devi, "dtlb-2M",
6363 6397 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6364 6398 add_amd_tlb(devi, "itlb-2M",
6365 6399 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6366 6400
6367 6401 /*
6368 6402 * 4K L1 TLB configuration
6369 6403 */
6370 6404
6371 6405 switch (cpi->cpi_vendor) {
6372 6406 uint_t nentries;
6373 6407 case X86_VENDOR_TM:
6374 6408 if (cpi->cpi_family >= 5) {
6375 6409 /*
6376 6410 * Crusoe processors have 256 TLB entries, but
6377 6411 * cpuid data format constrains them to only
6378 6412 * reporting 255 of them.
6379 6413 */
6380 6414 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6381 6415 nentries = 256;
6382 6416 /*
6383 6417 * Crusoe processors also have a unified TLB
6384 6418 */
6385 6419 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6386 6420 nentries);
6387 6421 break;
6388 6422 }
6389 6423 /*FALLTHROUGH*/
6390 6424 default:
6391 6425 add_amd_tlb(devi, itlb4k_str,
6392 6426 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6393 6427 add_amd_tlb(devi, dtlb4k_str,
6394 6428 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6395 6429 break;
6396 6430 }
6397 6431
6398 6432 /*
6399 6433 * data L1 cache configuration
6400 6434 */
6401 6435
6402 6436 add_amd_cache(devi, l1_dcache_str,
6403 6437 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6404 6438 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6405 6439
6406 6440 /*
6407 6441 * code L1 cache configuration
6408 6442 */
6409 6443
6410 6444 add_amd_cache(devi, l1_icache_str,
6411 6445 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6412 6446 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6413 6447
6414 6448 if (cpi->cpi_xmaxeax < 0x80000006)
6415 6449 return;
6416 6450 cp = &cpi->cpi_extd[6];
6417 6451
6418 6452 /* Check for a unified L2 TLB for large pages */
6419 6453
6420 6454 if (BITX(cp->cp_eax, 31, 16) == 0)
6421 6455 add_amd_l2_tlb(devi, "l2-tlb-2M",
6422 6456 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6423 6457 else {
6424 6458 add_amd_l2_tlb(devi, "l2-dtlb-2M",
6425 6459 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6426 6460 add_amd_l2_tlb(devi, "l2-itlb-2M",
6427 6461 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6428 6462 }
6429 6463
6430 6464 /* Check for a unified L2 TLB for 4K pages */
6431 6465
6432 6466 if (BITX(cp->cp_ebx, 31, 16) == 0) {
6433 6467 add_amd_l2_tlb(devi, "l2-tlb-4K",
6434 6468 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6435 6469 } else {
6436 6470 add_amd_l2_tlb(devi, "l2-dtlb-4K",
6437 6471 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6438 6472 add_amd_l2_tlb(devi, "l2-itlb-4K",
6439 6473 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6440 6474 }
6441 6475
6442 6476 add_amd_l2_cache(devi, l2_cache_str,
6443 6477 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6444 6478 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6445 6479 }
6446 6480
6447 6481 /*
6448 6482 * There are two basic ways that the x86 world describes its cache
6449 6483 * and tlb architecture - Intel's way and AMD's way.
6450 6484 *
6451 6485 * Return which flavor of cache architecture we should use
6452 6486 */
6453 6487 static int
6454 6488 x86_which_cacheinfo(struct cpuid_info *cpi)
6455 6489 {
6456 6490 switch (cpi->cpi_vendor) {
6457 6491 case X86_VENDOR_Intel:
6458 6492 if (cpi->cpi_maxeax >= 2)
6459 6493 return (X86_VENDOR_Intel);
6460 6494 break;
6461 6495 case X86_VENDOR_AMD:
6462 6496 /*
6463 6497 * The K5 model 1 was the first part from AMD that reported
6464 6498 * cache sizes via extended cpuid functions.
6465 6499 */
6466 6500 if (cpi->cpi_family > 5 ||
6467 6501 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6468 6502 return (X86_VENDOR_AMD);
6469 6503 break;
6470 6504 case X86_VENDOR_TM:
6471 6505 if (cpi->cpi_family >= 5)
6472 6506 return (X86_VENDOR_AMD);
6473 6507 /*FALLTHROUGH*/
6474 6508 default:
6475 6509 /*
6476 6510 * If they have extended CPU data for 0x80000005
6477 6511 * then we assume they have AMD-format cache
6478 6512 * information.
6479 6513 *
6480 6514 * If not, and the vendor happens to be Cyrix,
6481 6515 * then try our Cyrix-specific handler.
6482 6516 *
6483 6517 * If we're not Cyrix, then assume we're using Intel's
6484 6518 * table-driven format instead.
6485 6519 */
6486 6520 if (cpi->cpi_xmaxeax >= 0x80000005)
6487 6521 return (X86_VENDOR_AMD);
6488 6522 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6489 6523 return (X86_VENDOR_Cyrix);
6490 6524 else if (cpi->cpi_maxeax >= 2)
6491 6525 return (X86_VENDOR_Intel);
6492 6526 break;
6493 6527 }
6494 6528 return (-1);
6495 6529 }
6496 6530
6497 6531 void
6498 6532 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6499 6533 struct cpuid_info *cpi)
6500 6534 {
6501 6535 dev_info_t *cpu_devi;
6502 6536 int create;
6503 6537
6504 6538 cpu_devi = (dev_info_t *)dip;
6505 6539
6506 6540 /* device_type */
6507 6541 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6508 6542 "device_type", "cpu");
6509 6543
6510 6544 /* reg */
6511 6545 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6512 6546 "reg", cpu_id);
6513 6547
6514 6548 /* cpu-mhz, and clock-frequency */
6515 6549 if (cpu_freq > 0) {
6516 6550 long long mul;
6517 6551
6518 6552 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6519 6553 "cpu-mhz", cpu_freq);
6520 6554 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6521 6555 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6522 6556 "clock-frequency", (int)mul);
6523 6557 }
6524 6558
6525 6559 if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6526 6560 return;
6527 6561 }
6528 6562
6529 6563 /* vendor-id */
6530 6564 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6531 6565 "vendor-id", cpi->cpi_vendorstr);
6532 6566
6533 6567 if (cpi->cpi_maxeax == 0) {
6534 6568 return;
6535 6569 }
6536 6570
6537 6571 /*
6538 6572 * family, model, and step
6539 6573 */
6540 6574 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6541 6575 "family", CPI_FAMILY(cpi));
6542 6576 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6543 6577 "cpu-model", CPI_MODEL(cpi));
6544 6578 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6545 6579 "stepping-id", CPI_STEP(cpi));
6546 6580
6547 6581 /* type */
6548 6582 switch (cpi->cpi_vendor) {
6549 6583 case X86_VENDOR_Intel:
6550 6584 create = 1;
6551 6585 break;
6552 6586 default:
6553 6587 create = 0;
6554 6588 break;
6555 6589 }
6556 6590 if (create)
6557 6591 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6558 6592 "type", CPI_TYPE(cpi));
6559 6593
6560 6594 /* ext-family */
6561 6595 switch (cpi->cpi_vendor) {
6562 6596 case X86_VENDOR_Intel:
6563 6597 case X86_VENDOR_AMD:
6564 6598 create = cpi->cpi_family >= 0xf;
6565 6599 break;
6566 6600 default:
6567 6601 create = 0;
6568 6602 break;
6569 6603 }
6570 6604 if (create)
6571 6605 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6572 6606 "ext-family", CPI_FAMILY_XTD(cpi));
6573 6607
6574 6608 /* ext-model */
6575 6609 switch (cpi->cpi_vendor) {
6576 6610 case X86_VENDOR_Intel:
6577 6611 create = IS_EXTENDED_MODEL_INTEL(cpi);
6578 6612 break;
6579 6613 case X86_VENDOR_AMD:
6580 6614 create = CPI_FAMILY(cpi) == 0xf;
6581 6615 break;
6582 6616 default:
6583 6617 create = 0;
6584 6618 break;
6585 6619 }
6586 6620 if (create)
6587 6621 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6588 6622 "ext-model", CPI_MODEL_XTD(cpi));
6589 6623
6590 6624 /* generation */
6591 6625 switch (cpi->cpi_vendor) {
6592 6626 case X86_VENDOR_AMD:
6593 6627 /*
6594 6628 * AMD K5 model 1 was the first part to support this
6595 6629 */
6596 6630 create = cpi->cpi_xmaxeax >= 0x80000001;
6597 6631 break;
6598 6632 default:
6599 6633 create = 0;
6600 6634 break;
6601 6635 }
6602 6636 if (create)
6603 6637 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6604 6638 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6605 6639
6606 6640 /* brand-id */
6607 6641 switch (cpi->cpi_vendor) {
6608 6642 case X86_VENDOR_Intel:
6609 6643 /*
6610 6644 * brand id first appeared on Pentium III Xeon model 8,
6611 6645 * and Celeron model 8 processors and Opteron
6612 6646 */
6613 6647 create = cpi->cpi_family > 6 ||
6614 6648 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6615 6649 break;
6616 6650 case X86_VENDOR_AMD:
6617 6651 create = cpi->cpi_family >= 0xf;
6618 6652 break;
6619 6653 default:
6620 6654 create = 0;
6621 6655 break;
6622 6656 }
6623 6657 if (create && cpi->cpi_brandid != 0) {
6624 6658 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6625 6659 "brand-id", cpi->cpi_brandid);
6626 6660 }
6627 6661
6628 6662 /* chunks, and apic-id */
6629 6663 switch (cpi->cpi_vendor) {
6630 6664 /*
6631 6665 * first available on Pentium IV and Opteron (K8)
6632 6666 */
6633 6667 case X86_VENDOR_Intel:
6634 6668 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6635 6669 break;
6636 6670 case X86_VENDOR_AMD:
6637 6671 create = cpi->cpi_family >= 0xf;
6638 6672 break;
6639 6673 default:
6640 6674 create = 0;
6641 6675 break;
6642 6676 }
6643 6677 if (create) {
6644 6678 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6645 6679 "chunks", CPI_CHUNKS(cpi));
6646 6680 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6647 6681 "apic-id", cpi->cpi_apicid);
6648 6682 if (cpi->cpi_chipid >= 0) {
6649 6683 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6650 6684 "chip#", cpi->cpi_chipid);
6651 6685 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6652 6686 "clog#", cpi->cpi_clogid);
6653 6687 }
6654 6688 }
6655 6689
6656 6690 /* cpuid-features */
6657 6691 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6658 6692 "cpuid-features", CPI_FEATURES_EDX(cpi));
6659 6693
6660 6694
6661 6695 /* cpuid-features-ecx */
6662 6696 switch (cpi->cpi_vendor) {
6663 6697 case X86_VENDOR_Intel:
6664 6698 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6665 6699 break;
6666 6700 case X86_VENDOR_AMD:
6667 6701 create = cpi->cpi_family >= 0xf;
6668 6702 break;
6669 6703 default:
6670 6704 create = 0;
6671 6705 break;
6672 6706 }
6673 6707 if (create)
6674 6708 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6675 6709 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6676 6710
6677 6711 /* ext-cpuid-features */
6678 6712 switch (cpi->cpi_vendor) {
6679 6713 case X86_VENDOR_Intel:
6680 6714 case X86_VENDOR_AMD:
6681 6715 case X86_VENDOR_Cyrix:
6682 6716 case X86_VENDOR_TM:
6683 6717 case X86_VENDOR_Centaur:
6684 6718 create = cpi->cpi_xmaxeax >= 0x80000001;
6685 6719 break;
6686 6720 default:
6687 6721 create = 0;
6688 6722 break;
6689 6723 }
6690 6724 if (create) {
6691 6725 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6692 6726 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6693 6727 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6694 6728 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6695 6729 }
6696 6730
6697 6731 /*
6698 6732 * Brand String first appeared in Intel Pentium IV, AMD K5
6699 6733 * model 1, and Cyrix GXm. On earlier models we try and
6700 6734 * simulate something similar .. so this string should always
6701 6735 * say -something- about the processor, however lame.
6702 6736 */
6703 6737 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6704 6738 "brand-string", cpi->cpi_brandstr);
6705 6739
6706 6740 /*
6707 6741 * Finally, cache and tlb information
6708 6742 */
6709 6743 switch (x86_which_cacheinfo(cpi)) {
6710 6744 case X86_VENDOR_Intel:
6711 6745 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6712 6746 break;
6713 6747 case X86_VENDOR_Cyrix:
6714 6748 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6715 6749 break;
6716 6750 case X86_VENDOR_AMD:
6717 6751 amd_cache_info(cpi, cpu_devi);
6718 6752 break;
6719 6753 default:
6720 6754 break;
6721 6755 }
6722 6756 }
6723 6757
6724 6758 struct l2info {
6725 6759 int *l2i_csz;
6726 6760 int *l2i_lsz;
6727 6761 int *l2i_assoc;
6728 6762 int l2i_ret;
6729 6763 };
6730 6764
6731 6765 /*
6732 6766 * A cacheinfo walker that fetches the size, line-size and associativity
6733 6767 * of the L2 cache
6734 6768 */
6735 6769 static int
6736 6770 intel_l2cinfo(void *arg, const struct cachetab *ct)
6737 6771 {
6738 6772 struct l2info *l2i = arg;
6739 6773 int *ip;
6740 6774
6741 6775 if (ct->ct_label != l2_cache_str &&
6742 6776 ct->ct_label != sl2_cache_str)
6743 6777 return (0); /* not an L2 -- keep walking */
6744 6778
6745 6779 if ((ip = l2i->l2i_csz) != NULL)
6746 6780 *ip = ct->ct_size;
6747 6781 if ((ip = l2i->l2i_lsz) != NULL)
6748 6782 *ip = ct->ct_line_size;
6749 6783 if ((ip = l2i->l2i_assoc) != NULL)
6750 6784 *ip = ct->ct_assoc;
6751 6785 l2i->l2i_ret = ct->ct_size;
6752 6786 return (1); /* was an L2 -- terminate walk */
6753 6787 }
6754 6788
6755 6789 /*
6756 6790 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6757 6791 *
6758 6792 * Unlike the associativity for the L1 cache and tlb where the 8 bit
6759 6793 * value is the associativity, the associativity for the L2 cache and
6760 6794 * tlb is encoded in the following table. The 4 bit L2 value serves as
6761 6795 * an index into the amd_afd[] array to determine the associativity.
6762 6796 * -1 is undefined. 0 is fully associative.
6763 6797 */
6764 6798
6765 6799 static int amd_afd[] =
6766 6800 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
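/*
 * For example, a raw field value of 6 maps to amd_afd[6] == 8 (8-way) and
 * 0xf maps to amd_afd[15] == 0 (fully associative); amd_l2cacheinfo() below
 * indexes this array with BITX(cp->cp_ecx, 15, 12).
 */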
6767 6801
6768 6802 static void
6769 6803 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6770 6804 {
6771 6805 struct cpuid_regs *cp;
6772 6806 uint_t size, assoc;
6773 6807 int i;
6774 6808 int *ip;
6775 6809
6776 6810 if (cpi->cpi_xmaxeax < 0x80000006)
6777 6811 return;
6778 6812 cp = &cpi->cpi_extd[6];
6779 6813
6780 6814 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6781 6815 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6782 6816 uint_t cachesz = size * 1024;
6783 6817 assoc = amd_afd[i];
6784 6818
6785 6819 ASSERT(assoc != -1);
6786 6820
6787 6821 if ((ip = l2i->l2i_csz) != NULL)
6788 6822 *ip = cachesz;
6789 6823 if ((ip = l2i->l2i_lsz) != NULL)
6790 6824 *ip = BITX(cp->cp_ecx, 7, 0);
6791 6825 if ((ip = l2i->l2i_assoc) != NULL)
6792 6826 *ip = assoc;
6793 6827 l2i->l2i_ret = cachesz;
6794 6828 }
6795 6829 }
6796 6830
6797 6831 int
6798 6832 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6799 6833 {
6800 6834 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6801 6835 struct l2info __l2info, *l2i = &__l2info;
6802 6836
6803 6837 l2i->l2i_csz = csz;
6804 6838 l2i->l2i_lsz = lsz;
6805 6839 l2i->l2i_assoc = assoc;
6806 6840 l2i->l2i_ret = -1;
6807 6841
6808 6842 switch (x86_which_cacheinfo(cpi)) {
6809 6843 case X86_VENDOR_Intel:
6810 6844 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6811 6845 break;
6812 6846 case X86_VENDOR_Cyrix:
6813 6847 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6814 6848 break;
6815 6849 case X86_VENDOR_AMD:
6816 6850 amd_l2cacheinfo(cpi, l2i);
6817 6851 break;
6818 6852 default:
6819 6853 break;
6820 6854 }
6821 6855 return (l2i->l2i_ret);
6822 6856 }
6823 6857
6824 6858 #if !defined(__xpv)
6825 6859
6826 6860 uint32_t *
6827 6861 cpuid_mwait_alloc(cpu_t *cpu)
6828 6862 {
6829 6863 uint32_t *ret;
6830 6864 size_t mwait_size;
6831 6865
6832 6866 ASSERT(cpuid_checkpass(CPU, 2));
6833 6867
6834 6868 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6835 6869 if (mwait_size == 0)
6836 6870 return (NULL);
6837 6871
6838 6872 /*
6839 6873 * kmem_alloc() returns cache line size aligned data for mwait_size
6840 6874 * allocations. mwait_size is currently cache line sized. Neither
6841 6875 * of these implementation details is guaranteed to be true in the
6842 6876 * future.
6843 6877 *
6844 6878 * First try allocating mwait_size as kmem_alloc() currently returns
6845 6879 * correctly aligned memory. If kmem_alloc() does not return
6846 6880 * mwait_size aligned memory, then use mwait_size ROUNDUP.
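 * For example, with a (power-of-two) 64-byte mwait_size, the fallback
 * allocates 128 bytes and rounds the pointer up with P2ROUNDUP, which
 * always yields an aligned, mwait_size-long region inside the buffer.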
6847 6881 *
6848 6882 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6849 6883 * decide to free this memory.
6850 6884 */
6851 6885 ret = kmem_zalloc(mwait_size, KM_SLEEP);
6852 6886 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6853 6887 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6854 6888 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6855 6889 *ret = MWAIT_RUNNING;
6856 6890 return (ret);
6857 6891 } else {
6858 6892 kmem_free(ret, mwait_size);
6859 6893 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6860 6894 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6861 6895 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6862 6896 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6863 6897 *ret = MWAIT_RUNNING;
6864 6898 return (ret);
6865 6899 }
6866 6900 }
6867 6901
6868 6902 void
6869 6903 cpuid_mwait_free(cpu_t *cpu)
6870 6904 {
6871 6905 if (cpu->cpu_m.mcpu_cpi == NULL) {
6872 6906 return;
6873 6907 }
6874 6908
6875 6909 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6876 6910 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6877 6911 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6878 6912 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6879 6913 }
6880 6914
6881 6915 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6882 6916 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6883 6917 }
6884 6918
6885 6919 void
6886 6920 patch_tsc_read(int flag)
6887 6921 {
6888 6922 size_t cnt;
6889 6923
6890 6924 switch (flag) {
6891 6925 case TSC_NONE:
6892 6926 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6893 6927 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6894 6928 break;
6895 6929 case TSC_RDTSC_MFENCE:
6896 6930 cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6897 6931 (void) memcpy((void *)tsc_read,
6898 6932 (void *)&_tsc_mfence_start, cnt);
6899 6933 break;
6900 6934 case TSC_RDTSC_LFENCE:
6901 6935 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6902 6936 (void) memcpy((void *)tsc_read,
6903 6937 (void *)&_tsc_lfence_start, cnt);
6904 6938 break;
6905 6939 case TSC_TSCP:
6906 6940 cnt = &_tscp_end - &_tscp_start;
6907 6941 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6908 6942 break;
6909 6943 default:
6910 6944 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6911 6945 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
6912 6946 break;
6913 6947 }
6914 6948 tsc_type = flag;
6915 6949 }
6916 6950
6917 6951 int
6918 6952 cpuid_deep_cstates_supported(void)
6919 6953 {
6920 6954 struct cpuid_info *cpi;
6921 6955 struct cpuid_regs regs;
6922 6956
6923 6957 ASSERT(cpuid_checkpass(CPU, 1));
6924 6958
6925 6959 cpi = CPU->cpu_m.mcpu_cpi;
6926 6960
6927 6961 if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6928 6962 return (0);
6929 6963
6930 6964 switch (cpi->cpi_vendor) {
6931 6965 case X86_VENDOR_Intel:
6932 6966 if (cpi->cpi_xmaxeax < 0x80000007)
6933 6967 return (0);
6934 6968
6935 6969 /*
6936 6970 * Does the TSC run at a constant rate in all ACPI C-states?
6937 6971 */
6938 6972 regs.cp_eax = 0x80000007;
6939 6973 (void) __cpuid_insn(&regs);
6940 6974 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6941 6975
6942 6976 default:
6943 6977 return (0);
6944 6978 }
6945 6979 }
6946 6980
6947 6981 #endif /* !__xpv */
6948 6982
6949 6983 void
6950 6984 post_startup_cpu_fixups(void)
6951 6985 {
6952 6986 #ifndef __xpv
6953 6987 /*
6954 6988 * Some AMD processors support C1E state. Entering this state will
6955 6989 * cause the local APIC timer to stop, which we can't deal with at
6956 6990 * this time.
6957 6991 */
6958 6992 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6959 6993 on_trap_data_t otd;
6960 6994 uint64_t reg;
6961 6995
6962 6996 if (!on_trap(&otd, OT_DATA_ACCESS)) {
6963 6997 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6964 6998 /* Disable C1E state if it is enabled by BIOS */
6965 6999 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
6966 7000 AMD_ACTONCMPHALT_MASK) {
6967 7001 reg &= ~(AMD_ACTONCMPHALT_MASK <<
6968 7002 AMD_ACTONCMPHALT_SHIFT);
6969 7003 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
6970 7004 }
6971 7005 }
6972 7006 no_trap();
6973 7007 }
6974 7008 #endif /* !__xpv */
6975 7009 }
6976 7010
6977 7011 void
6978 7012 enable_pcid(void)
6979 7013 {
6980 7014 if (x86_use_pcid == -1)
6981 7015 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
6982 7016
6983 7017 if (x86_use_invpcid == -1) {
6984 7018 x86_use_invpcid = is_x86_feature(x86_featureset,
6985 7019 X86FSET_INVPCID);
6986 7020 }
6987 7021
6988 7022 if (!x86_use_pcid)
6989 7023 return;
6990 7024
6991 7025 /*
6992 7026 * Intel says that on setting PCIDE, it immediately starts using the PCID
6993 7027 * bits; better make sure there's nothing there.
6994 7028 */
6995 7029 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
6996 7030
6997 7031 setcr4(getcr4() | CR4_PCIDE);
6998 7032 }
6999 7033
7000 7034 /*
7001 7035 * Setup necessary registers to enable XSAVE feature on this processor.
7002 7036 * This function needs to be called early enough, so that no xsave/xrstor
7003 7037 * ops will execute on the processor before the MSRs are properly set up.
7004 7038 *
7005 7039 * Current implementation has the following assumption:
7006 7040 * - cpuid_pass1() is done, so that X86 features are known.
7007 7041 * - fpu_probe() is done, so that fp_save_mech is chosen.
7008 7042 */
7009 7043 void
7010 7044 xsave_setup_msr(cpu_t *cpu)
7011 7045 {
7012 7046 ASSERT(fp_save_mech == FP_XSAVE);
7013 7047 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7014 7048
7015 7049 /* Enable OSXSAVE in CR4. */
7016 7050 setcr4(getcr4() | CR4_OSXSAVE);
7017 7051 /*
7018 7052 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7019 7053 * correct value.
7020 7054 */
7021 7055 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7022 7056 setup_xfem();
7023 7057 }
7024 7058
7025 7059 /*
7026 7060 * Starting with the Westmere processor the local
7027 7061 * APIC timer will continue running in all C-states,
7028 7062 * including the deepest C-states.
7029 7063 */
7030 7064 int
7031 7065 cpuid_arat_supported(void)
7032 7066 {
7033 7067 struct cpuid_info *cpi;
7034 7068 struct cpuid_regs regs;
7035 7069
7036 7070 ASSERT(cpuid_checkpass(CPU, 1));
7037 7071 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7038 7072
7039 7073 cpi = CPU->cpu_m.mcpu_cpi;
7040 7074
7041 7075 switch (cpi->cpi_vendor) {
7042 7076 case X86_VENDOR_Intel:
7043 7077 /*
7044 7078 * Always-running Local APIC Timer is
7045 7079 * indicated by CPUID.6.EAX[2].
7046 7080 */
7047 7081 if (cpi->cpi_maxeax >= 6) {
7048 7082 regs.cp_eax = 6;
7049 7083 (void) cpuid_insn(NULL, &regs);
7050 7084 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7051 7085 } else {
7052 7086 return (0);
7053 7087 }
7054 7088 default:
7055 7089 return (0);
7056 7090 }
7057 7091 }
7058 7092
7059 7093 /*
7060 7094 * Check support for Intel ENERGY_PERF_BIAS feature
7061 7095 */
7062 7096 int
7063 7097 cpuid_iepb_supported(struct cpu *cp)
7064 7098 {
7065 7099 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7066 7100 struct cpuid_regs regs;
7067 7101
7068 7102 ASSERT(cpuid_checkpass(cp, 1));
7069 7103
7070 7104 if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7071 7105 !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7072 7106 return (0);
7073 7107 }
7074 7108
7075 7109 /*
7076 7110 * Intel ENERGY_PERF_BIAS MSR is indicated by
7077 7111 * capability bit CPUID.6.ECX.3
7078 7112 */
7079 7113 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7080 7114 return (0);
7081 7115
7082 7116 regs.cp_eax = 0x6;
7083 7117 (void) cpuid_insn(NULL, &regs);
7084 7118 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7085 7119 }
7086 7120
7087 7121 /*
7088 7122 * Check support for TSC deadline timer
7089 7123 *
7090 7124 * TSC deadline timer provides a superior software programming
7091 7125 * model over local APIC timer that eliminates "time drifts".
7092 7126 * Instead of specifying a relative time, software specifies an
7093 7127 * absolute time as the target at which the processor should
7094 7128 * generate a timer event.
7095 7129 */
7096 7130 int
7097 7131 cpuid_deadline_tsc_supported(void)
7098 7132 {
7099 7133 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7100 7134 struct cpuid_regs regs;
7101 7135
7102 7136 ASSERT(cpuid_checkpass(CPU, 1));
7103 7137 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7104 7138
7105 7139 switch (cpi->cpi_vendor) {
7106 7140 case X86_VENDOR_Intel:
7107 7141 if (cpi->cpi_maxeax >= 1) {
7108 7142 regs.cp_eax = 1;
7109 7143 (void) cpuid_insn(NULL, &regs);
7110 7144 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7111 7145 } else {
7112 7146 return (0);
7113 7147 }
7114 7148 default:
7115 7149 return (0);
7116 7150 }
7117 7151 }
7118 7152
7119 7153 #if defined(__amd64) && !defined(__xpv)
7120 7154 /*
7121 7155 * Patch in versions of bcopy for high performance Intel Nhm processors
7122 7156 * and later...
7123 7157 */
7124 7158 void
7125 7159 patch_memops(uint_t vendor)
7126 7160 {
7127 7161 size_t cnt, i;
7128 7162 caddr_t to, from;
7129 7163
7130 7164 if ((vendor == X86_VENDOR_Intel) &&
7131 7165 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7132 7166 cnt = &bcopy_patch_end - &bcopy_patch_start;
7133 7167 to = &bcopy_ck_size;
7134 7168 from = &bcopy_patch_start;
7135 7169 for (i = 0; i < cnt; i++) {
7136 7170 *to++ = *from++;
7137 7171 }
7138 7172 }
7139 7173 }
7140 7174 #endif /* __amd64 && !__xpv */
7141 7175
7142 7176 /*
7143 7177 * We're being asked to tell the system how many bits are required to represent
7144 7178 * the various thread and strand IDs. While it's tempting to derive this based
7145 7179 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7146 7180 * correct. Instead, this needs to be based on the number of bits that the APIC
7147 7181 * allows for these different configurations. We only update these to a larger
7148 7182 * value if we find one.
7149 7183 */
7150 7184 void
7151 7185 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7152 7186 {
7153 7187 struct cpuid_info *cpi;
7154 7188
7155 7189 VERIFY(cpuid_checkpass(CPU, 1));
7156 7190 cpi = cpu->cpu_m.mcpu_cpi;
7157 7191
7158 7192 if (cpi->cpi_ncore_bits > *core_nbits) {
7159 7193 *core_nbits = cpi->cpi_ncore_bits;
7160 7194 }
7161 7195
7162 7196 if (cpi->cpi_nthread_bits > *strand_nbits) {
7163 7197 *strand_nbits = cpi->cpi_nthread_bits;
7164 7198 }
7165 7199 }
7166 7200
7167 7201 void
7168 7202 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7169 7203 {
7170 7204 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7171 7205 struct cpuid_regs cp;
7172 7206
7173 7207 /*
7174 7208 * Reread the CPUID portions that we need for various security
7175 7209 * information.
7176 7210 */
7177 7211 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7178 7212 /*
7179 7213 * Check if we now have leaf 7 available to us.
7180 7214 */
7181 7215 if (cpi->cpi_maxeax < 7) {
7182 7216 bzero(&cp, sizeof (cp));
7183 7217 cp.cp_eax = 0;
7184 7218 cpi->cpi_maxeax = __cpuid_insn(&cp);
7185 7219 if (cpi->cpi_maxeax < 7)
7186 7220 return;
7187 7221 }
7188 7222
7189 7223 bzero(&cp, sizeof (cp));
7190 7224 cp.cp_eax = 7;
7191 7225 cp.cp_ecx = 0;
7192 7226 (void) __cpuid_insn(&cp);
7193 7227 cpi->cpi_std[7] = cp;
7194 7228 } else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
7195 7229 /* No xcpuid support */
7196 7230 if (cpi->cpi_family < 5 ||
7197 7231 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7198 7232 return;
7199 7233
7200 7234 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7201 7235 bzero(&cp, sizeof (cp));
7202 7236 cp.cp_eax = CPUID_LEAF_EXT_0;
7203 7237 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7204 7238 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7205 7239 return;
7206 7240 }
7207 7241 }
7208 7242
7209 7243 bzero(&cp, sizeof (cp));
7210 7244 cp.cp_eax = CPUID_LEAF_EXT_8;
7211 7245 (void) __cpuid_insn(&cp);
7212 7246 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7213 7247 cpi->cpi_extd[8] = cp;
7214 7248 } else {
7215 7249 /*
7216 7250 * Nothing to do here. Return an empty set which has already
7217 7251 * been zeroed for us.
7218 7252 */
7219 7253 return;
7220 7254 }
7221 7255 cpuid_scan_security(cpu, fset);
7222 7256 }
7223 7257
7224 7258 /* ARGSUSED */
7225 7259 static int
7226 7260 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7227 7261 {
7228 7262 uchar_t *fset;
7229 7263 boolean_t first_pass = (boolean_t)arg1;
7230 7264
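/* arg0 points at an NCPU-sized array of feature sets; take this CPU's slot */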
7231 7265 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7232 7266 if (first_pass && CPU->cpu_id != 0)
7233 7267 return (0);
7234 7268 if (!first_pass && CPU->cpu_id == 0)
7235 7269 return (0);
7236 7270 cpuid_pass_ucode(CPU, fset);
7237 7271
7238 7272 return (0);
7239 7273 }
7240 7274
7241 7275 /*
7242 7276 * After a microcode update where the version has changed, we need to
7243 7277 * rescan CPUID. To do this we check every CPU to make sure that they have the
7244 7278 * same microcode. Then we perform a cross call to all such CPUs. It's the
7245 7279 * caller's job to make sure that no one else can end up doing an update while
7246 7280 * this is going on.
7247 7281 *
7248 7282 * We assume that the system is microcode capable if we're called.
7249 7283 */
7250 7284 void
7251 7285 cpuid_post_ucodeadm(void)
7252 7286 {
7253 7287 uint32_t rev;
7254 7288 int i;
7255 7289 struct cpu *cpu;
7256 7290 cpuset_t cpuset;
7257 7291 void *argdata;
7258 7292 uchar_t *f0;
7259 7293
7260 7294 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7261 7295
7262 7296 mutex_enter(&cpu_lock);
7263 7297 cpu = cpu_get(0);
7264 7298 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7265 7299 CPUSET_ONLY(cpuset, 0);
7266 7300 for (i = 1; i < max_ncpus; i++) {
7267 7301 if ((cpu = cpu_get(i)) == NULL)
7268 7302 continue;
7269 7303
7270 7304 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7271 7305 panic("post microcode update CPU %d has differing "
7272 7306 "microcode revision (%u) from CPU 0 (%u)",
7273 7307 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7274 7308 }
7275 7309 CPUSET_ADD(cpuset, i);
7276 7310 }
7277 7311
7278 7312 /*
7279 7313 * We do the cross calls in two passes. The first pass is only for the
7280 7314 * boot CPU. The second pass is for all of the other CPUs. This allows
7281 7315 * the boot CPU to go through and change behavior related to patching or
7282 7316 * whether or not Enhanced IBRS needs to be enabled and then allow all
7283 7317 * other CPUs to follow suit.
7284 7318 */
7285 7319 kpreempt_disable();
7286 7320 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7287 7321 cpuid_post_ucodeadm_xc);
7288 7322 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7289 7323 cpuid_post_ucodeadm_xc);
7290 7324 kpreempt_enable();
7291 7325
7292 7326 /*
7293 7327 * OK, now look at each CPU and see if their feature sets are equal.
7294 7328 */
7295 7329 f0 = argdata;
7296 7330 for (i = 1; i < max_ncpus; i++) {
7297 7331 uchar_t *fset;
7298 7332 if (!CPU_IN_SET(cpuset, i))
7299 7333 continue;
7300 7334
7301 7335 fset = (uchar_t *)((uintptr_t)argdata +
7302 7336 sizeof (x86_featureset) * i);
7303 7337
7304 7338 if (!compare_x86_featureset(f0, fset)) {
7305 7339 panic("Post microcode update CPU %d has "
7306 7340 "differing security feature (%p) set from CPU 0 "
7307 7341 "(%p), not appending to feature set", i,
7308 7342 (void *)fset, (void *)f0);
7309 7343 }
7310 7344 }
7311 7345
7312 7346 mutex_exit(&cpu_lock);
7313 7347
7314 7348 for (i = 0; i < NUM_X86_FEATURES; i++) {
7315 7349 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7316 7350 x86_feature_names[i]);
7317 7351 if (is_x86_feature(f0, i)) {
7318 7352 add_x86_feature(x86_featureset, i);
7319 7353 }
7320 7354 }
7321 7355 kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7322 7356 }