11787 Kernel needs to be built with retpolines
11788 Kernel needs to generally use RSB stuffing
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: John Levon <john.levon@joyent.com>


 880  *
 881  * The second means is through the auxiliary vector. The auxiliary vector is a
 882  * series of tagged data that the kernel passes down to a user program when it
 883  * begins executing. This information is used to indicate to programs what
 884  * instruction set extensions are present. For example, information about the
 885  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
 886  * since user programs cannot make use of it. However, things like the AVX
 887  * instruction sets are. Programs use this information to make run-time
 888  * decisions about what features they should use. As an example, the run-time
 889  * link-editor (rtld) can relocate different functions depending on the hardware
 890  * support available.
 891  *
 892  * The final form is through a series of accessor functions that all have the
 893  * form cpuid_get*. This is used by a number of different subsystems in the
 894  * kernel to determine more detailed information about what we're running on,
 895  * topology information, etc. Some of these subsystems include processor groups
 896  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
 897  * microcode, and performance monitoring. These functions all ASSERT that the
 898  * CPU they're being called on has reached a certain cpuid pass. If the passes
 899  * are rearranged, then this needs to be adjusted.
 900  */
 901 
 902 #include <sys/types.h>
 903 #include <sys/archsystm.h>
 904 #include <sys/x86_archext.h>
 905 #include <sys/kmem.h>
 906 #include <sys/systm.h>
 907 #include <sys/cmn_err.h>
 908 #include <sys/sunddi.h>
 909 #include <sys/sunndi.h>
 910 #include <sys/cpuvar.h>
 911 #include <sys/processor.h>
 912 #include <sys/sysmacros.h>
 913 #include <sys/pg.h>
 914 #include <sys/fp.h>
 915 #include <sys/controlregs.h>
 916 #include <sys/bitmap.h>
 917 #include <sys/auxv_386.h>
 918 #include <sys/memnode.h>
 919 #include <sys/pci_cfgspace.h>
 920 #include <sys/comm_page.h>
 921 #include <sys/mach_mmu.h>
 922 #include <sys/ucode.h>
 923 #include <sys/tsc.h>


 924 
 925 #ifdef __xpv
 926 #include <sys/hypervisor.h>
 927 #else
 928 #include <sys/ontrap.h>
 929 #endif
 930 
 931 uint_t x86_vendor = X86_VENDOR_IntelClone;
 932 uint_t x86_type = X86_TYPE_OTHER;
 933 uint_t x86_clflush_size = 0;
 934 
 935 #if defined(__xpv)
 936 int x86_use_pcid = 0;
 937 int x86_use_invpcid = 0;
 938 #else
 939 int x86_use_pcid = -1;
 940 int x86_use_invpcid = -1;
 941 #endif
 942 
 943 uint_t pentiumpro_bug4046376;
 944 
 945 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
 946 
 947 static char *x86_feature_names[NUM_X86_FEATURES] = {
 948         "lgpg",
 949         "tsc",
 950         "msr",
 951         "mtrr",
 952         "pge",
 953         "de",
 954         "cmov",
 955         "mmx",
 956         "mca",
 957         "pae",
 958         "cv8",
 959         "pat",
 960         "sep",
 961         "sse",
 962         "sse2",


2153  * This function points to a function that will flush certain
2154  * micro-architectural state on the processor. This flush is used to mitigate
2155  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2156  * function can point to one of three functions:
2157  *
 2158  * - A noop, which is used either because we are vulnerable but do not have
 2159  *   microcode available to help deal with the issue, or because we aren't
2160  *   vulnerable.
2161  *
2162  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2163  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2164  *   however, it only flushes the MDS related micro-architectural state on the
2165  *   current hyperthread, it does not do anything for the twin.
2166  *
2167  * - x86_md_clear which will flush the MDS related state. This is done when we
2168  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2169  *   (RDCL_NO is set).
2170  */
2171 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2172 
2173 void (*x86_md_clear)(void) = x86_md_clear_noop;
2174 
2175 static void
2176 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2177 {
2178         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2179 
2180         /*
2181          * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2182          * has been fixed in hardware, it doesn't cover everything related to
2183          * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2184          * need to mitigate this.
2185          */
2186         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2187             is_x86_feature(featureset, X86FSET_MDS_NO)) {
2188                 x86_md_clear = x86_md_clear_noop;
2189                 membar_producer();
2190                 return;
2191         }
2192 
2193         if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2194                 x86_md_clear = x86_md_clear_verw;
2195         }
2196 
2197         membar_producer();
2198 }
2199 
2200 static void
2201 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2202 {
2203         boolean_t need_l1d, need_mds;
2204         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2205 
2206         /*
2207          * If we're not on Intel or we've mitigated both RDCL and MDS in
2208          * hardware, then there's nothing left for us to do for enabling the
2209          * flush. We can also go ahead and say that SMT exclusion is
2210          * unnecessary.
2211          */
2212         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2213             (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2214             is_x86_feature(featureset, X86FSET_MDS_NO))) {


2238         if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2239             is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2240                 need_mds = B_TRUE;
2241         } else {
2242                 need_mds = B_FALSE;
2243         }
2244 
2245         if (need_l1d) {
2246                 spec_uarch_flush = spec_uarch_flush_msr;
2247         } else if (need_mds) {
2248                 spec_uarch_flush = x86_md_clear;
2249         } else {
2250                 /*
2251                  * We have no hardware mitigations available to us.
2252                  */
2253                 spec_uarch_flush = spec_uarch_flush_noop;
2254         }
2255         membar_producer();
2256 }
2257 



2258 static void
2259 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2260 {
2261         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;

2262 
2263         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2264             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2265                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2266                         add_x86_feature(featureset, X86FSET_IBPB);
2267                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2268                         add_x86_feature(featureset, X86FSET_IBRS);
2269                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2270                         add_x86_feature(featureset, X86FSET_STIBP);
2271                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)
2272                         add_x86_feature(featureset, X86FSET_IBRS_ALL);
2273                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2274                         add_x86_feature(featureset, X86FSET_STIBP_ALL);
2275                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS)
2276                         add_x86_feature(featureset, X86FSET_RSBA);
2277                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2278                         add_x86_feature(featureset, X86FSET_SSBD);
2279                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2280                         add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2281                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2282                         add_x86_feature(featureset, X86FSET_SSB_NO);
2283         } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2284             cpi->cpi_maxeax >= 7) {
2285                 struct cpuid_regs *ecp;
2286                 ecp = &cpi->cpi_std[7];
2287 
2288                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2289                         add_x86_feature(featureset, X86FSET_MD_CLEAR);
2290                 }
2291 
2292                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2293                         add_x86_feature(featureset, X86FSET_IBRS);
2294                         add_x86_feature(featureset, X86FSET_IBPB);
2295                 }
2296 
2297                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2298                         add_x86_feature(featureset, X86FSET_STIBP);
2299                 }
2300 
2301                 /*
2302                  * Don't read the arch caps MSR on xpv where we lack the


2332                                 if (reg & IA32_ARCH_CAP_SSB_NO) {
2333                                         add_x86_feature(featureset,
2334                                             X86FSET_SSB_NO);
2335                                 }
2336                                 if (reg & IA32_ARCH_CAP_MDS_NO) {
2337                                         add_x86_feature(featureset,
2338                                             X86FSET_MDS_NO);
2339                                 }
2340                         }
2341                         no_trap();
2342                 }
2343 #endif  /* !__xpv */
2344 
2345                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2346                         add_x86_feature(featureset, X86FSET_SSBD);
2347 
2348                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2349                         add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2350         }
2351 
2352         if (cpu->cpu_id != 0)



2353                 return;

2354 
2355         /*
2356          * We need to determine what changes are required for mitigating L1TF
2357          * and MDS. If the CPU suffers from either of them, then SMT exclusion
2358          * is required.
2359          *
2360          * If any of these are present, then we need to flush u-arch state at
2361          * various points. For MDS, we need to do so whenever we change to a
2362          * lesser privilege level or we are halting the CPU. For L1TF we need to
2363          * flush the L1D cache at VM entry. When we have microcode that handles
2364          * MDS, the L1D flush also clears the other u-arch state that the
2365          * md_clear does.
2366          */
2367 
2368         /*
2369          * Update whether or not we need to be taking explicit action against
2370          * MDS.
2371          */
2372         cpuid_update_md_clear(cpu, featureset);
2373 
2374         /*
2375          * Determine whether SMT exclusion is required and whether or not we


6757                 bzero(&cp, sizeof (cp));
6758                 cp.cp_eax = CPUID_LEAF_EXT_8;
6759                 (void) __cpuid_insn(&cp);
6760                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
6761                 cpi->cpi_extd[8] = cp;
6762         } else {
6763                 /*
6764                  * Nothing to do here. Return an empty set which has already
6765                  * been zeroed for us.
6766                  */
6767                 return;
6768         }
6769         cpuid_scan_security(cpu, fset);
6770 }
6771 
6772 /* ARGSUSED */
6773 static int
6774 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
6775 {
6776         uchar_t *fset;

6777 
6778         fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);




6779         cpuid_pass_ucode(CPU, fset);
6780 
6781         return (0);
6782 }
6783 
6784 /*
6785  * After a microcode update where the version has changed, then we need to
6786  * rescan CPUID. To do this we check every CPU to make sure that they have the
6787  * same microcode. Then we perform a cross call to all such CPUs. It's the
6788  * caller's job to make sure that no one else can end up doing an update while
6789  * this is going on.
6790  *
6791  * We assume that the system is microcode capable if we're called.
6792  */
6793 void
6794 cpuid_post_ucodeadm(void)
6795 {
6796         uint32_t rev;
6797         int i;
6798         struct cpu *cpu;


6801         uchar_t *f0;
6802 
6803         argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
6804 
6805         mutex_enter(&cpu_lock);
6806         cpu = cpu_get(0);
6807         rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
6808         CPUSET_ONLY(cpuset, 0);
6809         for (i = 1; i < max_ncpus; i++) {
6810                 if ((cpu = cpu_get(i)) == NULL)
6811                         continue;
6812 
6813                 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
6814                         panic("post microcode update CPU %d has differing "
6815                             "microcode revision (%u) from CPU 0 (%u)",
6816                             i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
6817                 }
6818                 CPUSET_ADD(cpuset, i);
6819         }
6820 
6821         kpreempt_disable();
6822         xc_sync((xc_arg_t)argdata, 0, 0, CPUSET2BV(cpuset),
6823             cpuid_post_ucodeadm_xc);


6824         kpreempt_enable();
6825 
6826         /*
6827          * OK, now look at each CPU and see if their feature sets are equal.
6828          */
6829         f0 = argdata;
6830         for (i = 1; i < max_ncpus; i++) {
6831                 uchar_t *fset;
6832                 if (!CPU_IN_SET(cpuset, i))
6833                         continue;
6834 
6835                 fset = (uchar_t *)((uintptr_t)argdata +
6836                     sizeof (x86_featureset) * i);
6837 
6838                 if (!compare_x86_featureset(f0, fset)) {
6839                         panic("Post microcode update CPU %d has "
6840                             "differing security feature (%p) set from CPU 0 "
6841                             "(%p), not appending to feature set", i,
6842                             (void *)fset, (void *)f0);
6843                 }


 880  *
 881  * The second means is through the auxiliary vector. The auxiliary vector is a
 882  * series of tagged data that the kernel passes down to a user program when it
 883  * begins executing. This information is used to indicate to programs what
 884  * instruction set extensions are present. For example, information about the
 885  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
 886  * since user programs cannot make use of it. However, things like the AVX
 887  * instruction sets are. Programs use this information to make run-time
 888  * decisions about what features they should use. As an example, the run-time
 889  * link-editor (rtld) can relocate different functions depending on the hardware
 890  * support available.
 891  *
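
        As an illustration of the auxiliary vector path, a userland consumer can
        query the hardware capability words with libc's getisax(3C) and test the
        AV_386_* bits from <sys/auxv_386.h>. The sketch below is illustrative
        only and is not part of this change:

        #include <sys/types.h>
        #include <sys/auxv.h>           /* getisax() */
        #include <sys/auxv_386.h>       /* AV_386_* hardware capability bits */
        #include <stdio.h>

        int
        main(void)
        {
                uint32_t hw[2] = { 0, 0 };

                /* Fetch up to two AT_SUN_HWCAP words the kernel passed down. */
                (void) getisax(hw, 2);

                if (hw[0] & AV_386_AVX)
                        (void) printf("AVX is available\n");

                return (0);
        }

        The run-time link-editor makes an analogous decision when it selects
        hardware-capability-aware objects.
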
 892  * The final form is through a series of accessor functions that all have the
 893  * form cpuid_get*. This is used by a number of different subsystems in the
 894  * kernel to determine more detailed information about what we're running on,
 895  * topology information, etc. Some of these subsystems include processor groups
 896  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
 897  * microcode, and performance monitoring. These functions all ASSERT that the
 898  * CPU they're being called on has reached a certain cpuid pass. If the passes
 899  * are rearranged, then this needs to be adjusted.
 900  *
 901  * -----------------------------------------------
 902  * Speculative Execution CPU Side Channel Security
 903  * -----------------------------------------------
 904  *
 905  * With the advent of the Spectre and Meltdown attacks which exploit speculative
 906  * execution in the CPU to create side channels there have been a number of
 907  * different attacks and corresponding issues that the operating system needs to
 908  * mitigate. The following is a common, though not exhaustive, list of issues
 909  * that we know about and for which we have either already done some work in
 910  * the system to mitigate, or still need to do more:
 911  *
 912  *   - Spectre v1
 913  *   - Spectre v2
 914  *   - Meltdown (Spectre v3)
 915  *   - Rogue Register Read (Spectre v3a)
 916  *   - Speculative Store Bypass (Spectre v4)
 917  *   - ret2spec, SpectreRSB
 918  *   - L1 Terminal Fault (L1TF)
 919  *   - Microarchitectural Data Sampling (MDS)
 920  *
 921  * Each of these requires different sets of mitigations and has different attack
 922  * surfaces. For the most part, this discussion is about protecting the kernel
 923  * from non-kernel executing environments such as user processes and hardware
 924  * virtual machines. Unfortunately, there are a number of user vs. user
 925  * scenarios that exist with these. The rest of this section will describe the
 926  * overall approach that the system has taken to address these as well as their
 927  * shortcomings. Note that not all of the above have been handled today.
 928  *
 929  * SPECTRE FAMILY (Spectre v2, ret2spec, SpectreRSB)
 930  *
 931  * The second variant of the spectre attack focuses on performing branch target
 932  * injection. This generally impacts indirect call instructions in the system.
 933  * There are three different ways to mitigate this issue that are commonly
 934  * described today:
 935  *
 936  *  1. Using Indirect Branch Restricted Speculation (IBRS).
 937  *  2. Using Retpolines and RSB Stuffing
 938  *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
 939  *
 940  * IBRS uses a feature added to microcode to restrict speculation, among other
 941  * things. This form of mitigation has not been used as it has been generally
 942  * seen as too expensive and requires reactivation upon various transitions in
 943  * the system.
 944  *
 945  * As a less impactful alternative to IBRS, retpolines were developed by
 946  * Google. These basically require one to replace indirect calls with a specific
 947  * trampoline that will cause speculation to fail and break the attack.
 948  * Retpolines require compiler support. We always build with retpolines in the
 949  * external thunk mode. This means that a traditional indirect call is replaced
 950  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
 951  * of this is that all indirect function calls are performed through a register.
 952  *
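
        For reference, the generic thunk for an indirect call through %rax
        follows the well-known published retpoline sequence. The sketch below
        (GNU as syntax wrapped in a C asm statement) is illustrative only and is
        not the kernel's actual implementation:

        __asm__(
                ".text\n"
                ".globl retpoline_rax_sketch\n"
                "retpoline_rax_sketch:\n"
                "       call    2f\n"           /* push address of the mov below */
                "1:     pause\n"                /* speculation lands here... */
                "       lfence\n"               /* ...and is stopped */
                "       jmp     1b\n"
                "2:     movq    %rax, (%rsp)\n" /* replace pushed return address */
                "       ret\n");                /* architecturally jump to *%rax */

        The call pushes a return address that the ret then redirects to the
        target held in %rax, while any speculative return prediction is captured
        by the pause/lfence loop.
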
 953  * We have to use a common external location of the thunk and not inline it into
 954  * the callsite so that we have a single place to patch these functions.
 955  * As it turns out, we actually have three different forms of retpolines that
 956  * exist in the system:
 957  *
 958  *  1. A full retpoline
 959  *  2. An AMD-specific optimized retpoline
 960  *  3. A no-op version
 961  *
 962  * The first one is used in the general case. The second one is used if we can
 963  * determine that we're on an AMD system and we can successfully toggle the
 964  * lfence serializing MSR that exists on the platform. Basically with this
 965  * present, an lfence is sufficient and we don't need to do anywhere near as
 966  * complicated a dance to successfully use retpolines.
 967  *
 968  * The third form described above is the most curious. It turns out that the way
 969  * that retpolines are implemented is that they rely on how speculation is
 970  * performed on a 'ret' instruction. Intel has continued to optimize this
 971  * process (which is partly why we need to have return stack buffer stuffing,
 972  * but more on that in a bit) and in processors starting with Cascade Lake
 973  * on the server side, it's dangerous to rely on retpolines. Instead, a new
 974  * mechanism has been introduced called Enhanced IBRS (EIBRS).
 975  *
 976  * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
 977  * physical core. However, if this is the case, we don't want to use retpolines
 978  * any more. Therefore if EIBRS is present, we end up turning each retpoline
 979  * function (called a thunk) into a jmp instruction. This means that we're still
 980  * paying the cost of an extra jump to the external thunk, but it gives us
 981  * flexibility and the ability to have a single kernel image that works across a
 982  * wide variety of systems and hardware features.
 983  *
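
        To round out the earlier sketch, the AMD-optimized form and the jmp form
        used under enhanced IBRS (or when mitigation is disabled) reduce to the
        following. Again, this is illustrative and not the kernel's actual
        assembly:

        __asm__(
                ".text\n"
                ".globl retpoline_amd_rax_sketch\n"
                "retpoline_amd_rax_sketch:\n"
                "       lfence\n"               /* dispatch serializing on these AMD parts */
                "       jmpq    *%rax\n"
                ".globl retpoline_jmp_rax_sketch\n"
                "retpoline_jmp_rax_sketch:\n"
                "       jmpq    *%rax\n");      /* plain indirect jump; EIBRS does the work */
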
 984  * Unfortunately, this alone is insufficient. First, Skylake systems have
 985  * additional speculation for the Return Stack Buffer (RSB), which is used to
 986  * predict returns from call instructions and which retpolines rely on. However,
 987  * this problem is not just limited to Skylake and is actually more pernicious.
 988  * The SpectreRSB paper introduces several more problems that can arise with
 989  * dealing with this. The RSB can be poisoned just like the indirect branch
 990  * predictor. This means that one needs to clear the RSB when transitioning
 991  * between two different privilege domains. Some examples include:
 992  *
 993  *  - Switching between two different user processes
 994  *  - Going between user land and the kernel
 995  *  - Returning to the kernel from a hardware virtual machine
 996  *
 997  * Mitigating this involves combining a couple of different things. The first is
 998  * SMEP (supervisor mode execution protection) which was introduced in Ivy
 999  * Bridge. When an RSB entry refers to a user address and we're executing in the
1000  * kernel, speculation through it will be stopped when SMEP is enabled. This
1001  * protects against a number of the different cases that we would normally be
1002  * worried about such as when we enter the kernel from user land.
1003  *
1004  * To prevent against additional manipulation of the RSB from other contexts
1005  * such as a non-root VMX context attacking the kernel we first look to enhanced
1006  * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1007  * need to do to protect the kernel at this time.
1008  *
1009  * On CPUs without EIBRS we need to manually overwrite the contents of the
1010  * return stack buffer. We do this through the x86_rsb_stuff() function.
1011  * Currently this is employed on context switch. The x86_rsb_stuff() function is
1012  * disabled when enhanced IBRS is present because Intel claims on such systems
1013  * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1014  * to user attacks via the RSB.
1015  *
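
        The stuffing operation overwrites the RSB with benign entries that land
        in a speculation trap. A simplified sketch of the idea follows, assuming
        a 16-entry RSB; the kernel's actual x86_rsb_stuff lives in assembly and
        is what gets patched by this change:

        __asm__(
                ".text\n"
                ".globl rsb_stuff_sketch\n"
                "rsb_stuff_sketch:\n"
                ".rept  16\n"
                "       call    1f\n"           /* add one RSB entry... */
                "       pause\n"                /* ...pointing at a speculation trap */
                "       lfence\n"
                "1:\n"
                ".endr\n"
                "       addq    $(16 * 8), %rsp\n"      /* drop the 16 pushed return addresses */
                "       ret\n");
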
1016  * If SMEP is not present, then we would have to stuff the RSB every time we
1017  * transitioned from user mode to the kernel, which isn't very practical right
1018  * now.
1019  *
 1020  * To fully protect against user to user and vmx to vmx attacks from these
 1021  * classes of issues, we would also need to allow them to opt into an Indirect
1022  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1023  *
1024  * By default, the system will enable RSB stuffing and the required variant of
1025  * retpolines and store that information in the x86_spectrev2_mitigation value.
1026  * This will be evaluated after a microcode update as well, though it is
1027  * expected that microcode updates will not take away features. This may mean
1028  * that a late loaded microcode may not end up in the optimal configuration
1029  * (though this should be rare).
1030  *
1031  * Currently we do not build kmdb with retpolines or perform any additional side
1032  * channel security mitigations for it. One complication with kmdb is that it
1033  * requires its own retpoline thunks and it would need to adjust itself based on
1034  * what the kernel does. The threat model of kmdb is more limited and therefore
1035  * it may make more sense to investigate using prediction barriers as the whole
1036  * system is only executing a single instruction at a time while in kmdb.
1037  *
1038  * SPECTRE FAMILY (v1, v4)
1039  *
1040  * The v1 and v4 variants of spectre are not currently mitigated in the
1041  * system and require other classes of changes to occur in the code.
1042  *
1043  * MELTDOWN
1044  *
1045  * Meltdown, or spectre v3, allowed a user process to read any data in their
1046  * address space regardless of whether or not the page tables in question
1047  * allowed the user to have the ability to read them. The solution to meltdown
1048  * is kernel page table isolation. In this world, there are two page tables that
1049  * are used for a process, one in user land and one in the kernel. To implement
1050  * this we use per-CPU page tables and switch between the user and kernel
1051  * variants when entering and exiting the kernel.  For more information about
1052  * this process and how the trampolines work, please see the big theory
1053  * statements and additional comments in:
1054  *
1055  *  - uts/i86pc/ml/kpti_trampolines.s
1056  *  - uts/i86pc/vm/hat_i86.c
1057  *
1058  * While Meltdown only impacted Intel systems and there are also Intel systems
1059  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1060  * kernel page table isolation enabled. While this may at first seem weird, an
1061  * important thing to remember is that you can't speculatively read an address
1062  * if it's never in your page table at all. Having user processes without kernel
1063  * pages present provides us with an important layer of defense in the kernel
1064  * against any other side channel attacks that exist and have yet to be
1065  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1066  * default, no matter the x86 system.
1067  *
1068  * L1 TERMINAL FAULT
1069  *
1070  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1071  * execution uses page table entries. Effectively, it is two different problems.
1072  * The first is that it ignores the not present bit in the page table entries
1073  * when performing speculative execution. This means that something can
1074  * speculatively read the listed physical address if it's present in the L1
1075  * cache under certain conditions (see Intel's documentation for the full set of
1076  * conditions). Secondly, this can be used to bypass hardware virtualization
1077  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1078  * instructions.
1079  *
1080  * For the non-hardware virtualized case, this is relatively easy to deal with.
1081  * We must make sure that all unmapped pages have an address of zero. This means
1082  * that they could read the first 4k of physical memory; however, we never use
1083  * that first page in the operating system and always skip putting it in our
1084  * memory map, even if firmware tells us we can use it in our memory map. While
1085  * other systems try to put extra metadata in the address and reserved bits,
1086  * which led to this being problematic in those cases, we do not.
1087  *
1088  * For hardware virtual machines things are more complicated. Because they can
1089  * construct their own page tables, it isn't hard for them to perform this
1090  * attack against any physical address. The one wrinkle is that this physical
1091  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1092  * to flush the L1 data cache. We wrap this up in the function
1093  * spec_uarch_flush(). This function is also used in the mitigation of
1094  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1095  * hypervisors such as KVM or bhyve are responsible for performing this before
1096  * entering the guest.
1097  *
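
        The MSR in question is IA32_FLUSH_CMD; writing its L1D_FLUSH bit writes
        back and invalidates the L1 data cache. A minimal sketch of the MSR-based
        flush, with locally defined constants standing in for whatever names the
        kernel headers actually use:

        #include <sys/types.h>
        #include <sys/x86_archext.h>    /* the kernel's wrmsr(), as used elsewhere here */

        #define SKETCH_MSR_IA32_FLUSH_CMD       0x10b   /* IA32_FLUSH_CMD */
        #define SKETCH_IA32_FLUSH_CMD_L1D       0x1     /* bit 0: flush the L1D */

        static void
        spec_uarch_flush_msr_sketch(void)
        {
                wrmsr(SKETCH_MSR_IA32_FLUSH_CMD, SKETCH_IA32_FLUSH_CMD_L1D);
        }

        When MD_CLEAR-capable microcode is loaded, this same flush also clears
        the MDS-related buffers on the issuing thread, which is why the MSR-based
        flush can stand in for both mitigations.
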
1098  * Because this attack takes place in the L1 cache, there's another wrinkle
1099  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1100  * designs. This means that when a thread enters a hardware virtualized context
1101  * and flushes the L1 data cache, the other thread on the processor may then go
1102  * ahead and put new data in it that can be potentially attacked. While one
 1103  * solution is to disable SMT on the system, another available option is to use
 1104  * a hardware virtualization feature called 'SMT exclusion'. This feature
 1105  * ensures that if an HVM is being scheduled on one thread, then whatever is
 1106  * running on the other thread belongs to the same hardware virtual machine.
1107  * If an interrupt comes in or the guest exits to the broader system, then the
1108  * other SMT thread will be kicked out.
1109  *
1110  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1111  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1112  * perform L1TF related mitigations.
1113  *
1114  * MICROARCHITECTURAL DATA SAMPLING
1115  *
1116  * Microarchitectural data sampling (MDS) is a combination of four discrete
1117  * vulnerabilities that are similar issues affecting various parts of the CPU's
1118  * microarchitectural implementation around load, store, and fill buffers.
1119  * Specifically it is made up of the following subcomponents:
1120  *
1121  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1122  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1123  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1124  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1125  *
1126  * To begin addressing these, Intel has introduced another feature in microcode
1127  * called MD_CLEAR. This changes the verw instruction to operate in a different
1128  * way. This allows us to execute the verw instruction in a particular way to
1129  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1130  * updated when this microcode is present to flush this state.
1131  *
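
        With that microcode loaded, the flush is a verw instruction whose memory
        operand holds a valid selector; %ds is the conventional choice. A hedged
        sketch of the verw-based clear follows; the kernel's x86_md_clear is a
        separate routine whose first byte this change hot-patches, as seen later:

        #include <sys/types.h>

        static void
        md_clear_verw_sketch(void)
        {
                uint16_t sel;

                /* verw takes a selector operand; use the current %ds. */
                __asm__ volatile("movw %%ds, %0" : "=m" (sel));
                __asm__ volatile("verw %0" : : "m" (sel) : "cc", "memory");
        }
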
1132  * Primarily we need to flush this state whenever we transition from the kernel
1133  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1134  * little bit different. Here the structures are statically sized when a logical
1135  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
 1136  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1137  * mwait, or another ACPI method. To perform these flushes, we call
1138  * x86_md_clear() at all of these transition points.
1139  *
1140  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1141  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1142  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1143  * a no-op.
1144  *
1145  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1146  * particular, everything we've discussed above is only valid for a single
1147  * thread executing on a core. In the case where you have hyper-threading
1148  * present, this attack can be performed between threads. The theoretical fix
1149  * for this is to ensure that both threads are always in the same security
1150  * domain. This means that they are executing in the same ring and mutually
1151  * trust each other. Practically speaking, this would mean that a system call
1152  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1153  * Rather than implement this, we recommend that one disables hyper-threading
1154  * through the use of psradm -aS.
1155  *
1156  * SUMMARY
1157  *
1158  * The following table attempts to summarize the mitigations for various issues
1159  * and what's done in various places:
1160  *
1161  *  - Spectre v1: Not currently mitigated
1162  *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1163  *  - Meltdown: Kernel Page Table Isolation
1164  *  - Spectre v3a: Updated CPU microcode
1165  *  - Spectre v4: Not currently mitigated
1166  *  - SpectreRSB: SMEP and RSB Stuffing
1167  *  - L1TF: spec_uarch_flush, smt exclusion, requires microcode
1168  *  - MDS: x86_md_clear, requires microcode, disabling hyper threading
1169  *
1170  * The following table indicates the x86 feature set bits that indicate that a
1171  * given problem has been solved or a notable feature is present:
1172  *
1173  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1174  *  - MDS_NO: All forms of MDS
1175  */
1176 
1177 #include <sys/types.h>
1178 #include <sys/archsystm.h>
1179 #include <sys/x86_archext.h>
1180 #include <sys/kmem.h>
1181 #include <sys/systm.h>
1182 #include <sys/cmn_err.h>
1183 #include <sys/sunddi.h>
1184 #include <sys/sunndi.h>
1185 #include <sys/cpuvar.h>
1186 #include <sys/processor.h>
1187 #include <sys/sysmacros.h>
1188 #include <sys/pg.h>
1189 #include <sys/fp.h>
1190 #include <sys/controlregs.h>
1191 #include <sys/bitmap.h>
1192 #include <sys/auxv_386.h>
1193 #include <sys/memnode.h>
1194 #include <sys/pci_cfgspace.h>
1195 #include <sys/comm_page.h>
1196 #include <sys/mach_mmu.h>
1197 #include <sys/ucode.h>
1198 #include <sys/tsc.h>
1199 #include <sys/kobj.h>
1200 #include <sys/asm_misc.h>
1201 
1202 #ifdef __xpv
1203 #include <sys/hypervisor.h>
1204 #else
1205 #include <sys/ontrap.h>
1206 #endif
1207 
1208 uint_t x86_vendor = X86_VENDOR_IntelClone;
1209 uint_t x86_type = X86_TYPE_OTHER;
1210 uint_t x86_clflush_size = 0;
1211 
1212 #if defined(__xpv)
1213 int x86_use_pcid = 0;
1214 int x86_use_invpcid = 0;
1215 #else
1216 int x86_use_pcid = -1;
1217 int x86_use_invpcid = -1;
1218 #endif
1219 
1220 typedef enum {
1221         X86_SPECTREV2_RETPOLINE,
1222         X86_SPECTREV2_RETPOLINE_AMD,
1223         X86_SPECTREV2_ENHANCED_IBRS,
1224         X86_SPECTREV2_DISABLED
1225 } x86_spectrev2_mitigation_t;
1226 
1227 uint_t x86_disable_spectrev2 = 0;
1228 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1229     X86_SPECTREV2_RETPOLINE;
1230 
1231 uint_t pentiumpro_bug4046376;
1232 
1233 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1234 
1235 static char *x86_feature_names[NUM_X86_FEATURES] = {
1236         "lgpg",
1237         "tsc",
1238         "msr",
1239         "mtrr",
1240         "pge",
1241         "de",
1242         "cmov",
1243         "mmx",
1244         "mca",
1245         "pae",
1246         "cv8",
1247         "pat",
1248         "sep",
1249         "sse",
1250         "sse2",


2441  * This function points to a function that will flush certain
2442  * micro-architectural state on the processor. This flush is used to mitigate
2443  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2444  * function can point to one of three functions:
2445  *
 2446  * - A noop, which is used either because we are vulnerable but do not have
 2447  *   microcode available to help deal with the issue, or because we aren't
2448  *   vulnerable.
2449  *
2450  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2451  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2452  *   however, it only flushes the MDS related micro-architectural state on the
2453  *   current hyperthread, it does not do anything for the twin.
2454  *
2455  * - x86_md_clear which will flush the MDS related state. This is done when we
2456  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2457  *   (RDCL_NO is set).
2458  */
2459 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2460 


2461 static void
2462 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2463 {
2464         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2465 
2466         /*
2467          * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2468          * has been fixed in hardware, it doesn't cover everything related to
2469          * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2470          * need to mitigate this.
2471          */
2472         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2473             is_x86_feature(featureset, X86FSET_MDS_NO)) {
2474                 return;
2475         }
2476 
2477         if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2478                 const uint8_t nop = NOP_INSTR;
2479                 uint8_t *md = (uint8_t *)x86_md_clear;
2480 
2481                 *md = nop;
2482         }
2483 
2484         membar_producer();
2485 }
2486 
2487 static void
2488 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2489 {
2490         boolean_t need_l1d, need_mds;
2491         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2492 
2493         /*
2494          * If we're not on Intel or we've mitigated both RDCL and MDS in
2495          * hardware, then there's nothing left for us to do for enabling the
2496          * flush. We can also go ahead and say that SMT exclusion is
2497          * unnecessary.
2498          */
2499         if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2500             (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2501             is_x86_feature(featureset, X86FSET_MDS_NO))) {


2525         if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2526             is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2527                 need_mds = B_TRUE;
2528         } else {
2529                 need_mds = B_FALSE;
2530         }
2531 
2532         if (need_l1d) {
2533                 spec_uarch_flush = spec_uarch_flush_msr;
2534         } else if (need_mds) {
2535                 spec_uarch_flush = x86_md_clear;
2536         } else {
2537                 /*
2538                  * We have no hardware mitigations available to us.
2539                  */
2540                 spec_uarch_flush = spec_uarch_flush_noop;
2541         }
2542         membar_producer();
2543 }
2544 
2545 /*
2546  * We default to enabling RSB mitigations.
2547  */
2548 static void
2549 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2550 {
2551         const uint8_t ret = RET_INSTR;
2552         uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2553 
2554         switch (mit) {
2555         case X86_SPECTREV2_ENHANCED_IBRS:
2556         case X86_SPECTREV2_DISABLED:
2557                 *stuff = ret;
2558                 break;
2559         default:
2560                 break;
2561         }
2562 }
2563 
2564 static void
2565 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2566 {
2567         const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2568             "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2569             "_r14", "_r15" };
2570         const uint_t nthunks = ARRAY_SIZE(thunks);
2571         const char *type;
2572         uint_t i;
2573 
2574         if (mit == x86_spectrev2_mitigation)
2575                 return;
2576 
2577         switch (mit) {
2578         case X86_SPECTREV2_RETPOLINE:
2579                 type = "gen";
2580                 break;
2581         case X86_SPECTREV2_RETPOLINE_AMD:
2582                 type = "amd";
2583                 break;
2584         case X86_SPECTREV2_ENHANCED_IBRS:
2585         case X86_SPECTREV2_DISABLED:
2586                 type = "jmp";
2587                 break;
2588         default:
 2589                 panic("asked to update retpoline state with unknown state!");
2590         }
2591 
2592         for (i = 0; i < nthunks; i++) {
2593                 uintptr_t source, dest;
2594                 int ssize, dsize;
2595                 char sourcebuf[64], destbuf[64];
2596                 size_t len;
2597 
2598                 (void) snprintf(destbuf, sizeof (destbuf),
2599                     "__x86_indirect_thunk%s", thunks[i]);
2600                 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2601                     "__x86_indirect_thunk_%s%s", type, thunks[i]);
2602 
2603                 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2604                 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2605                 VERIFY3U(source, !=, 0);
2606                 VERIFY3U(dest, !=, 0);
2607                 VERIFY3S(dsize, >=, ssize);
2608                 bcopy((void *)source, (void *)dest, ssize);
2609         }
2610 }
2611 
2612 static void
2613 cpuid_enable_enhanced_ibrs(void)
2614 {
2615         uint64_t val;
2616 
2617         val = rdmsr(MSR_IA32_SPEC_CTRL);
2618         val |= IA32_SPEC_CTRL_IBRS;
2619         wrmsr(MSR_IA32_SPEC_CTRL, val);
2620 }
2621 
2622 #ifndef __xpv
2623 /*
2624  * Determine whether or not we can use the AMD optimized retpoline
2625  * functionality. We use this when we know we're on an AMD system and we can
2626  * successfully verify that lfence is dispatch serializing.
2627  */
2628 static boolean_t
2629 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2630 {
2631         uint64_t val;
2632         on_trap_data_t otd;
2633 
2634         if (cpi->cpi_vendor != X86_VENDOR_AMD)
2635                 return (B_FALSE);
2636 
2637         /*
2638          * We need to determine whether or not lfence is serializing. It always
2639          * is on families 0xf and 0x11. On others, it's controlled by
2640          * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2641          * crazy old family, don't try and do anything.
2642          */
2643         if (cpi->cpi_family < 0xf)
2644                 return (B_FALSE);
2645         if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2646                 return (B_TRUE);
2647 
2648         /*
2649          * While it may be tempting to use get_hwenv(), there are no promises
2650          * that a hypervisor will actually declare themselves to be so in a
2651          * friendly way. As such, try to read and set the MSR. If we can then
2652          * read back the value we set (it wasn't just set to zero), then we go
2653          * for it.
2654          */
2655         if (!on_trap(&otd, OT_DATA_ACCESS)) {
2656                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2657                 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2658                 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2659                 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2660         } else {
2661                 val = 0;
2662         }
2663         no_trap();
2664 
2665         if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2666                 return (B_TRUE);
2667         return (B_FALSE);
2668 }
2669 #endif  /* !__xpv */
2670 
2671 static void
2672 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2673 {
2674         struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2675         x86_spectrev2_mitigation_t v2mit;
2676 
2677         if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2678             cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2679                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2680                         add_x86_feature(featureset, X86FSET_IBPB);
2681                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2682                         add_x86_feature(featureset, X86FSET_IBRS);
2683                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2684                         add_x86_feature(featureset, X86FSET_STIBP);


2685                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2686                         add_x86_feature(featureset, X86FSET_STIBP_ALL);


2687                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2688                         add_x86_feature(featureset, X86FSET_SSBD);
2689                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2690                         add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2691                 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2692                         add_x86_feature(featureset, X86FSET_SSB_NO);
2693                 /*
2694                  * Don't enable enhanced IBRS unless we're told that we should
2695                  * prefer it and it has the same semantics as Intel. This is
2696                  * split into two bits rather than a single one.
2697                  */
2698                 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2699                     (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2700                         add_x86_feature(featureset, X86FSET_IBRS_ALL);
2701                 }
2702 
2703         } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2704             cpi->cpi_maxeax >= 7) {
2705                 struct cpuid_regs *ecp;
2706                 ecp = &cpi->cpi_std[7];
2707 
2708                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2709                         add_x86_feature(featureset, X86FSET_MD_CLEAR);
2710                 }
2711 
2712                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2713                         add_x86_feature(featureset, X86FSET_IBRS);
2714                         add_x86_feature(featureset, X86FSET_IBPB);
2715                 }
2716 
2717                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2718                         add_x86_feature(featureset, X86FSET_STIBP);
2719                 }
2720 
2721                 /*
2722                  * Don't read the arch caps MSR on xpv where we lack the


2752                                 if (reg & IA32_ARCH_CAP_SSB_NO) {
2753                                         add_x86_feature(featureset,
2754                                             X86FSET_SSB_NO);
2755                                 }
2756                                 if (reg & IA32_ARCH_CAP_MDS_NO) {
2757                                         add_x86_feature(featureset,
2758                                             X86FSET_MDS_NO);
2759                                 }
2760                         }
2761                         no_trap();
2762                 }
2763 #endif  /* !__xpv */
2764 
2765                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2766                         add_x86_feature(featureset, X86FSET_SSBD);
2767 
2768                 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2769                         add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2770         }
2771 
2772         if (cpu->cpu_id != 0) {
2773                 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2774                         cpuid_enable_enhanced_ibrs();
2775                 }
2776                 return;
2777         }
2778 
2779         /*
2780          * Go through and initialize various security mechanisms that we should
2781          * only do on a single CPU. This includes Spectre V2, L1TF, and MDS.
2782          */
2783 
2784         /*
2785          * By default we've come in with retpolines enabled. Check whether we
2786          * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2787          * by default, but disabled if we are using enhanced IBRS.
2788          */
2789         if (x86_disable_spectrev2 != 0) {
2790                 v2mit = X86_SPECTREV2_DISABLED;
2791         } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2792                 cpuid_enable_enhanced_ibrs();
2793                 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2794 #ifndef __xpv
2795         } else if (cpuid_use_amd_retpoline(cpi)) {
2796                 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2797 #endif  /* !__xpv */
2798         } else {
2799                 v2mit = X86_SPECTREV2_RETPOLINE;
2800         }
2801 
2802         cpuid_patch_retpolines(v2mit);
2803         cpuid_patch_rsb(v2mit);
2804         x86_spectrev2_mitigation = v2mit;
2805         membar_producer();
2806 
2807         /*
2808          * We need to determine what changes are required for mitigating L1TF
2809          * and MDS. If the CPU suffers from either of them, then SMT exclusion
2810          * is required.
2811          *
2812          * If any of these are present, then we need to flush u-arch state at
2813          * various points. For MDS, we need to do so whenever we change to a
2814          * lesser privilege level or we are halting the CPU. For L1TF we need to
2815          * flush the L1D cache at VM entry. When we have microcode that handles
2816          * MDS, the L1D flush also clears the other u-arch state that the
2817          * md_clear does.
2818          */
2819 
2820         /*
2821          * Update whether or not we need to be taking explicit action against
2822          * MDS.
2823          */
2824         cpuid_update_md_clear(cpu, featureset);
2825 
2826         /*
2827          * Determine whether SMT exclusion is required and whether or not we


7209                 bzero(&cp, sizeof (cp));
7210                 cp.cp_eax = CPUID_LEAF_EXT_8;
7211                 (void) __cpuid_insn(&cp);
7212                 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7213                 cpi->cpi_extd[8] = cp;
7214         } else {
7215                 /*
7216                  * Nothing to do here. Return an empty set which has already
7217                  * been zeroed for us.
7218                  */
7219                 return;
7220         }
7221         cpuid_scan_security(cpu, fset);
7222 }
7223 
7224 /* ARGSUSED */
7225 static int
7226 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7227 {
7228         uchar_t *fset;
7229         boolean_t first_pass = (boolean_t)arg1;
7230 
7231         fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7232         if (first_pass && CPU->cpu_id != 0)
7233                 return (0);
7234         if (!first_pass && CPU->cpu_id == 0)
7235                 return (0);
7236         cpuid_pass_ucode(CPU, fset);
7237 
7238         return (0);
7239 }
7240 
7241 /*
7242  * After a microcode update where the version has changed, then we need to
7243  * rescan CPUID. To do this we check every CPU to make sure that they have the
7244  * same microcode. Then we perform a cross call to all such CPUs. It's the
7245  * caller's job to make sure that no one else can end up doing an update while
7246  * this is going on.
7247  *
7248  * We assume that the system is microcode capable if we're called.
7249  */
7250 void
7251 cpuid_post_ucodeadm(void)
7252 {
7253         uint32_t rev;
7254         int i;
7255         struct cpu *cpu;


7258         uchar_t *f0;
7259 
7260         argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7261 
7262         mutex_enter(&cpu_lock);
7263         cpu = cpu_get(0);
7264         rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7265         CPUSET_ONLY(cpuset, 0);
7266         for (i = 1; i < max_ncpus; i++) {
7267                 if ((cpu = cpu_get(i)) == NULL)
7268                         continue;
7269 
7270                 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7271                         panic("post microcode update CPU %d has differing "
7272                             "microcode revision (%u) from CPU 0 (%u)",
7273                             i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7274                 }
7275                 CPUSET_ADD(cpuset, i);
7276         }
7277 
7278         /*
7279          * We do the cross calls in two passes. The first pass is only for the
7280          * boot CPU. The second pass is for all of the other CPUs. This allows
7281          * the boot CPU to go through and change behavior related to patching or
7282          * whether or not Enhanced IBRS needs to be enabled and then allow all
7283          * other CPUs to follow suit.
7284          */
7285         kpreempt_disable();
7286         xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7287             cpuid_post_ucodeadm_xc);
7288         xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7289             cpuid_post_ucodeadm_xc);
7290         kpreempt_enable();
7291 
7292         /*
7293          * OK, now look at each CPU and see if their feature sets are equal.
7294          */
7295         f0 = argdata;
7296         for (i = 1; i < max_ncpus; i++) {
7297                 uchar_t *fset;
7298                 if (!CPU_IN_SET(cpuset, i))
7299                         continue;
7300 
7301                 fset = (uchar_t *)((uintptr_t)argdata +
7302                     sizeof (x86_featureset) * i);
7303 
7304                 if (!compare_x86_featureset(f0, fset)) {
7305                         panic("Post microcode update CPU %d has "
7306                             "differing security feature (%p) set from CPU 0 "
7307                             "(%p), not appending to feature set", i,
7308                             (void *)fset, (void *)f0);
7309                 }