880 *
881 * The second means is through the auxiliary vector. The auxiliary vector is a
882 * series of tagged data that the kernel passes down to a user program when it
883 * begins executing. This information is used to indicate to programs what
884 * instruction set extensions are present. For example, information about the
885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 * since user programs cannot make use of it. However, things like the AVX
887 * instruction sets are. Programs use this information to make run-time
888 * decisions about what features they should use. As an example, the run-time
889 * link-editor (rtld) can relocate different functions depending on the hardware
890 * support available.
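 *
 * For instance, a user program can query the hardware capability words that
 * the kernel placed in the aux vector via getisax(3C) and pick an
 * implementation at run time. A minimal user-level sketch (the
 * use_avx_impl() and use_baseline_impl() helpers are hypothetical):
 *
 *	#include <sys/auxv.h>
 *
 *	uint32_t hw[2] = { 0 };
 *	(void) getisax(hw, 2);
 *	if (hw[0] & AV_386_AVX)
 *		use_avx_impl();
 *	else
 *		use_baseline_impl();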
891 *
892 * The final form is through a series of accessor functions that all have the
893 * form cpuid_get*. This is used by a number of different subsystems in the
894 * kernel to determine more detailed information about what we're running on,
895 * topology information, etc. Some of these subsystems include processor groups
896 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 * microcode, and performance monitoring. These functions all ASSERT that the
898 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 * are rearranged, then this needs to be adjusted.
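 *
 * As an example, one of these accessors is roughly of the following shape;
 * the ASSERT is what ties it to the cpuid pass ordering described above:
 *
 *	uint_t
 *	cpuid_getfamily(cpu_t *cpu)
 *	{
 *		ASSERT(cpuid_checkpass(cpu, 1));
 *		return (cpu->cpu_m.mcpu_cpi->cpi_family);
 *	}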
900 *
901 * -----------------------------------------------
902 * Speculative Execution CPU Side Channel Security
903 * -----------------------------------------------
904 *
905 * With the advent of the Spectre and Meltdown attacks which exploit speculative
906 * execution in the CPU to create side channels there have been a number of
907 * different attacks and corresponding issues that the operating system needs to
908 * mitigate against. The following is a common, but not exhaustive, list of
909 * issues that we know about and for which we have either done some work or
910 * still need to do more work in the system to mitigate:
911 *
912 * - Spectre v1
913 * - Spectre v2
914 * - Meltdown (Spectre v3)
915 * - Rogue Register Read (Spectre v3a)
916 * - Speculative Store Bypass (Spectre v4)
917 * - ret2spec, SpectreRSB
918 * - L1 Terminal Fault (L1TF)
919 * - Microarchitectural Data Sampling (MDS)
920 *
921 * Each of these requires different sets of mitigations and has different attack
922 * surfaces. For the most part, this discussion is about protecting the kernel
923 * from non-kernel executing environments such as user processes and hardware
924 * virtual machines. Unfortunately, there are a number of user vs. user
925 * scenarios that exist with these. The rest of this section will describe the
926 * overall approach that the system has taken to address these as well as their
927 * shortcomings. Unfortunately, not all of the above have been handled today.
928 *
929 * SPECTRE FAMILY (Spectre v2, ret2spec, SpectreRSB)
930 *
931 * The second variant of the Spectre attack focuses on performing branch target
932 * injection. This generally impacts indirect call instructions in the system.
933 * There are three different ways to mitigate this issue that are commonly
934 * described today:
935 *
936 * 1. Using Indirect Branch Restricted Speculation (IBRS).
937 * 2. Using Retpolines and RSB Stuffing
938 * 3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
939 *
940 * IBRS uses a feature added to microcode to restrict speculation, among other
941 * things. This form of mitigation has not been used as it has been generally
942 * seen as too expensive and requires reactivation upon various transitions in
943 * the system.
944 *
945 * As a less impactful alternative to IBRS, retpolines were developed by
946 * Google. These basically require one to replace indirect calls with a specific
947 * trampoline that will cause speculation to fail and break the attack.
948 * Retpolines require compiler support. We always build with retpolines in the
949 * external thunk mode. This means that a traditional indirect call is replaced
950 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
951 * of this is that all indirect function calls are performed through a register.
952 *
953 * We have to use a common external location of the thunk and not inline it into
954 * the callsite so that way we can have a single place to patch these functions.
955 * As it turns out, we actually have three different forms of retpolines that
956 * exist in the system:
957 *
958 * 1. A full retpoline
959 * 2. An AMD-specific optimized retpoline
960 * 3. A no-op version
961 *
962 * The first one is used in the general case. The second one is used if we can
963 * determine that we're on an AMD system and we can successfully toggle the
964 * lfence serializing MSR that exists on the platform. Basically with this
965 * present, an lfence is sufficient and we don't need to do anywhere near as
966 * complicated a dance to successfully use retpolines.
967 *
968 * The third form described above is the most curious. It turns out that the way
969 * that retpolines are implemented is that they rely on how speculation is
970 * performed on a 'ret' instruction. Intel has continued to optimize this
971 * process (which is partly why we need to have return stack buffer stuffing,
972 * but more on that in a bit) and in processors starting with Cascade Lake
973 * on the server side, it's dangerous to rely on retpolines. Instead, a new
974 * mechanism has been introduced called Enhanced IBRS (EIBRS).
975 *
976 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
977 * physical core. However, if this is the case, we don't want to use retpolines
978 * any more. Therefore if EIBRS is present, we end up turning each retpoline
979 * function (called a thunk) into a jmp instruction. This means that we're still
980 * paying the cost of an extra jump to the external thunk, but it gives us
981 * flexibility and the ability to have a single kernel image that works across a
982 * wide variety of systems and hardware features.
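 *
 * For illustration, the thunk bodies look roughly like the following
 * AT&T-syntax sketches (not the literal contents of our assembly sources).
 * The full retpoline form for %rax is:
 *
 *	__x86_indirect_thunk_rax:
 *		call	2f
 *	1:	pause
 *		lfence
 *		jmp	1b
 *	2:	movq	%rax, (%rsp)
 *		ret
 *
 * The AMD-optimized form is simply an lfence followed by 'jmp *%rax', and the
 * enhanced IBRS/disabled form degenerates to a bare 'jmp *%rax'.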
983 *
984 * Unfortunately, this alone is insufficient. First, Skylake systems have
985 * additional speculation for the Return Stack Buffer (RSB), which is used to
986 * predict return targets and which retpolines take advantage of. However, this
987 * problem is not limited to Skylake and is actually more pernicious.
988 * The SpectreRSB paper introduces several more problems that can arise with
989 * dealing with this. The RSB can be poisoned just like the indirect branch
990 * predictor. This means that one needs to clear the RSB when transitioning
991 * between two different privilege domains. Some examples include:
992 *
993 * - Switching between two different user processes
994 * - Going between user land and the kernel
995 * - Returning to the kernel from a hardware virtual machine
996 *
997 * Mitigating this involves combining a couple of different things. The first is
998 * SMEP (supervisor mode execution protection) which was introduced in Ivy
999 * Bridge. When an RSB entry refers to a user address and we're executing in the
1000 * kernel, speculation through it will be stopped when SMEP is enabled. This
1001 * protects against a number of the different cases that we would normally be
1002 * worried about such as when we enter the kernel from user land.
1003 *
1004 * To prevent against additional manipulation of the RSB from other contexts
1005 * such as a non-root VMX context attacking the kernel we first look to enhanced
1006 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1007 * need to do to protect the kernel at this time.
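 *
 * Enabling enhanced IBRS amounts to setting a single bit in the speculation
 * control MSR, which is what cpuid_enable_enhanced_ibrs() later in this file
 * does:
 *
 *	wrmsr(MSR_IA32_SPEC_CTRL,
 *	    rdmsr(MSR_IA32_SPEC_CTRL) | IA32_SPEC_CTRL_IBRS);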
1008 *
1009 * On CPUs without EIBRS we need to manually overwrite the contents of the
1010 * return stack buffer. We do this through the x86_rsb_stuff() function.
1011 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1012 * disabled when enhanced IBRS is present because Intel claims on such systems
1013 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1014 * to user attacks via the RSB.
1015 *
1016 * If SMEP is not present, then we would have to stuff the RSB every time we
1017 * transitioned from user mode to the kernel, which isn't very practical right
1018 * now.
1019 *
1020 * To fully protect user to user and vmx to vmx attacks from these classes of
1021 * issues, we would also need to allow them to opt into performing an Indirect
1022 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
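 *
 * If that were wired up, issuing the barrier would amount to a single MSR
 * write on the switch path, along the lines of the sketch below (the constant
 * names follow Intel's documentation and are illustrative only):
 *
 *	wrmsr(MSR_IA32_PRED_CMD, IA32_PRED_CMD_IBPB);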
1023 *
1024 * By default, the system will enable RSB stuffing and the required variant of
1025 * retpolines and store that information in the x86_spectrev2_mitigation value.
1026 * This will be evaluated after a microcode update as well, though it is
1027 * expected that microcode updates will not take away features. This may mean
1028 * that a late loaded microcode may not end up in the optimal configuration
1029 * (though this should be rare).
1030 *
1031 * Currently we do not build kmdb with retpolines or perform any additional side
1032 * channel security mitigations for it. One complication with kmdb is that it
1033 * requires its own retpoline thunks and it would need to adjust itself based on
1034 * what the kernel does. The threat model of kmdb is more limited and therefore
1035 * it may make more sense to investigate using prediction barriers as the whole
1036 * system is only executing a single instruction at a time while in kmdb.
1037 *
1038 * SPECTRE FAMILY (v1, v4)
1039 *
1040 * The v1 and v4 variants of spectre are not currently mitigated in the
1041 * system and require other classes of changes to occur in the code.
1042 *
1043 * MELTDOWN
1044 *
1045 * Meltdown, or Spectre v3, allowed a user process to read any data mapped in
1046 * its address space regardless of whether or not the page tables in question
1047 * allowed the user to read that data. The solution to Meltdown
1048 * is kernel page table isolation. In this world, there are two page tables that
1049 * are used for a process, one in user land and one in the kernel. To implement
1050 * this we use per-CPU page tables and switch between the user and kernel
1051 * variants when entering and exiting the kernel. For more information about
1052 * this process and how the trampolines work, please see the big theory
1053 * statements and additional comments in:
1054 *
1055 * - uts/i86pc/ml/kpti_trampolines.s
1056 * - uts/i86pc/vm/hat_i86.c
1057 *
1058 * While Meltdown only impacted Intel systems and there are also Intel systems
1059 * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1060 * kernel page table isolation enabled. While this may at first seem weird, an
1061 * important thing to remember is that you can't speculatively read an address
1062 * if it's never in your page table at all. Having user processes without kernel
1063 * pages present provides us with an important layer of defense in the kernel
1064 * against any other side channel attacks that exist and have yet to be
1065 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1066 * default, no matter the x86 system.
1067 *
1068 * L1 TERMINAL FAULT
1069 *
1070 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1071 * execution uses page table entries. Effectively, it is two different problems.
1072 * The first is that it ignores the not present bit in the page table entries
1073 * when performing speculative execution. This means that something can
1074 * speculatively read the listed physical address if it's present in the L1
1075 * cache under certain conditions (see Intel's documentation for the full set of
1076 * conditions). Secondly, this can be used to bypass hardware virtualization
1077 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1078 * instructions.
1079 *
1080 * For the non-hardware virtualized case, this is relatively easy to deal with.
1081 * We must make sure that all unmapped pages have an address of zero. This means
1082 * that they could read the first 4k of physical memory; however, we never use
1083 * that first page in the operating system and always skip putting it in our
1084 * memory map, even if firmware tells us we can use it. While other systems
1085 * store extra metadata in the address and reserved bits of such entries, which
1086 * made this problematic for them, we do not.
1087 *
1088 * For hardware virtual machines things are more complicated. Because they can
1089 * construct their own page tables, it isn't hard for them to perform this
1090 * attack against any physical address. The one wrinkle is that this physical
1091 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1092 * to flush the L1 data cache. We wrap this up in the function
1093 * spec_uarch_flush(). This function is also used in the mitigation of
1094 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1095 * hypervisors such as KVM or bhyve are responsible for performing this before
1096 * entering the guest.
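 *
 * The L1D flush portion of spec_uarch_flush() is, at its core, a single write
 * of the L1D flush command to the flush command MSR, roughly (constant names
 * follow Intel's IA32_FLUSH_CMD definition):
 *
 *	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);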
1097 *
1098 * Because this attack takes place in the L1 cache, there's another wrinkle
1099 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1100 * designs. This means that when a thread enters a hardware virtualized context
1101 * and flushes the L1 data cache, the other thread on the processor may then go
1102 * ahead and put new data in it that can be potentially attacked. While one
1103 * solution is to disable SMT on the system, another option that is available is
1104 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1105 * goes through and makes sure that if an HVM is being scheduled on one thread,
1106 * then whatever is running on the other thread is from the same HVM.
1107 * If an interrupt comes in or the guest exits to the broader system, then the
1108 * other SMT thread will be kicked out.
1109 *
1110 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1111 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1112 * perform L1TF related mitigations.
1113 *
1114 * MICROARCHITECTURAL DATA SAMPLING
1115 *
1116 * Microarchitectural data sampling (MDS) is a combination of four discrete,
1117 * but related, vulnerabilities affecting various parts of the CPU's
1118 * microarchitectural implementation around load, store, and fill buffers.
1119 * Specifically it is made up of the following subcomponents:
1120 *
1121 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1122 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1123 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1124 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1125 *
1126 * To begin addressing these, Intel has introduced another feature in microcode
1127 * called MD_CLEAR. This changes the verw instruction so that executing it in a
1128 * particular way flushes the state of the affected parts. The L1TF L1D flush
1129 * mechanism is also updated when this microcode is present so that it flushes
1130 * this state as well.
1131 *
1132 * Primarily we need to flush this state whenever we transition from the kernel
1133 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1134 * little bit different. Here the structures are statically sized when a logical
1135 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1136 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1137 * mwait, or another ACPI method. To perform these flushes, we call
1138 * x86_md_clear() at all of these transition points.
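 *
 * With the updated microcode, the flush itself is a verw instruction executed
 * with a memory operand naming a valid, writable data segment selector,
 * roughly:
 *
 *	subq	$8, %rsp
 *	movw	%ds, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp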
1139 *
1140 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1141 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1142 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1143 * a no-op.
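 *
 * The resulting selection logic lives in cpuid_update_l1d_flush() below and
 * boils down to:
 *
 *	if (need_l1d)
 *		spec_uarch_flush = spec_uarch_flush_msr;
 *	else if (need_mds)
 *		spec_uarch_flush = x86_md_clear;
 *	else
 *		spec_uarch_flush = spec_uarch_flush_noop;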
1144 *
1145 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1146 * particular, everything we've discussed above is only valid for a single
1147 * thread executing on a core. In the case where you have hyper-threading
1148 * present, this attack can be performed between threads. The theoretical fix
1149 * for this is to ensure that both threads are always in the same security
1150 * domain. This means that they are executing in the same ring and mutually
1151 * trust each other. Practically speaking, this would mean that a system call
1152 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1153 * Rather than implement this, we recommend that one disable hyper-threading
1154 * through the use of psradm -aS.
1155 *
1156 * SUMMARY
1157 *
1158 * The following table attempts to summarize the mitigations for various issues
1159 * and what's done in various places:
1160 *
1161 * - Spectre v1: Not currently mitigated
1162 * - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1163 * - Meltdown: Kernel Page Table Isolation
1164 * - Spectre v3a: Updated CPU microcode
1165 * - Spectre v4: Not currently mitigated
1166 * - SpectreRSB: SMEP and RSB Stuffing
1167 * - L1TF: spec_uarch_flush, smt exclusion, requires microcode
1168 * - MDS: x86_md_clear, requires microcode, disabling hyper threading
1169 *
1170 * The following table indicates the x86 feature set bits that indicate that a
1171 * given problem has been solved or a notable feature is present:
1172 *
1173 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1174 * - MDS_NO: All forms of MDS
1175 */
1176
1177 #include <sys/types.h>
1178 #include <sys/archsystm.h>
1179 #include <sys/x86_archext.h>
1180 #include <sys/kmem.h>
1181 #include <sys/systm.h>
1182 #include <sys/cmn_err.h>
1183 #include <sys/sunddi.h>
1184 #include <sys/sunndi.h>
1185 #include <sys/cpuvar.h>
1186 #include <sys/processor.h>
1187 #include <sys/sysmacros.h>
1188 #include <sys/pg.h>
1189 #include <sys/fp.h>
1190 #include <sys/controlregs.h>
1191 #include <sys/bitmap.h>
1192 #include <sys/auxv_386.h>
1193 #include <sys/memnode.h>
1194 #include <sys/pci_cfgspace.h>
1195 #include <sys/comm_page.h>
1196 #include <sys/mach_mmu.h>
1197 #include <sys/ucode.h>
1198 #include <sys/tsc.h>
1199 #include <sys/kobj.h>
1200 #include <sys/asm_misc.h>
1201
1202 #ifdef __xpv
1203 #include <sys/hypervisor.h>
1204 #else
1205 #include <sys/ontrap.h>
1206 #endif
1207
1208 uint_t x86_vendor = X86_VENDOR_IntelClone;
1209 uint_t x86_type = X86_TYPE_OTHER;
1210 uint_t x86_clflush_size = 0;
1211
1212 #if defined(__xpv)
1213 int x86_use_pcid = 0;
1214 int x86_use_invpcid = 0;
1215 #else
1216 int x86_use_pcid = -1;
1217 int x86_use_invpcid = -1;
1218 #endif
1219
1220 typedef enum {
1221 X86_SPECTREV2_RETPOLINE,
1222 X86_SPECTREV2_RETPOLINE_AMD,
1223 X86_SPECTREV2_ENHANCED_IBRS,
1224 X86_SPECTREV2_DISABLED
1225 } x86_spectrev2_mitigation_t;
1226
1227 uint_t x86_disable_spectrev2 = 0;
1228 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1229 X86_SPECTREV2_RETPOLINE;
1230
1231 uint_t pentiumpro_bug4046376;
1232
1233 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1234
1235 static char *x86_feature_names[NUM_X86_FEATURES] = {
1236 "lgpg",
1237 "tsc",
1238 "msr",
1239 "mtrr",
1240 "pge",
1241 "de",
1242 "cmov",
1243 "mmx",
1244 "mca",
1245 "pae",
1246 "cv8",
1247 "pat",
1248 "sep",
1249 "sse",
1250 "sse2",
2441 * This function pointer refers to a function that will flush certain
2442 * micro-architectural state on the processor. This flush is used to mitigate
2443 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2444 * function can point to one of three functions:
2445 *
2446 * - A noop, which is used either because we are vulnerable but do not have
2447 * microcode available to help mitigate the issue, or because we aren't
2448 * vulnerable at all.
2449 *
2450 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2451 * mitigate MDS is present, also perform the equivalent of the MDS flush;
2452 * however, it only flushes the MDS related micro-architectural state on the
2453 * current hyperthread, it does not do anything for the twin.
2454 *
2455 * - x86_md_clear which will flush the MDS related state. This is done when we
2456 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2457 * (RDCL_NO is set).
2458 */
2459 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2460
2461 static void
2462 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2463 {
2464 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2465
2466 /*
2467 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2468 * has been fixed in hardware, it doesn't cover everything related to
2469 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2470 * need to mitigate this.
2471 */
2472 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2473 is_x86_feature(featureset, X86FSET_MDS_NO)) {
2474 return;
2475 }
2476
2477 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2478 const uint8_t nop = NOP_INSTR;
2479 uint8_t *md = (uint8_t *)x86_md_clear;
2480
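		/*
		 * Enable the verw-based flush by patching the first
		 * instruction of x86_md_clear() into a nop so that the flush
		 * sequence that follows it is executed.
		 */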
2481 *md = nop;
2482 }
2483
2484 membar_producer();
2485 }
2486
2487 static void
2488 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2489 {
2490 boolean_t need_l1d, need_mds;
2491 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2492
2493 /*
2494 * If we're not on Intel or we've mitigated both RDCL and MDS in
2495 * hardware, then there's nothing left for us to do for enabling the
2496 * flush. We can also go ahead and say that SMT exclusion is
2497 * unnecessary.
2498 */
2499 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2500 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2501 is_x86_feature(featureset, X86FSET_MDS_NO))) {
2525 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2526 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2527 need_mds = B_TRUE;
2528 } else {
2529 need_mds = B_FALSE;
2530 }
2531
2532 if (need_l1d) {
2533 spec_uarch_flush = spec_uarch_flush_msr;
2534 } else if (need_mds) {
2535 spec_uarch_flush = x86_md_clear;
2536 } else {
2537 /*
2538 * We have no hardware mitigations available to us.
2539 */
2540 spec_uarch_flush = spec_uarch_flush_noop;
2541 }
2542 membar_producer();
2543 }
2544
2545 /*
2546 * We default to enabling RSB mitigations.
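 * When enhanced IBRS is in use, or the Spectre v2 mitigation has been
 * disabled entirely, we turn x86_rsb_stuff() into a no-op by patching its
 * first instruction into a ret.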
2547 */
2548 static void
2549 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2550 {
2551 const uint8_t ret = RET_INSTR;
2552 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2553
2554 switch (mit) {
2555 case X86_SPECTREV2_ENHANCED_IBRS:
2556 case X86_SPECTREV2_DISABLED:
2557 *stuff = ret;
2558 break;
2559 default:
2560 break;
2561 }
2562 }
2563
2564 static void
2565 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2566 {
2567 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2568 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2569 "_r14", "_r15" };
2570 const uint_t nthunks = ARRAY_SIZE(thunks);
2571 const char *type;
2572 uint_t i;
2573
2574 if (mit == x86_spectrev2_mitigation)
2575 return;
2576
2577 switch (mit) {
2578 case X86_SPECTREV2_RETPOLINE:
2579 type = "gen";
2580 break;
2581 case X86_SPECTREV2_RETPOLINE_AMD:
2582 type = "amd";
2583 break;
2584 case X86_SPECTREV2_ENHANCED_IBRS:
2585 case X86_SPECTREV2_DISABLED:
2586 type = "jmp";
2587 break;
2588 default:
2589 panic("asked to update retpoline state with unknown state!");
2590 }
2591
2592 for (i = 0; i < nthunks; i++) {
2593 uintptr_t source, dest;
2594 int ssize, dsize;
2595 char sourcebuf[64], destbuf[64];
2596 size_t len;
2597
2598 (void) snprintf(destbuf, sizeof (destbuf),
2599 "__x86_indirect_thunk%s", thunks[i]);
2600 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2601 "__x86_indirect_thunk_%s%s", type, thunks[i]);
2602
2603 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2604 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2605 VERIFY3U(source, !=, 0);
2606 VERIFY3U(dest, !=, 0);
2607 VERIFY3S(dsize, >=, ssize);
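		/* Copy the chosen thunk variant over the common thunk. */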
2608 bcopy((void *)source, (void *)dest, ssize);
2609 }
2610 }
2611
2612 static void
2613 cpuid_enable_enhanced_ibrs(void)
2614 {
2615 uint64_t val;
2616
2617 val = rdmsr(MSR_IA32_SPEC_CTRL);
2618 val |= IA32_SPEC_CTRL_IBRS;
2619 wrmsr(MSR_IA32_SPEC_CTRL, val);
2620 }
2621
2622 #ifndef __xpv
2623 /*
2624 * Determine whether or not we can use the AMD optimized retpoline
2625 * functionality. We use this when we know we're on an AMD system and we can
2626 * successfully verify that lfence is dispatch serializing.
2627 */
2628 static boolean_t
2629 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2630 {
2631 uint64_t val;
2632 on_trap_data_t otd;
2633
2634 if (cpi->cpi_vendor != X86_VENDOR_AMD)
2635 return (B_FALSE);
2636
2637 /*
2638 * We need to determine whether or not lfence is serializing. It always
2639 * is on families 0xf and 0x11. On others, it's controlled by
2640 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2641 * crazy old family, don't try and do anything.
2642 */
2643 if (cpi->cpi_family < 0xf)
2644 return (B_FALSE);
2645 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2646 return (B_TRUE);
2647
2648 /*
2649 * While it may be tempting to use get_hwenv(), there are no promises
2650 * that a hypervisor will actually declare themselves to be so in a
2651 * friendly way. As such, try to read and set the MSR. If we can then
2652 * read back the value we set (it wasn't just set to zero), then we go
2653 * for it.
2654 */
2655 if (!on_trap(&otd, OT_DATA_ACCESS)) {
2656 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2657 val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2658 wrmsr(MSR_AMD_DECODE_CONFIG, val);
2659 val = rdmsr(MSR_AMD_DECODE_CONFIG);
2660 } else {
2661 val = 0;
2662 }
2663 no_trap();
2664
2665 if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2666 return (B_TRUE);
2667 return (B_FALSE);
2668 }
2669 #endif /* !__xpv */
2670
2671 static void
2672 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2673 {
2674 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2675 x86_spectrev2_mitigation_t v2mit;
2676
2677 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2678 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2679 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2680 add_x86_feature(featureset, X86FSET_IBPB);
2681 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2682 add_x86_feature(featureset, X86FSET_IBRS);
2683 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2684 add_x86_feature(featureset, X86FSET_STIBP);
2685 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2686 add_x86_feature(featureset, X86FSET_STIBP_ALL);
2687 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2688 add_x86_feature(featureset, X86FSET_SSBD);
2689 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2690 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2691 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2692 add_x86_feature(featureset, X86FSET_SSB_NO);
2693 /*
2694 * Don't enable enhanced IBRS unless we're told that we should
2695 * prefer it and it has the same semantics as Intel. This is
2696 * split into two bits rather than a single one.
2697 */
2698 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2699 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2700 add_x86_feature(featureset, X86FSET_IBRS_ALL);
2701 }
2702
2703 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2704 cpi->cpi_maxeax >= 7) {
2705 struct cpuid_regs *ecp;
2706 ecp = &cpi->cpi_std[7];
2707
2708 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2709 add_x86_feature(featureset, X86FSET_MD_CLEAR);
2710 }
2711
2712 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2713 add_x86_feature(featureset, X86FSET_IBRS);
2714 add_x86_feature(featureset, X86FSET_IBPB);
2715 }
2716
2717 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2718 add_x86_feature(featureset, X86FSET_STIBP);
2719 }
2720
2721 /*
2722 * Don't read the arch caps MSR on xpv where we lack the
2752 if (reg & IA32_ARCH_CAP_SSB_NO) {
2753 add_x86_feature(featureset,
2754 X86FSET_SSB_NO);
2755 }
2756 if (reg & IA32_ARCH_CAP_MDS_NO) {
2757 add_x86_feature(featureset,
2758 X86FSET_MDS_NO);
2759 }
2760 }
2761 no_trap();
2762 }
2763 #endif /* !__xpv */
2764
2765 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2766 add_x86_feature(featureset, X86FSET_SSBD);
2767
2768 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2769 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2770 }
2771
2772 if (cpu->cpu_id != 0) {
2773 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2774 cpuid_enable_enhanced_ibrs();
2775 }
2776 return;
2777 }
2778
2779 /*
2780 * Go through and initialize various security mechanisms that we should
2781 * only do on a single CPU. This includes Spectre V2, L1TF, and MDS.
2782 */
2783
2784 /*
2785 * By default we've come in with retpolines enabled. Check whether we
2786 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2787 * by default, but disabled if we are using enhanced IBRS.
2788 */
2789 if (x86_disable_spectrev2 != 0) {
2790 v2mit = X86_SPECTREV2_DISABLED;
2791 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2792 cpuid_enable_enhanced_ibrs();
2793 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2794 #ifndef __xpv
2795 } else if (cpuid_use_amd_retpoline(cpi)) {
2796 v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2797 #endif /* !__xpv */
2798 } else {
2799 v2mit = X86_SPECTREV2_RETPOLINE;
2800 }
2801
2802 cpuid_patch_retpolines(v2mit);
2803 cpuid_patch_rsb(v2mit);
2804 x86_spectrev2_mitigation = v2mit;
2805 membar_producer();
2806
2807 /*
2808 * We need to determine what changes are required for mitigating L1TF
2809 * and MDS. If the CPU suffers from either of them, then SMT exclusion
2810 * is required.
2811 *
2812 * If any of these are present, then we need to flush u-arch state at
2813 * various points. For MDS, we need to do so whenever we change to a
2814 * lesser privilege level or we are halting the CPU. For L1TF we need to
2815 * flush the L1D cache at VM entry. When we have microcode that handles
2816 * MDS, the L1D flush also clears the other u-arch state that the
2817 * md_clear does.
2818 */
2819
2820 /*
2821 * Update whether or not we need to be taking explicit action against
2822 * MDS.
2823 */
2824 cpuid_update_md_clear(cpu, featureset);
2825
2826 /*
2827 * Determine whether SMT exclusion is required and whether or not we
7209 bzero(&cp, sizeof (cp));
7210 cp.cp_eax = CPUID_LEAF_EXT_8;
7211 (void) __cpuid_insn(&cp);
7212 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7213 cpi->cpi_extd[8] = cp;
7214 } else {
7215 /*
7216 * Nothing to do here. Return an empty set which has already
7217 * been zeroed for us.
7218 */
7219 return;
7220 }
7221 cpuid_scan_security(cpu, fset);
7222 }
7223
7224 /* ARGSUSED */
7225 static int
7226 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7227 {
7228 uchar_t *fset;
7229 boolean_t first_pass = (boolean_t)arg1;
7230
7231 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
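	/*
	 * The boot CPU rescans on the first pass; all other CPUs rescan on
	 * the second pass (see cpuid_post_ucodeadm()).
	 */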
7232 if (first_pass && CPU->cpu_id != 0)
7233 return (0);
7234 if (!first_pass && CPU->cpu_id == 0)
7235 return (0);
7236 cpuid_pass_ucode(CPU, fset);
7237
7238 return (0);
7239 }
7240
7241 /*
7242 * After a microcode update where the version has changed, we need to
7243 * rescan CPUID. To do this we check every CPU to make sure that they have the
7244 * same microcode. Then we perform a cross call to all such CPUs. It's the
7245 * caller's job to make sure that no one else can end up doing an update while
7246 * this is going on.
7247 *
7248 * We assume that the system is microcode capable if we're called.
7249 */
7250 void
7251 cpuid_post_ucodeadm(void)
7252 {
7253 uint32_t rev;
7254 int i;
7255 struct cpu *cpu;
7258 uchar_t *f0;
7259
7260 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7261
7262 mutex_enter(&cpu_lock);
7263 cpu = cpu_get(0);
7264 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7265 CPUSET_ONLY(cpuset, 0);
7266 for (i = 1; i < max_ncpus; i++) {
7267 if ((cpu = cpu_get(i)) == NULL)
7268 continue;
7269
7270 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7271 panic("post microcode update CPU %d has differing "
7272 "microcode revision (%u) from CPU 0 (%u)",
7273 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7274 }
7275 CPUSET_ADD(cpuset, i);
7276 }
7277
7278 /*
7279 * We do the cross calls in two passes. The first pass is only for the
7280 * boot CPU. The second pass is for all of the other CPUs. This allows
7281 * the boot CPU to go through and change behavior related to patching or
7282 * whether or not Enhanced IBRS needs to be enabled and then allow all
7283 * other CPUs to follow suit.
7284 */
7285 kpreempt_disable();
7286 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7287 cpuid_post_ucodeadm_xc);
7288 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7289 cpuid_post_ucodeadm_xc);
7290 kpreempt_enable();
7291
7292 /*
7293 * OK, now look at each CPU and see if their feature sets are equal.
7294 */
7295 f0 = argdata;
7296 for (i = 1; i < max_ncpus; i++) {
7297 uchar_t *fset;
7298 if (!CPU_IN_SET(cpuset, i))
7299 continue;
7300
7301 fset = (uchar_t *)((uintptr_t)argdata +
7302 sizeof (x86_featureset) * i);
7303
7304 if (!compare_x86_featureset(f0, fset)) {
7305 panic("Post microcode update CPU %d has "
7306 "differing security feature (%p) set from CPU 0 "
7307 "(%p), not appending to feature set", i,
7308 (void *)fset, (void *)f0);
7309 }