author | Mike Pagano <mpagano@gentoo.org> | 2019-05-14 16:55:01 -0400 |
---|---|---|
committer | Mike Pagano <mpagano@gentoo.org> | 2019-05-14 16:55:01 -0400 |
commit | a79e0f8fd41f9d88de8ca425ff82e0b20c86e775 (patch) | |
tree | 28a2712b695c3574f70a3d10b68749544512c381 /1118_linux-4.14.119.patch | |
parent | Linux patch 4.14.118 (diff) | |
download | linux-patches-a79e0f8fd41f9d88de8ca425ff82e0b20c86e775.tar.gz linux-patches-a79e0f8fd41f9d88de8ca425ff82e0b20c86e775.tar.bz2 linux-patches-a79e0f8fd41f9d88de8ca425ff82e0b20c86e775.zip |
Linux patch 4.14.119
Signed-off-by: Mike Pagano <mpagano@gentoo.org>
Diffstat (limited to '1118_linux-4.14.119.patch')
-rw-r--r-- | 1118_linux-4.14.119.patch | 3641 |
1 files changed, 3641 insertions, 0 deletions
diff --git a/1118_linux-4.14.119.patch b/1118_linux-4.14.119.patch new file mode 100644 index 00000000..10316711 --- /dev/null +++ b/1118_linux-4.14.119.patch @@ -0,0 +1,3641 @@ +diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu +index 6cae60929cb6..645687b1870d 100644 +--- a/Documentation/ABI/testing/sysfs-devices-system-cpu ++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu +@@ -380,6 +380,7 @@ What: /sys/devices/system/cpu/vulnerabilities + /sys/devices/system/cpu/vulnerabilities/spectre_v2 + /sys/devices/system/cpu/vulnerabilities/spec_store_bypass + /sys/devices/system/cpu/vulnerabilities/l1tf ++ /sys/devices/system/cpu/vulnerabilities/mds + Date: January 2018 + Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> + Description: Information about CPU vulnerabilities +@@ -392,8 +393,7 @@ Description: Information about CPU vulnerabilities + "Vulnerable" CPU is affected and no mitigation in effect + "Mitigation: $M" CPU is affected and mitigation $M is in effect + +- Details about the l1tf file can be found in +- Documentation/admin-guide/l1tf.rst ++ See also: Documentation/admin-guide/hw-vuln/index.rst + + What: /sys/devices/system/cpu/smt + /sys/devices/system/cpu/smt/active +diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst +new file mode 100644 +index 000000000000..ffc064c1ec68 +--- /dev/null ++++ b/Documentation/admin-guide/hw-vuln/index.rst +@@ -0,0 +1,13 @@ ++======================== ++Hardware vulnerabilities ++======================== ++ ++This section describes CPU vulnerabilities and provides an overview of the ++possible mitigations along with guidance for selecting mitigations if they ++are configurable at compile, boot or run time. ++ ++.. toctree:: ++ :maxdepth: 1 ++ ++ l1tf ++ mds +diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst +new file mode 100644 +index 000000000000..31653a9f0e1b +--- /dev/null ++++ b/Documentation/admin-guide/hw-vuln/l1tf.rst +@@ -0,0 +1,615 @@ ++L1TF - L1 Terminal Fault ++======================== ++ ++L1 Terminal Fault is a hardware vulnerability which allows unprivileged ++speculative access to data which is available in the Level 1 Data Cache ++when the page table entry controlling the virtual address, which is used ++for the access, has the Present bit cleared or other reserved bits set. ++ ++Affected processors ++------------------- ++ ++This vulnerability affects a wide range of Intel processors. The ++vulnerability is not present on: ++ ++ - Processors from AMD, Centaur and other non Intel vendors ++ ++ - Older processor models, where the CPU family is < 6 ++ ++ - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, ++ Penwell, Pineview, Silvermont, Airmont, Merrifield) ++ ++ - The Intel XEON PHI family ++ ++ - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the ++ IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected ++ by the Meltdown vulnerability either. These CPUs should become ++ available by end of 2018. ++ ++Whether a processor is affected or not can be read out from the L1TF ++vulnerability file in sysfs. See :ref:`l1tf_sys_info`. 
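For readers who want to check the state described above on a running system, a minimal shell sketch (file paths as documented in this patch; the exact output strings depend on CPU and kernel configuration):

    # List the reported status of all enumerated CPU vulnerabilities,
    # including the l1tf and (after this patch) mds entries.
    grep . /sys/devices/system/cpu/vulnerabilities/*

    # Query the L1TF entry on its own; it reports 'Not affected' or a
    # 'Mitigation: ...' string as described in the sysfs ABI hunk above.
    cat /sys/devices/system/cpu/vulnerabilities/l1tf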
++ ++Related CVEs ++------------ ++ ++The following CVE entries are related to the L1TF vulnerability: ++ ++ ============= ================= ============================== ++ CVE-2018-3615 L1 Terminal Fault SGX related aspects ++ CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects ++ CVE-2018-3646 L1 Terminal Fault Virtualization related aspects ++ ============= ================= ============================== ++ ++Problem ++------- ++ ++If an instruction accesses a virtual address for which the relevant page ++table entry (PTE) has the Present bit cleared or other reserved bits set, ++then speculative execution ignores the invalid PTE and loads the referenced ++data if it is present in the Level 1 Data Cache, as if the page referenced ++by the address bits in the PTE was still present and accessible. ++ ++While this is a purely speculative mechanism and the instruction will raise ++a page fault when it is retired eventually, the pure act of loading the ++data and making it available to other speculative instructions opens up the ++opportunity for side channel attacks to unprivileged malicious code, ++similar to the Meltdown attack. ++ ++While Meltdown breaks the user space to kernel space protection, L1TF ++allows to attack any physical memory address in the system and the attack ++works across all protection domains. It allows an attack of SGX and also ++works from inside virtual machines because the speculation bypasses the ++extended page table (EPT) protection mechanism. ++ ++ ++Attack scenarios ++---------------- ++ ++1. Malicious user space ++^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ Operating Systems store arbitrary information in the address bits of a ++ PTE which is marked non present. This allows a malicious user space ++ application to attack the physical memory to which these PTEs resolve. ++ In some cases user-space can maliciously influence the information ++ encoded in the address bits of the PTE, thus making attacks more ++ deterministic and more practical. ++ ++ The Linux kernel contains a mitigation for this attack vector, PTE ++ inversion, which is permanently enabled and has no performance ++ impact. The kernel ensures that the address bits of PTEs, which are not ++ marked present, never point to cacheable physical memory space. ++ ++ A system with an up to date kernel is protected against attacks from ++ malicious user space applications. ++ ++2. Malicious guest in a virtual machine ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ The fact that L1TF breaks all domain protections allows malicious guest ++ OSes, which can control the PTEs directly, and malicious guest user ++ space applications, which run on an unprotected guest kernel lacking the ++ PTE inversion mitigation for L1TF, to attack physical host memory. ++ ++ A special aspect of L1TF in the context of virtualization is symmetric ++ multi threading (SMT). The Intel implementation of SMT is called ++ HyperThreading. The fact that Hyperthreads on the affected processors ++ share the L1 Data Cache (L1D) is important for this. As the flaw allows ++ only to attack data which is present in L1D, a malicious guest running ++ on one Hyperthread can attack the data which is brought into the L1D by ++ the context which runs on the sibling Hyperthread of the same physical ++ core. This context can be host OS, host user space or a different guest. ++ ++ If the processor does not support Extended Page Tables, the attack is ++ only possible, when the hypervisor does not sanitize the content of the ++ effective (shadow) page tables. 
++ ++ While solutions exist to mitigate these attack vectors fully, these ++ mitigations are not enabled by default in the Linux kernel because they ++ can affect performance significantly. The kernel provides several ++ mechanisms which can be utilized to address the problem depending on the ++ deployment scenario. The mitigations, their protection scope and impact ++ are described in the next sections. ++ ++ The default mitigations and the rationale for choosing them are explained ++ at the end of this document. See :ref:`default_mitigations`. ++ ++.. _l1tf_sys_info: ++ ++L1TF system information ++----------------------- ++ ++The Linux kernel provides a sysfs interface to enumerate the current L1TF ++status of the system: whether the system is vulnerable, and which ++mitigations are active. The relevant sysfs file is: ++ ++/sys/devices/system/cpu/vulnerabilities/l1tf ++ ++The possible values in this file are: ++ ++ =========================== =============================== ++ 'Not affected' The processor is not vulnerable ++ 'Mitigation: PTE Inversion' The host protection is active ++ =========================== =============================== ++ ++If KVM/VMX is enabled and the processor is vulnerable then the following ++information is appended to the 'Mitigation: PTE Inversion' part: ++ ++ - SMT status: ++ ++ ===================== ================ ++ 'VMX: SMT vulnerable' SMT is enabled ++ 'VMX: SMT disabled' SMT is disabled ++ ===================== ================ ++ ++ - L1D Flush mode: ++ ++ ================================ ==================================== ++ 'L1D vulnerable' L1D flushing is disabled ++ ++ 'L1D conditional cache flushes' L1D flush is conditionally enabled ++ ++ 'L1D cache flushes' L1D flush is unconditionally enabled ++ ================================ ==================================== ++ ++The resulting grade of protection is discussed in the following sections. ++ ++ ++Host mitigation mechanism ++------------------------- ++ ++The kernel is unconditionally protected against L1TF attacks from malicious ++user space running on the host. ++ ++ ++Guest mitigation mechanisms ++--------------------------- ++ ++.. _l1d_flush: ++ ++1. L1D flush on VMENTER ++^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ To make sure that a guest cannot attack data which is present in the L1D ++ the hypervisor flushes the L1D before entering the guest. ++ ++ Flushing the L1D evicts not only the data which should not be accessed ++ by a potentially malicious guest, it also flushes the guest ++ data. Flushing the L1D has a performance impact as the processor has to ++ bring the flushed guest data back into the L1D. Depending on the ++ frequency of VMEXIT/VMENTER and the type of computations in the guest ++ performance degradation in the range of 1% to 50% has been observed. For ++ scenarios where guest VMEXIT/VMENTER are rare the performance impact is ++ minimal. Virtio and mechanisms like posted interrupts are designed to ++ confine the VMEXITs to a bare minimum, but specific configurations and ++ application scenarios might still suffer from a high VMEXIT rate. ++ ++ The kernel provides two L1D flush modes: ++ - conditional ('cond') ++ - unconditional ('always') ++ ++ The conditional mode avoids L1D flushing after VMEXITs which execute ++ only audited code paths before the corresponding VMENTER. These code ++ paths have been verified that they cannot expose secrets or other ++ interesting data to an attacker, but they can leak information about the ++ address space layout of the hypervisor. 
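To see how the values listed above surface in practice, a small sketch, assuming a virtualization host with the kvm_intel module loaded; output strings vary with CPU and configuration:

    # L1TF status; on a vulnerable host with KVM/VMX enabled, the VMX SMT
    # state and L1D flush mode are appended to 'Mitigation: PTE Inversion'.
    cat /sys/devices/system/cpu/vulnerabilities/l1tf

    # Currently selected KVM L1D flush mode ('cond', 'always' or 'never').
    cat /sys/module/kvm_intel/parameters/vmentry_l1d_flush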
++ ++ Unconditional mode flushes L1D on all VMENTER invocations and provides ++ maximum protection. It has a higher overhead than the conditional ++ mode. The overhead cannot be quantified correctly as it depends on the ++ workload scenario and the resulting number of VMEXITs. ++ ++ The general recommendation is to enable L1D flush on VMENTER. The kernel ++ defaults to conditional mode on affected processors. ++ ++ **Note**, that L1D flush does not prevent the SMT problem because the ++ sibling thread will also bring back its data into the L1D which makes it ++ attackable again. ++ ++ L1D flush can be controlled by the administrator via the kernel command ++ line and sysfs control files. See :ref:`mitigation_control_command_line` ++ and :ref:`mitigation_control_kvm`. ++ ++.. _guest_confinement: ++ ++2. Guest VCPU confinement to dedicated physical cores ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ To address the SMT problem, it is possible to make a guest or a group of ++ guests affine to one or more physical cores. The proper mechanism for ++ that is to utilize exclusive cpusets to ensure that no other guest or ++ host tasks can run on these cores. ++ ++ If only a single guest or related guests run on sibling SMT threads on ++ the same physical core then they can only attack their own memory and ++ restricted parts of the host memory. ++ ++ Host memory is attackable, when one of the sibling SMT threads runs in ++ host OS (hypervisor) context and the other in guest context. The amount ++ of valuable information from the host OS context depends on the context ++ which the host OS executes, i.e. interrupts, soft interrupts and kernel ++ threads. The amount of valuable data from these contexts cannot be ++ declared as non-interesting for an attacker without deep inspection of ++ the code. ++ ++ **Note**, that assigning guests to a fixed set of physical cores affects ++ the ability of the scheduler to do load balancing and might have ++ negative effects on CPU utilization depending on the hosting ++ scenario. Disabling SMT might be a viable alternative for particular ++ scenarios. ++ ++ For further information about confining guests to a single or to a group ++ of cores consult the cpusets documentation: ++ ++ https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt ++ ++.. _interrupt_isolation: ++ ++3. Interrupt affinity ++^^^^^^^^^^^^^^^^^^^^^ ++ ++ Interrupts can be made affine to logical CPUs. This is not universally ++ true because there are types of interrupts which are truly per CPU ++ interrupts, e.g. the local timer interrupt. Aside of that multi queue ++ devices affine their interrupts to single CPUs or groups of CPUs per ++ queue without allowing the administrator to control the affinities. ++ ++ Moving the interrupts, which can be affinity controlled, away from CPUs ++ which run untrusted guests, reduces the attack vector space. ++ ++ Whether the interrupts with are affine to CPUs, which run untrusted ++ guests, provide interesting data for an attacker depends on the system ++ configuration and the scenarios which run on the system. While for some ++ of the interrupts it can be assumed that they won't expose interesting ++ information beyond exposing hints about the host OS memory layout, there ++ is no way to make general assumptions. ++ ++ Interrupt affinity can be controlled by the administrator via the ++ /proc/irq/$NR/smp_affinity[_list] files. 
Limited documentation is ++ available at: ++ ++ https://www.kernel.org/doc/Documentation/IRQ-affinity.txt ++ ++.. _smt_control: ++ ++4. SMT control ++^^^^^^^^^^^^^^ ++ ++ To prevent the SMT issues of L1TF it might be necessary to disable SMT ++ completely. Disabling SMT can have a significant performance impact, but ++ the impact depends on the hosting scenario and the type of workloads. ++ The impact of disabling SMT needs also to be weighted against the impact ++ of other mitigation solutions like confining guests to dedicated cores. ++ ++ The kernel provides a sysfs interface to retrieve the status of SMT and ++ to control it. It also provides a kernel command line interface to ++ control SMT. ++ ++ The kernel command line interface consists of the following options: ++ ++ =========== ========================================================== ++ nosmt Affects the bring up of the secondary CPUs during boot. The ++ kernel tries to bring all present CPUs online during the ++ boot process. "nosmt" makes sure that from each physical ++ core only one - the so called primary (hyper) thread is ++ activated. Due to a design flaw of Intel processors related ++ to Machine Check Exceptions the non primary siblings have ++ to be brought up at least partially and are then shut down ++ again. "nosmt" can be undone via the sysfs interface. ++ ++ nosmt=force Has the same effect as "nosmt" but it does not allow to ++ undo the SMT disable via the sysfs interface. ++ =========== ========================================================== ++ ++ The sysfs interface provides two files: ++ ++ - /sys/devices/system/cpu/smt/control ++ - /sys/devices/system/cpu/smt/active ++ ++ /sys/devices/system/cpu/smt/control: ++ ++ This file allows to read out the SMT control state and provides the ++ ability to disable or (re)enable SMT. The possible states are: ++ ++ ============== =================================================== ++ on SMT is supported by the CPU and enabled. All ++ logical CPUs can be onlined and offlined without ++ restrictions. ++ ++ off SMT is supported by the CPU and disabled. Only ++ the so called primary SMT threads can be onlined ++ and offlined without restrictions. An attempt to ++ online a non-primary sibling is rejected ++ ++ forceoff Same as 'off' but the state cannot be controlled. ++ Attempts to write to the control file are rejected. ++ ++ notsupported The processor does not support SMT. It's therefore ++ not affected by the SMT implications of L1TF. ++ Attempts to write to the control file are rejected. ++ ============== =================================================== ++ ++ The possible states which can be written into this file to control SMT ++ state are: ++ ++ - on ++ - off ++ - forceoff ++ ++ /sys/devices/system/cpu/smt/active: ++ ++ This file reports whether SMT is enabled and active, i.e. if on any ++ physical core two or more sibling threads are online. ++ ++ SMT control is also possible at boot time via the l1tf kernel command ++ line parameter in combination with L1D flush control. See ++ :ref:`mitigation_control_command_line`. ++ ++5. Disabling EPT ++^^^^^^^^^^^^^^^^ ++ ++ Disabling EPT for virtual machines provides full mitigation for L1TF even ++ with SMT enabled, because the effective page tables for guests are ++ managed and sanitized by the hypervisor. Though disabling EPT has a ++ significant performance impact especially when the Meltdown mitigation ++ KPTI is enabled. ++ ++ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. 
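The SMT and interrupt-affinity controls described above are plain sysfs/procfs reads and writes. A sketch, with the IRQ number and CPU ranges purely illustrative:

    # Inspect the current SMT state (active: 0/1) and control setting.
    cat /sys/devices/system/cpu/smt/active
    cat /sys/devices/system/cpu/smt/control

    # Disable SMT at run time; reversible unless SMT was force-disabled
    # (e.g. via 'nosmt=force' on the kernel command line).
    echo off > /sys/devices/system/cpu/smt/control

    # Move an affinity-controllable interrupt (hypothetical IRQ 42) onto
    # CPUs 0-1, away from CPUs that run untrusted guests.
    echo 0-1 > /proc/irq/42/smp_affinity_list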
++ ++There is ongoing research and development for new mitigation mechanisms to ++address the performance impact of disabling SMT or EPT. ++ ++.. _mitigation_control_command_line: ++ ++Mitigation control on the kernel command line ++--------------------------------------------- ++ ++The kernel command line allows to control the L1TF mitigations at boot ++time with the option "l1tf=". The valid arguments for this option are: ++ ++ ============ ============================================================= ++ full Provides all available mitigations for the L1TF ++ vulnerability. Disables SMT and enables all mitigations in ++ the hypervisors, i.e. unconditional L1D flushing ++ ++ SMT control and L1D flush control via the sysfs interface ++ is still possible after boot. Hypervisors will issue a ++ warning when the first VM is started in a potentially ++ insecure configuration, i.e. SMT enabled or L1D flush ++ disabled. ++ ++ full,force Same as 'full', but disables SMT and L1D flush runtime ++ control. Implies the 'nosmt=force' command line option. ++ (i.e. sysfs control of SMT is disabled.) ++ ++ flush Leaves SMT enabled and enables the default hypervisor ++ mitigation, i.e. conditional L1D flushing ++ ++ SMT control and L1D flush control via the sysfs interface ++ is still possible after boot. Hypervisors will issue a ++ warning when the first VM is started in a potentially ++ insecure configuration, i.e. SMT enabled or L1D flush ++ disabled. ++ ++ flush,nosmt Disables SMT and enables the default hypervisor mitigation, ++ i.e. conditional L1D flushing. ++ ++ SMT control and L1D flush control via the sysfs interface ++ is still possible after boot. Hypervisors will issue a ++ warning when the first VM is started in a potentially ++ insecure configuration, i.e. SMT enabled or L1D flush ++ disabled. ++ ++ flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is ++ started in a potentially insecure configuration. ++ ++ off Disables hypervisor mitigations and doesn't emit any ++ warnings. ++ It also drops the swap size and available RAM limit restrictions ++ on both hypervisor and bare metal. ++ ++ ============ ============================================================= ++ ++The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. ++ ++ ++.. _mitigation_control_kvm: ++ ++Mitigation control for KVM - module parameter ++------------------------------------------------------------- ++ ++The KVM hypervisor mitigation mechanism, flushing the L1D cache when ++entering a guest, can be controlled with a module parameter. ++ ++The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the ++following arguments: ++ ++ ============ ============================================================== ++ always L1D cache flush on every VMENTER. ++ ++ cond Flush L1D on VMENTER only when the code between VMEXIT and ++ VMENTER can leak host memory which is considered ++ interesting for an attacker. This still can leak host memory ++ which allows e.g. to determine the hosts address space layout. ++ ++ never Disables the mitigation ++ ============ ============================================================== ++ ++The parameter can be provided on the kernel command line, as a module ++parameter when loading the modules and at runtime modified via the sysfs ++file: ++ ++/sys/module/kvm_intel/parameters/vmentry_l1d_flush ++ ++The default is 'cond'. 
If 'l1tf=full,force' is given on the kernel command ++line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush ++module parameter is ignored and writes to the sysfs file are rejected. ++ ++.. _mitigation_selection: ++ ++Mitigation selection guide ++-------------------------- ++ ++1. No virtualization in use ++^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ The system is protected by the kernel unconditionally and no further ++ action is required. ++ ++2. Virtualization with trusted guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ If the guest comes from a trusted source and the guest OS kernel is ++ guaranteed to have the L1TF mitigations in place the system is fully ++ protected against L1TF and no further action is required. ++ ++ To avoid the overhead of the default L1D flushing on VMENTER the ++ administrator can disable the flushing via the kernel command line and ++ sysfs control files. See :ref:`mitigation_control_command_line` and ++ :ref:`mitigation_control_kvm`. ++ ++ ++3. Virtualization with untrusted guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++3.1. SMT not supported or disabled ++"""""""""""""""""""""""""""""""""" ++ ++ If SMT is not supported by the processor or disabled in the BIOS or by ++ the kernel, it's only required to enforce L1D flushing on VMENTER. ++ ++ Conditional L1D flushing is the default behaviour and can be tuned. See ++ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. ++ ++3.2. EPT not supported or disabled ++"""""""""""""""""""""""""""""""""" ++ ++ If EPT is not supported by the processor or disabled in the hypervisor, ++ the system is fully protected. SMT can stay enabled and L1D flushing on ++ VMENTER is not required. ++ ++ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. ++ ++3.3. SMT and EPT supported and active ++""""""""""""""""""""""""""""""""""""" ++ ++ If SMT and EPT are supported and active then various degrees of ++ mitigations can be employed: ++ ++ - L1D flushing on VMENTER: ++ ++ L1D flushing on VMENTER is the minimal protection requirement, but it ++ is only potent in combination with other mitigation methods. ++ ++ Conditional L1D flushing is the default behaviour and can be tuned. See ++ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. ++ ++ - Guest confinement: ++ ++ Confinement of guests to a single or a group of physical cores which ++ are not running any other processes, can reduce the attack surface ++ significantly, but interrupts, soft interrupts and kernel threads can ++ still expose valuable data to a potential attacker. See ++ :ref:`guest_confinement`. ++ ++ - Interrupt isolation: ++ ++ Isolating the guest CPUs from interrupts can reduce the attack surface ++ further, but still allows a malicious guest to explore a limited amount ++ of host physical memory. This can at least be used to gain knowledge ++ about the host address space layout. The interrupts which have a fixed ++ affinity to the CPUs which run the untrusted guests can depending on ++ the scenario still trigger soft interrupts and schedule kernel threads ++ which might expose valuable information. See ++ :ref:`interrupt_isolation`. ++ ++The above three mitigation methods combined can provide protection to a ++certain degree, but the risk of the remaining attack surface has to be ++carefully analyzed. For full protection the following methods are ++available: ++ ++ - Disabling SMT: ++ ++ Disabling SMT and enforcing the L1D flushing provides the maximum ++ amount of protection. 
This mitigation is not depending on any of the ++ above mitigation methods. ++ ++ SMT control and L1D flushing can be tuned by the command line ++ parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run ++ time with the matching sysfs control files. See :ref:`smt_control`, ++ :ref:`mitigation_control_command_line` and ++ :ref:`mitigation_control_kvm`. ++ ++ - Disabling EPT: ++ ++ Disabling EPT provides the maximum amount of protection as well. It is ++ not depending on any of the above mitigation methods. SMT can stay ++ enabled and L1D flushing is not required, but the performance impact is ++ significant. ++ ++ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' ++ parameter. ++ ++3.4. Nested virtual machines ++"""""""""""""""""""""""""""" ++ ++When nested virtualization is in use, three operating systems are involved: ++the bare metal hypervisor, the nested hypervisor and the nested virtual ++machine. VMENTER operations from the nested hypervisor into the nested ++guest will always be processed by the bare metal hypervisor. If KVM is the ++bare metal hypervisor it will: ++ ++ - Flush the L1D cache on every switch from the nested hypervisor to the ++ nested virtual machine, so that the nested hypervisor's secrets are not ++ exposed to the nested virtual machine; ++ ++ - Flush the L1D cache on every switch from the nested virtual machine to ++ the nested hypervisor; this is a complex operation, and flushing the L1D ++ cache avoids that the bare metal hypervisor's secrets are exposed to the ++ nested virtual machine; ++ ++ - Instruct the nested hypervisor to not perform any L1D cache flush. This ++ is an optimization to avoid double L1D flushing. ++ ++ ++.. _default_mitigations: ++ ++Default mitigations ++------------------- ++ ++ The kernel default mitigations for vulnerable processors are: ++ ++ - PTE inversion to protect against malicious user space. This is done ++ unconditionally and cannot be controlled. The swap storage is limited ++ to ~16TB. ++ ++ - L1D conditional flushing on VMENTER when EPT is enabled for ++ a guest. ++ ++ The kernel does not by default enforce the disabling of SMT, which leaves ++ SMT systems vulnerable when running untrusted guests with EPT enabled. ++ ++ The rationale for this choice is: ++ ++ - Force disabling SMT can break existing setups, especially with ++ unattended updates. ++ ++ - If regular users run untrusted guests on their machine, then L1TF is ++ just an add on to other malware which might be embedded in an untrusted ++ guest, e.g. spam-bots or attacks on the local network. ++ ++ There is no technical way to prevent a user from running untrusted code ++ on their machines blindly. ++ ++ - It's technically extremely unlikely and from today's knowledge even ++ impossible that L1TF can be exploited via the most popular attack ++ mechanisms like JavaScript because these mechanisms have no way to ++ control PTEs. If this would be possible and not other mitigation would ++ be possible, then the default might be different. ++ ++ - The administrators of cloud and hosting setups have to carefully ++ analyze the risk for their scenarios and make the appropriate ++ mitigation choices, which might even vary across their deployed ++ machines and also result in other changes of their overall setup. ++ There is no way for the kernel to provide a sensible default for this ++ kind of scenarios. 
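Pulling the l1tf controls above together, an illustrative host configuration sketch (bootloader mechanics are distribution-specific and not prescribed by this document):

    # Boot time: keep conditional L1D flushing and disable SMT by adding
    #   l1tf=flush,nosmt
    # to the kernel command line in the bootloader configuration.

    # Module load time: select the KVM L1D flush mode explicitly.
    modprobe kvm_intel vmentry_l1d_flush=always

    # Run time: switch the mode via sysfs (writes are rejected when
    # 'l1tf=full,force' was given on the kernel command line).
    echo cond > /sys/module/kvm_intel/parameters/vmentry_l1d_flush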
+diff --git a/Documentation/admin-guide/hw-vuln/mds.rst b/Documentation/admin-guide/hw-vuln/mds.rst +new file mode 100644 +index 000000000000..e3a796c0d3a2 +--- /dev/null ++++ b/Documentation/admin-guide/hw-vuln/mds.rst +@@ -0,0 +1,308 @@ ++MDS - Microarchitectural Data Sampling ++====================================== ++ ++Microarchitectural Data Sampling is a hardware vulnerability which allows ++unprivileged speculative access to data which is available in various CPU ++internal buffers. ++ ++Affected processors ++------------------- ++ ++This vulnerability affects a wide range of Intel processors. The ++vulnerability is not present on: ++ ++ - Processors from AMD, Centaur and other non Intel vendors ++ ++ - Older processor models, where the CPU family is < 6 ++ ++ - Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus) ++ ++ - Intel processors which have the ARCH_CAP_MDS_NO bit set in the ++ IA32_ARCH_CAPABILITIES MSR. ++ ++Whether a processor is affected or not can be read out from the MDS ++vulnerability file in sysfs. See :ref:`mds_sys_info`. ++ ++Not all processors are affected by all variants of MDS, but the mitigation ++is identical for all of them so the kernel treats them as a single ++vulnerability. ++ ++Related CVEs ++------------ ++ ++The following CVE entries are related to the MDS vulnerability: ++ ++ ============== ===== =================================================== ++ CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling ++ CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling ++ CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling ++ CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory ++ ============== ===== =================================================== ++ ++Problem ++------- ++ ++When performing store, load, L1 refill operations, processors write data ++into temporary microarchitectural structures (buffers). The data in the ++buffer can be forwarded to load operations as an optimization. ++ ++Under certain conditions, usually a fault/assist caused by a load ++operation, data unrelated to the load memory address can be speculatively ++forwarded from the buffers. Because the load operation causes a fault or ++assist and its result will be discarded, the forwarded data will not cause ++incorrect program execution or state changes. But a malicious operation ++may be able to forward this speculative data to a disclosure gadget which ++allows in turn to infer the value via a cache side channel attack. ++ ++Because the buffers are potentially shared between Hyper-Threads cross ++Hyper-Thread attacks are possible. ++ ++Deeper technical information is available in the MDS specific x86 ++architecture section: :ref:`Documentation/x86/mds.rst <mds>`. ++ ++ ++Attack scenarios ++---------------- ++ ++Attacks against the MDS vulnerabilities can be mounted from malicious non ++priviledged user space applications running on hosts or guest. Malicious ++guest OSes can obviously mount attacks as well. ++ ++Contrary to other speculation based vulnerabilities the MDS vulnerability ++does not allow the attacker to control the memory target address. As a ++consequence the attacks are purely sampling based, but as demonstrated with ++the TLBleed attack samples can be postprocessed successfully. ++ ++Web-Browsers ++^^^^^^^^^^^^ ++ ++ It's unclear whether attacks through Web-Browsers are possible at ++ all. 
The exploitation through Java-Script is considered very unlikely, ++ but other widely used web technologies like Webassembly could possibly be ++ abused. ++ ++ ++.. _mds_sys_info: ++ ++MDS system information ++----------------------- ++ ++The Linux kernel provides a sysfs interface to enumerate the current MDS ++status of the system: whether the system is vulnerable, and which ++mitigations are active. The relevant sysfs file is: ++ ++/sys/devices/system/cpu/vulnerabilities/mds ++ ++The possible values in this file are: ++ ++ .. list-table:: ++ ++ * - 'Not affected' ++ - The processor is not vulnerable ++ * - 'Vulnerable' ++ - The processor is vulnerable, but no mitigation enabled ++ * - 'Vulnerable: Clear CPU buffers attempted, no microcode' ++ - The processor is vulnerable but microcode is not updated. ++ ++ The mitigation is enabled on a best effort basis. See :ref:`vmwerv` ++ * - 'Mitigation: Clear CPU buffers' ++ - The processor is vulnerable and the CPU buffer clearing mitigation is ++ enabled. ++ ++If the processor is vulnerable then the following information is appended ++to the above information: ++ ++ ======================== ============================================ ++ 'SMT vulnerable' SMT is enabled ++ 'SMT mitigated' SMT is enabled and mitigated ++ 'SMT disabled' SMT is disabled ++ 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown ++ ======================== ============================================ ++ ++.. _vmwerv: ++ ++Best effort mitigation mode ++^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ If the processor is vulnerable, but the availability of the microcode based ++ mitigation mechanism is not advertised via CPUID the kernel selects a best ++ effort mitigation mode. This mode invokes the mitigation instructions ++ without a guarantee that they clear the CPU buffers. ++ ++ This is done to address virtualization scenarios where the host has the ++ microcode update applied, but the hypervisor is not yet updated to expose ++ the CPUID to the guest. If the host has updated microcode the protection ++ takes effect otherwise a few cpu cycles are wasted pointlessly. ++ ++ The state in the mds sysfs file reflects this situation accordingly. ++ ++ ++Mitigation mechanism ++------------------------- ++ ++The kernel detects the affected CPUs and the presence of the microcode ++which is required. ++ ++If a CPU is affected and the microcode is available, then the kernel ++enables the mitigation by default. The mitigation can be controlled at boot ++time via a kernel command line option. See ++:ref:`mds_mitigation_control_command_line`. ++ ++.. _cpu_buffer_clear: ++ ++CPU buffer clearing ++^^^^^^^^^^^^^^^^^^^ ++ ++ The mitigation for MDS clears the affected CPU buffers on return to user ++ space and when entering a guest. ++ ++ If SMT is enabled it also clears the buffers on idle entry when the CPU ++ is only affected by MSBDS and not any other MDS variant, because the ++ other variants cannot be protected against cross Hyper-Thread attacks. ++ ++ For CPUs which are only affected by MSBDS the user space, guest and idle ++ transition mitigations are sufficient and SMT is not affected. ++ ++.. _virt_mechanism: ++ ++Virtualization mitigation ++^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ The protection for host to guest transition depends on the L1TF ++ vulnerability of the CPU: ++ ++ - CPU is affected by L1TF: ++ ++ If the L1D flush mitigation is enabled and up to date microcode is ++ available, the L1D flush mitigation is automatically protecting the ++ guest transition. 
++ ++ If the L1D flush mitigation is disabled then the MDS mitigation is ++ invoked explicit when the host MDS mitigation is enabled. ++ ++ For details on L1TF and virtualization see: ++ :ref:`Documentation/admin-guide/hw-vuln//l1tf.rst <mitigation_control_kvm>`. ++ ++ - CPU is not affected by L1TF: ++ ++ CPU buffers are flushed before entering the guest when the host MDS ++ mitigation is enabled. ++ ++ The resulting MDS protection matrix for the host to guest transition: ++ ++ ============ ===== ============= ============ ================= ++ L1TF MDS VMX-L1FLUSH Host MDS MDS-State ++ ++ Don't care No Don't care N/A Not affected ++ ++ Yes Yes Disabled Off Vulnerable ++ ++ Yes Yes Disabled Full Mitigated ++ ++ Yes Yes Enabled Don't care Mitigated ++ ++ No Yes N/A Off Vulnerable ++ ++ No Yes N/A Full Mitigated ++ ============ ===== ============= ============ ================= ++ ++ This only covers the host to guest transition, i.e. prevents leakage from ++ host to guest, but does not protect the guest internally. Guests need to ++ have their own protections. ++ ++.. _xeon_phi: ++ ++XEON PHI specific considerations ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ The XEON PHI processor family is affected by MSBDS which can be exploited ++ cross Hyper-Threads when entering idle states. Some XEON PHI variants allow ++ to use MWAIT in user space (Ring 3) which opens an potential attack vector ++ for malicious user space. The exposure can be disabled on the kernel ++ command line with the 'ring3mwait=disable' command line option. ++ ++ XEON PHI is not affected by the other MDS variants and MSBDS is mitigated ++ before the CPU enters a idle state. As XEON PHI is not affected by L1TF ++ either disabling SMT is not required for full protection. ++ ++.. _mds_smt_control: ++ ++SMT control ++^^^^^^^^^^^ ++ ++ All MDS variants except MSBDS can be attacked cross Hyper-Threads. That ++ means on CPUs which are affected by MFBDS or MLPDS it is necessary to ++ disable SMT for full protection. These are most of the affected CPUs; the ++ exception is XEON PHI, see :ref:`xeon_phi`. ++ ++ Disabling SMT can have a significant performance impact, but the impact ++ depends on the type of workloads. ++ ++ See the relevant chapter in the L1TF mitigation documentation for details: ++ :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <smt_control>`. ++ ++ ++.. _mds_mitigation_control_command_line: ++ ++Mitigation control on the kernel command line ++--------------------------------------------- ++ ++The kernel command line allows to control the MDS mitigations at boot ++time with the option "mds=". The valid arguments for this option are: ++ ++ ============ ============================================================= ++ full If the CPU is vulnerable, enable all available mitigations ++ for the MDS vulnerability, CPU buffer clearing on exit to ++ userspace and when entering a VM. Idle transitions are ++ protected as well if SMT is enabled. ++ ++ It does not automatically disable SMT. ++ ++ full,nosmt The same as mds=full, with SMT disabled on vulnerable ++ CPUs. This is the complete mitigation. ++ ++ off Disables MDS mitigations completely. ++ ++ ============ ============================================================= ++ ++Not specifying this option is equivalent to "mds=full". ++ ++ ++Mitigation selection guide ++-------------------------- ++ ++1. 
Trusted userspace ++^^^^^^^^^^^^^^^^^^^^ ++ ++ If all userspace applications are from a trusted source and do not ++ execute untrusted code which is supplied externally, then the mitigation ++ can be disabled. ++ ++ ++2. Virtualization with trusted guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ The same considerations as above versus trusted user space apply. ++ ++3. Virtualization with untrusted guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ The protection depends on the state of the L1TF mitigations. ++ See :ref:`virt_mechanism`. ++ ++ If the MDS mitigation is enabled and SMT is disabled, guest to host and ++ guest to guest attacks are prevented. ++ ++.. _mds_default_mitigations: ++ ++Default mitigations ++------------------- ++ ++ The kernel default mitigations for vulnerable processors are: ++ ++ - Enable CPU buffer clearing ++ ++ The kernel does not by default enforce the disabling of SMT, which leaves ++ SMT systems vulnerable when running untrusted code. The same rationale as ++ for L1TF applies. ++ See :ref:`Documentation/admin-guide/hw-vuln//l1tf.rst <default_mitigations>`. +diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst +index 78f8f00c369f..f8d4e9af01dc 100644 +--- a/Documentation/admin-guide/index.rst ++++ b/Documentation/admin-guide/index.rst +@@ -17,14 +17,12 @@ etc. + kernel-parameters + devices + +-This section describes CPU vulnerabilities and provides an overview of the +-possible mitigations along with guidance for selecting mitigations if they +-are configurable at compile, boot or run time. ++This section describes CPU vulnerabilities and their mitigations. + + .. toctree:: + :maxdepth: 1 + +- l1tf ++ hw-vuln/index + + Here is a set of documents aimed at users who are trying to track down + problems and bugs in particular. +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 94fa46d2d805..9240b2caa0b1 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1971,7 +1971,7 @@ + + Default is 'flush'. + +- For details see: Documentation/admin-guide/l1tf.rst ++ For details see: Documentation/admin-guide/hw-vuln/l1tf.rst + + l2cr= [PPC] + +@@ -2214,6 +2214,32 @@ + Format: <first>,<last> + Specifies range of consoles to be captured by the MDA. + ++ mds= [X86,INTEL] ++ Control mitigation for the Micro-architectural Data ++ Sampling (MDS) vulnerability. ++ ++ Certain CPUs are vulnerable to an exploit against CPU ++ internal buffers which can forward information to a ++ disclosure gadget under certain conditions. ++ ++ In vulnerable processors, the speculatively ++ forwarded data can be used in a cache side channel ++ attack, to access data to which the attacker does ++ not have direct access. ++ ++ This parameter controls the MDS mitigation. The ++ options are: ++ ++ full - Enable MDS mitigation on vulnerable CPUs ++ full,nosmt - Enable MDS mitigation and disable ++ SMT on vulnerable CPUs ++ off - Unconditionally disable MDS mitigation ++ ++ Not specifying this option is equivalent to ++ mds=full. ++ ++ For details see: Documentation/admin-guide/hw-vuln/mds.rst ++ + mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory + Amount of memory to be used when the kernel is not able + to see the whole system memory or for test. 
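As a quick illustration of the new 'mds=' option documented in the kernel-parameters.txt hunk above (a sketch only; how the command line is edited depends on the bootloader):

    # Confirm which mitigation-related options the running kernel booted with.
    cat /proc/cmdline

    # Example fragment enabling the full MDS mitigation and disabling SMT
    # on vulnerable CPUs:
    #   mds=full,nosmt

    # After reboot, check the resulting state in sysfs.
    cat /sys/devices/system/cpu/vulnerabilities/mds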
+@@ -2362,6 +2388,40 @@ + in the "bleeding edge" mini2440 support kernel at + http://repo.or.cz/w/linux-2.6/mini2440.git + ++ mitigations= ++ [X86,PPC,S390] Control optional mitigations for CPU ++ vulnerabilities. This is a set of curated, ++ arch-independent options, each of which is an ++ aggregation of existing arch-specific options. ++ ++ off ++ Disable all optional CPU mitigations. This ++ improves system performance, but it may also ++ expose users to several CPU vulnerabilities. ++ Equivalent to: nopti [X86,PPC] ++ nospectre_v1 [PPC] ++ nobp=0 [S390] ++ nospectre_v2 [X86,PPC,S390] ++ spectre_v2_user=off [X86] ++ spec_store_bypass_disable=off [X86,PPC] ++ l1tf=off [X86] ++ mds=off [X86] ++ ++ auto (default) ++ Mitigate all CPU vulnerabilities, but leave SMT ++ enabled, even if it's vulnerable. This is for ++ users who don't want to be surprised by SMT ++ getting disabled across kernel upgrades, or who ++ have other ways of avoiding SMT-based attacks. ++ Equivalent to: (default behavior) ++ ++ auto,nosmt ++ Mitigate all CPU vulnerabilities, disabling SMT ++ if needed. This is for users who always want to ++ be fully mitigated, even if it means losing SMT. ++ Equivalent to: l1tf=flush,nosmt [X86] ++ mds=full,nosmt [X86] ++ + mminit_loglevel= + [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this + parameter allows control of the logging verbosity for +diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst +deleted file mode 100644 +index 9f5924f81f89..000000000000 +--- a/Documentation/admin-guide/l1tf.rst ++++ /dev/null +@@ -1,614 +0,0 @@ +-L1TF - L1 Terminal Fault +-======================== +- +-L1 Terminal Fault is a hardware vulnerability which allows unprivileged +-speculative access to data which is available in the Level 1 Data Cache +-when the page table entry controlling the virtual address, which is used +-for the access, has the Present bit cleared or other reserved bits set. +- +-Affected processors +-------------------- +- +-This vulnerability affects a wide range of Intel processors. The +-vulnerability is not present on: +- +- - Processors from AMD, Centaur and other non Intel vendors +- +- - Older processor models, where the CPU family is < 6 +- +- - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, +- Penwell, Pineview, Silvermont, Airmont, Merrifield) +- +- - The Intel XEON PHI family +- +- - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the +- IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected +- by the Meltdown vulnerability either. These CPUs should become +- available by end of 2018. +- +-Whether a processor is affected or not can be read out from the L1TF +-vulnerability file in sysfs. See :ref:`l1tf_sys_info`. 
+- +-Related CVEs +------------- +- +-The following CVE entries are related to the L1TF vulnerability: +- +- ============= ================= ============================== +- CVE-2018-3615 L1 Terminal Fault SGX related aspects +- CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects +- CVE-2018-3646 L1 Terminal Fault Virtualization related aspects +- ============= ================= ============================== +- +-Problem +-------- +- +-If an instruction accesses a virtual address for which the relevant page +-table entry (PTE) has the Present bit cleared or other reserved bits set, +-then speculative execution ignores the invalid PTE and loads the referenced +-data if it is present in the Level 1 Data Cache, as if the page referenced +-by the address bits in the PTE was still present and accessible. +- +-While this is a purely speculative mechanism and the instruction will raise +-a page fault when it is retired eventually, the pure act of loading the +-data and making it available to other speculative instructions opens up the +-opportunity for side channel attacks to unprivileged malicious code, +-similar to the Meltdown attack. +- +-While Meltdown breaks the user space to kernel space protection, L1TF +-allows to attack any physical memory address in the system and the attack +-works across all protection domains. It allows an attack of SGX and also +-works from inside virtual machines because the speculation bypasses the +-extended page table (EPT) protection mechanism. +- +- +-Attack scenarios +----------------- +- +-1. Malicious user space +-^^^^^^^^^^^^^^^^^^^^^^^ +- +- Operating Systems store arbitrary information in the address bits of a +- PTE which is marked non present. This allows a malicious user space +- application to attack the physical memory to which these PTEs resolve. +- In some cases user-space can maliciously influence the information +- encoded in the address bits of the PTE, thus making attacks more +- deterministic and more practical. +- +- The Linux kernel contains a mitigation for this attack vector, PTE +- inversion, which is permanently enabled and has no performance +- impact. The kernel ensures that the address bits of PTEs, which are not +- marked present, never point to cacheable physical memory space. +- +- A system with an up to date kernel is protected against attacks from +- malicious user space applications. +- +-2. Malicious guest in a virtual machine +-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- +- The fact that L1TF breaks all domain protections allows malicious guest +- OSes, which can control the PTEs directly, and malicious guest user +- space applications, which run on an unprotected guest kernel lacking the +- PTE inversion mitigation for L1TF, to attack physical host memory. +- +- A special aspect of L1TF in the context of virtualization is symmetric +- multi threading (SMT). The Intel implementation of SMT is called +- HyperThreading. The fact that Hyperthreads on the affected processors +- share the L1 Data Cache (L1D) is important for this. As the flaw allows +- only to attack data which is present in L1D, a malicious guest running +- on one Hyperthread can attack the data which is brought into the L1D by +- the context which runs on the sibling Hyperthread of the same physical +- core. This context can be host OS, host user space or a different guest. +- +- If the processor does not support Extended Page Tables, the attack is +- only possible, when the hypervisor does not sanitize the content of the +- effective (shadow) page tables. 
+- +- While solutions exist to mitigate these attack vectors fully, these +- mitigations are not enabled by default in the Linux kernel because they +- can affect performance significantly. The kernel provides several +- mechanisms which can be utilized to address the problem depending on the +- deployment scenario. The mitigations, their protection scope and impact +- are described in the next sections. +- +- The default mitigations and the rationale for choosing them are explained +- at the end of this document. See :ref:`default_mitigations`. +- +-.. _l1tf_sys_info: +- +-L1TF system information +------------------------ +- +-The Linux kernel provides a sysfs interface to enumerate the current L1TF +-status of the system: whether the system is vulnerable, and which +-mitigations are active. The relevant sysfs file is: +- +-/sys/devices/system/cpu/vulnerabilities/l1tf +- +-The possible values in this file are: +- +- =========================== =============================== +- 'Not affected' The processor is not vulnerable +- 'Mitigation: PTE Inversion' The host protection is active +- =========================== =============================== +- +-If KVM/VMX is enabled and the processor is vulnerable then the following +-information is appended to the 'Mitigation: PTE Inversion' part: +- +- - SMT status: +- +- ===================== ================ +- 'VMX: SMT vulnerable' SMT is enabled +- 'VMX: SMT disabled' SMT is disabled +- ===================== ================ +- +- - L1D Flush mode: +- +- ================================ ==================================== +- 'L1D vulnerable' L1D flushing is disabled +- +- 'L1D conditional cache flushes' L1D flush is conditionally enabled +- +- 'L1D cache flushes' L1D flush is unconditionally enabled +- ================================ ==================================== +- +-The resulting grade of protection is discussed in the following sections. +- +- +-Host mitigation mechanism +-------------------------- +- +-The kernel is unconditionally protected against L1TF attacks from malicious +-user space running on the host. +- +- +-Guest mitigation mechanisms +---------------------------- +- +-.. _l1d_flush: +- +-1. L1D flush on VMENTER +-^^^^^^^^^^^^^^^^^^^^^^^ +- +- To make sure that a guest cannot attack data which is present in the L1D +- the hypervisor flushes the L1D before entering the guest. +- +- Flushing the L1D evicts not only the data which should not be accessed +- by a potentially malicious guest, it also flushes the guest +- data. Flushing the L1D has a performance impact as the processor has to +- bring the flushed guest data back into the L1D. Depending on the +- frequency of VMEXIT/VMENTER and the type of computations in the guest +- performance degradation in the range of 1% to 50% has been observed. For +- scenarios where guest VMEXIT/VMENTER are rare the performance impact is +- minimal. Virtio and mechanisms like posted interrupts are designed to +- confine the VMEXITs to a bare minimum, but specific configurations and +- application scenarios might still suffer from a high VMEXIT rate. +- +- The kernel provides two L1D flush modes: +- - conditional ('cond') +- - unconditional ('always') +- +- The conditional mode avoids L1D flushing after VMEXITs which execute +- only audited code paths before the corresponding VMENTER. These code +- paths have been verified that they cannot expose secrets or other +- interesting data to an attacker, but they can leak information about the +- address space layout of the hypervisor. 
+- +- Unconditional mode flushes L1D on all VMENTER invocations and provides +- maximum protection. It has a higher overhead than the conditional +- mode. The overhead cannot be quantified correctly as it depends on the +- workload scenario and the resulting number of VMEXITs. +- +- The general recommendation is to enable L1D flush on VMENTER. The kernel +- defaults to conditional mode on affected processors. +- +- **Note**, that L1D flush does not prevent the SMT problem because the +- sibling thread will also bring back its data into the L1D which makes it +- attackable again. +- +- L1D flush can be controlled by the administrator via the kernel command +- line and sysfs control files. See :ref:`mitigation_control_command_line` +- and :ref:`mitigation_control_kvm`. +- +-.. _guest_confinement: +- +-2. Guest VCPU confinement to dedicated physical cores +-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- +- To address the SMT problem, it is possible to make a guest or a group of +- guests affine to one or more physical cores. The proper mechanism for +- that is to utilize exclusive cpusets to ensure that no other guest or +- host tasks can run on these cores. +- +- If only a single guest or related guests run on sibling SMT threads on +- the same physical core then they can only attack their own memory and +- restricted parts of the host memory. +- +- Host memory is attackable, when one of the sibling SMT threads runs in +- host OS (hypervisor) context and the other in guest context. The amount +- of valuable information from the host OS context depends on the context +- which the host OS executes, i.e. interrupts, soft interrupts and kernel +- threads. The amount of valuable data from these contexts cannot be +- declared as non-interesting for an attacker without deep inspection of +- the code. +- +- **Note**, that assigning guests to a fixed set of physical cores affects +- the ability of the scheduler to do load balancing and might have +- negative effects on CPU utilization depending on the hosting +- scenario. Disabling SMT might be a viable alternative for particular +- scenarios. +- +- For further information about confining guests to a single or to a group +- of cores consult the cpusets documentation: +- +- https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt +- +-.. _interrupt_isolation: +- +-3. Interrupt affinity +-^^^^^^^^^^^^^^^^^^^^^ +- +- Interrupts can be made affine to logical CPUs. This is not universally +- true because there are types of interrupts which are truly per CPU +- interrupts, e.g. the local timer interrupt. Aside of that multi queue +- devices affine their interrupts to single CPUs or groups of CPUs per +- queue without allowing the administrator to control the affinities. +- +- Moving the interrupts, which can be affinity controlled, away from CPUs +- which run untrusted guests, reduces the attack vector space. +- +- Whether the interrupts with are affine to CPUs, which run untrusted +- guests, provide interesting data for an attacker depends on the system +- configuration and the scenarios which run on the system. While for some +- of the interrupts it can be assumed that they won't expose interesting +- information beyond exposing hints about the host OS memory layout, there +- is no way to make general assumptions. +- +- Interrupt affinity can be controlled by the administrator via the +- /proc/irq/$NR/smp_affinity[_list] files. 
Limited documentation is +- available at: +- +- https://www.kernel.org/doc/Documentation/IRQ-affinity.txt +- +-.. _smt_control: +- +-4. SMT control +-^^^^^^^^^^^^^^ +- +- To prevent the SMT issues of L1TF it might be necessary to disable SMT +- completely. Disabling SMT can have a significant performance impact, but +- the impact depends on the hosting scenario and the type of workloads. +- The impact of disabling SMT needs also to be weighted against the impact +- of other mitigation solutions like confining guests to dedicated cores. +- +- The kernel provides a sysfs interface to retrieve the status of SMT and +- to control it. It also provides a kernel command line interface to +- control SMT. +- +- The kernel command line interface consists of the following options: +- +- =========== ========================================================== +- nosmt Affects the bring up of the secondary CPUs during boot. The +- kernel tries to bring all present CPUs online during the +- boot process. "nosmt" makes sure that from each physical +- core only one - the so called primary (hyper) thread is +- activated. Due to a design flaw of Intel processors related +- to Machine Check Exceptions the non primary siblings have +- to be brought up at least partially and are then shut down +- again. "nosmt" can be undone via the sysfs interface. +- +- nosmt=force Has the same effect as "nosmt" but it does not allow to +- undo the SMT disable via the sysfs interface. +- =========== ========================================================== +- +- The sysfs interface provides two files: +- +- - /sys/devices/system/cpu/smt/control +- - /sys/devices/system/cpu/smt/active +- +- /sys/devices/system/cpu/smt/control: +- +- This file allows to read out the SMT control state and provides the +- ability to disable or (re)enable SMT. The possible states are: +- +- ============== =================================================== +- on SMT is supported by the CPU and enabled. All +- logical CPUs can be onlined and offlined without +- restrictions. +- +- off SMT is supported by the CPU and disabled. Only +- the so called primary SMT threads can be onlined +- and offlined without restrictions. An attempt to +- online a non-primary sibling is rejected +- +- forceoff Same as 'off' but the state cannot be controlled. +- Attempts to write to the control file are rejected. +- +- notsupported The processor does not support SMT. It's therefore +- not affected by the SMT implications of L1TF. +- Attempts to write to the control file are rejected. +- ============== =================================================== +- +- The possible states which can be written into this file to control SMT +- state are: +- +- - on +- - off +- - forceoff +- +- /sys/devices/system/cpu/smt/active: +- +- This file reports whether SMT is enabled and active, i.e. if on any +- physical core two or more sibling threads are online. +- +- SMT control is also possible at boot time via the l1tf kernel command +- line parameter in combination with L1D flush control. See +- :ref:`mitigation_control_command_line`. +- +-5. Disabling EPT +-^^^^^^^^^^^^^^^^ +- +- Disabling EPT for virtual machines provides full mitigation for L1TF even +- with SMT enabled, because the effective page tables for guests are +- managed and sanitized by the hypervisor. Though disabling EPT has a +- significant performance impact especially when the Meltdown mitigation +- KPTI is enabled. +- +- EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. 
+- +-There is ongoing research and development for new mitigation mechanisms to +-address the performance impact of disabling SMT or EPT. +- +-.. _mitigation_control_command_line: +- +-Mitigation control on the kernel command line +---------------------------------------------- +- +-The kernel command line allows to control the L1TF mitigations at boot +-time with the option "l1tf=". The valid arguments for this option are: +- +- ============ ============================================================= +- full Provides all available mitigations for the L1TF +- vulnerability. Disables SMT and enables all mitigations in +- the hypervisors, i.e. unconditional L1D flushing +- +- SMT control and L1D flush control via the sysfs interface +- is still possible after boot. Hypervisors will issue a +- warning when the first VM is started in a potentially +- insecure configuration, i.e. SMT enabled or L1D flush +- disabled. +- +- full,force Same as 'full', but disables SMT and L1D flush runtime +- control. Implies the 'nosmt=force' command line option. +- (i.e. sysfs control of SMT is disabled.) +- +- flush Leaves SMT enabled and enables the default hypervisor +- mitigation, i.e. conditional L1D flushing +- +- SMT control and L1D flush control via the sysfs interface +- is still possible after boot. Hypervisors will issue a +- warning when the first VM is started in a potentially +- insecure configuration, i.e. SMT enabled or L1D flush +- disabled. +- +- flush,nosmt Disables SMT and enables the default hypervisor mitigation, +- i.e. conditional L1D flushing. +- +- SMT control and L1D flush control via the sysfs interface +- is still possible after boot. Hypervisors will issue a +- warning when the first VM is started in a potentially +- insecure configuration, i.e. SMT enabled or L1D flush +- disabled. +- +- flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is +- started in a potentially insecure configuration. +- +- off Disables hypervisor mitigations and doesn't emit any +- warnings. +- It also drops the swap size and available RAM limit restrictions +- on both hypervisor and bare metal. +- +- ============ ============================================================= +- +-The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. +- +- +-.. _mitigation_control_kvm: +- +-Mitigation control for KVM - module parameter +-------------------------------------------------------------- +- +-The KVM hypervisor mitigation mechanism, flushing the L1D cache when +-entering a guest, can be controlled with a module parameter. +- +-The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the +-following arguments: +- +- ============ ============================================================== +- always L1D cache flush on every VMENTER. +- +- cond Flush L1D on VMENTER only when the code between VMEXIT and +- VMENTER can leak host memory which is considered +- interesting for an attacker. This still can leak host memory +- which allows e.g. to determine the hosts address space layout. +- +- never Disables the mitigation +- ============ ============================================================== +- +-The parameter can be provided on the kernel command line, as a module +-parameter when loading the modules and at runtime modified via the sysfs +-file: +- +-/sys/module/kvm_intel/parameters/vmentry_l1d_flush +- +-The default is 'cond'. 
If 'l1tf=full,force' is given on the kernel command +-line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush +-module parameter is ignored and writes to the sysfs file are rejected. +- +- +-Mitigation selection guide +--------------------------- +- +-1. No virtualization in use +-^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- +- The system is protected by the kernel unconditionally and no further +- action is required. +- +-2. Virtualization with trusted guests +-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- +- If the guest comes from a trusted source and the guest OS kernel is +- guaranteed to have the L1TF mitigations in place the system is fully +- protected against L1TF and no further action is required. +- +- To avoid the overhead of the default L1D flushing on VMENTER the +- administrator can disable the flushing via the kernel command line and +- sysfs control files. See :ref:`mitigation_control_command_line` and +- :ref:`mitigation_control_kvm`. +- +- +-3. Virtualization with untrusted guests +-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- +-3.1. SMT not supported or disabled +-"""""""""""""""""""""""""""""""""" +- +- If SMT is not supported by the processor or disabled in the BIOS or by +- the kernel, it's only required to enforce L1D flushing on VMENTER. +- +- Conditional L1D flushing is the default behaviour and can be tuned. See +- :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. +- +-3.2. EPT not supported or disabled +-"""""""""""""""""""""""""""""""""" +- +- If EPT is not supported by the processor or disabled in the hypervisor, +- the system is fully protected. SMT can stay enabled and L1D flushing on +- VMENTER is not required. +- +- EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. +- +-3.3. SMT and EPT supported and active +-""""""""""""""""""""""""""""""""""""" +- +- If SMT and EPT are supported and active then various degrees of +- mitigations can be employed: +- +- - L1D flushing on VMENTER: +- +- L1D flushing on VMENTER is the minimal protection requirement, but it +- is only potent in combination with other mitigation methods. +- +- Conditional L1D flushing is the default behaviour and can be tuned. See +- :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. +- +- - Guest confinement: +- +- Confinement of guests to a single or a group of physical cores which +- are not running any other processes, can reduce the attack surface +- significantly, but interrupts, soft interrupts and kernel threads can +- still expose valuable data to a potential attacker. See +- :ref:`guest_confinement`. +- +- - Interrupt isolation: +- +- Isolating the guest CPUs from interrupts can reduce the attack surface +- further, but still allows a malicious guest to explore a limited amount +- of host physical memory. This can at least be used to gain knowledge +- about the host address space layout. The interrupts which have a fixed +- affinity to the CPUs which run the untrusted guests can depending on +- the scenario still trigger soft interrupts and schedule kernel threads +- which might expose valuable information. See +- :ref:`interrupt_isolation`. +- +-The above three mitigation methods combined can provide protection to a +-certain degree, but the risk of the remaining attack surface has to be +-carefully analyzed. For full protection the following methods are +-available: +- +- - Disabling SMT: +- +- Disabling SMT and enforcing the L1D flushing provides the maximum +- amount of protection. 
This mitigation is not depending on any of the +- above mitigation methods. +- +- SMT control and L1D flushing can be tuned by the command line +- parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run +- time with the matching sysfs control files. See :ref:`smt_control`, +- :ref:`mitigation_control_command_line` and +- :ref:`mitigation_control_kvm`. +- +- - Disabling EPT: +- +- Disabling EPT provides the maximum amount of protection as well. It is +- not depending on any of the above mitigation methods. SMT can stay +- enabled and L1D flushing is not required, but the performance impact is +- significant. +- +- EPT can be disabled in the hypervisor via the 'kvm-intel.ept' +- parameter. +- +-3.4. Nested virtual machines +-"""""""""""""""""""""""""""" +- +-When nested virtualization is in use, three operating systems are involved: +-the bare metal hypervisor, the nested hypervisor and the nested virtual +-machine. VMENTER operations from the nested hypervisor into the nested +-guest will always be processed by the bare metal hypervisor. If KVM is the +-bare metal hypervisor it wiil: +- +- - Flush the L1D cache on every switch from the nested hypervisor to the +- nested virtual machine, so that the nested hypervisor's secrets are not +- exposed to the nested virtual machine; +- +- - Flush the L1D cache on every switch from the nested virtual machine to +- the nested hypervisor; this is a complex operation, and flushing the L1D +- cache avoids that the bare metal hypervisor's secrets are exposed to the +- nested virtual machine; +- +- - Instruct the nested hypervisor to not perform any L1D cache flush. This +- is an optimization to avoid double L1D flushing. +- +- +-.. _default_mitigations: +- +-Default mitigations +-------------------- +- +- The kernel default mitigations for vulnerable processors are: +- +- - PTE inversion to protect against malicious user space. This is done +- unconditionally and cannot be controlled. The swap storage is limited +- to ~16TB. +- +- - L1D conditional flushing on VMENTER when EPT is enabled for +- a guest. +- +- The kernel does not by default enforce the disabling of SMT, which leaves +- SMT systems vulnerable when running untrusted guests with EPT enabled. +- +- The rationale for this choice is: +- +- - Force disabling SMT can break existing setups, especially with +- unattended updates. +- +- - If regular users run untrusted guests on their machine, then L1TF is +- just an add on to other malware which might be embedded in an untrusted +- guest, e.g. spam-bots or attacks on the local network. +- +- There is no technical way to prevent a user from running untrusted code +- on their machines blindly. +- +- - It's technically extremely unlikely and from today's knowledge even +- impossible that L1TF can be exploited via the most popular attack +- mechanisms like JavaScript because these mechanisms have no way to +- control PTEs. If this would be possible and not other mitigation would +- be possible, then the default might be different. +- +- - The administrators of cloud and hosting setups have to carefully +- analyze the risk for their scenarios and make the appropriate +- mitigation choices, which might even vary across their deployed +- machines and also result in other changes of their overall setup. +- There is no way for the kernel to provide a sensible default for this +- kind of scenarios. 
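To make the selection guide above concrete: a host in category 3.3 (untrusted guests with SMT and EPT active) that opts for the strictest documented setting would boot with

    l1tf=full,force

which, per the l1tf= table above, disables SMT, enforces unconditional L1D flushing and removes runtime control of both, while a host that wants to keep the sysfs controls available after boot could use 'l1tf=flush,nosmt' instead. Both spellings are taken from the option table above; this is only a worked example, not an additional recommendation.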
+diff --git a/Documentation/index.rst b/Documentation/index.rst +index cb7f1ba5b3b1..ccfebc260e04 100644 +--- a/Documentation/index.rst ++++ b/Documentation/index.rst +@@ -86,6 +86,7 @@ implementation. + :maxdepth: 2 + + sh/index ++ x86/index + + Korean translations + ------------------- +diff --git a/Documentation/x86/conf.py b/Documentation/x86/conf.py +new file mode 100644 +index 000000000000..33c5c3142e20 +--- /dev/null ++++ b/Documentation/x86/conf.py +@@ -0,0 +1,10 @@ ++# -*- coding: utf-8; mode: python -*- ++ ++project = "X86 architecture specific documentation" ++ ++tags.add("subproject") ++ ++latex_documents = [ ++ ('index', 'x86.tex', project, ++ 'The kernel development community', 'manual'), ++] +diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst +new file mode 100644 +index 000000000000..ef389dcf1b1d +--- /dev/null ++++ b/Documentation/x86/index.rst +@@ -0,0 +1,8 @@ ++========================== ++x86 architecture specifics ++========================== ++ ++.. toctree:: ++ :maxdepth: 1 ++ ++ mds +diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst +new file mode 100644 +index 000000000000..534e9baa4e1d +--- /dev/null ++++ b/Documentation/x86/mds.rst +@@ -0,0 +1,225 @@ ++Microarchitectural Data Sampling (MDS) mitigation ++================================================= ++ ++.. _mds: ++ ++Overview ++-------- ++ ++Microarchitectural Data Sampling (MDS) is a family of side channel attacks ++on internal buffers in Intel CPUs. The variants are: ++ ++ - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126) ++ - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130) ++ - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127) ++ - Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091) ++ ++MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a ++dependent load (store-to-load forwarding) as an optimization. The forward ++can also happen to a faulting or assisting load operation for a different ++memory address, which can be exploited under certain conditions. Store ++buffers are partitioned between Hyper-Threads so cross thread forwarding is ++not possible. But if a thread enters or exits a sleep state the store ++buffer is repartitioned which can expose data from one thread to the other. ++ ++MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage ++L1 miss situations and to hold data which is returned or sent in response ++to a memory or I/O operation. Fill buffers can forward data to a load ++operation and also write data to the cache. When the fill buffer is ++deallocated it can retain the stale data of the preceding operations which ++can then be forwarded to a faulting or assisting load operation, which can ++be exploited under certain conditions. Fill buffers are shared between ++Hyper-Threads so cross thread leakage is possible. ++ ++MLPDS leaks Load Port Data. Load ports are used to perform load operations ++from memory or I/O. The received data is then forwarded to the register ++file or a subsequent operation. In some implementations the Load Port can ++contain stale data from a previous operation which can be forwarded to ++faulting or assisting loads under certain conditions, which again can be ++exploited eventually. Load ports are shared between Hyper-Threads so cross ++thread leakage is possible. ++ ++MDSUM is a special case of MSBDS, MFBDS and MLPDS. 
An uncacheable load from ++memory that takes a fault or assist can leave data in a microarchitectural ++structure that may later be observed using one of the same methods used by ++MSBDS, MFBDS or MLPDS. ++ ++Exposure assumptions ++-------------------- ++ ++It is assumed that attack code resides in user space or in a guest with one ++exception. The rationale behind this assumption is that the code construct ++needed for exploiting MDS requires: ++ ++ - to control the load to trigger a fault or assist ++ ++ - to have a disclosure gadget which exposes the speculatively accessed ++ data for consumption through a side channel. ++ ++ - to control the pointer through which the disclosure gadget exposes the ++ data ++ ++The existence of such a construct in the kernel cannot be excluded with ++100% certainty, but the complexity involved makes it extremly unlikely. ++ ++There is one exception, which is untrusted BPF. The functionality of ++untrusted BPF is limited, but it needs to be thoroughly investigated ++whether it can be used to create such a construct. ++ ++ ++Mitigation strategy ++------------------- ++ ++All variants have the same mitigation strategy at least for the single CPU ++thread case (SMT off): Force the CPU to clear the affected buffers. ++ ++This is achieved by using the otherwise unused and obsolete VERW ++instruction in combination with a microcode update. The microcode clears ++the affected CPU buffers when the VERW instruction is executed. ++ ++For virtualization there are two ways to achieve CPU buffer ++clearing. Either the modified VERW instruction or via the L1D Flush ++command. The latter is issued when L1TF mitigation is enabled so the extra ++VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to ++be issued. ++ ++If the VERW instruction with the supplied segment selector argument is ++executed on a CPU without the microcode update there is no side effect ++other than a small number of pointlessly wasted CPU cycles. ++ ++This does not protect against cross Hyper-Thread attacks except for MSBDS ++which is only exploitable cross Hyper-thread when one of the Hyper-Threads ++enters a C-state. ++ ++The kernel provides a function to invoke the buffer clearing: ++ ++ mds_clear_cpu_buffers() ++ ++The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state ++(idle) transitions. ++ ++As a special quirk to address virtualization scenarios where the host has ++the microcode updated, but the hypervisor does not (yet) expose the ++MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the ++hope that it might actually clear the buffers. The state is reflected ++accordingly. ++ ++According to current knowledge additional mitigations inside the kernel ++itself are not required because the necessary gadgets to expose the leaked ++data cannot be controlled in a way which allows exploitation from malicious ++user space or VM guests. ++ ++Kernel internal mitigation modes ++-------------------------------- ++ ++ ======= ============================================================ ++ off Mitigation is disabled. Either the CPU is not affected or ++ mds=off is supplied on the kernel command line ++ ++ full Mitigation is enabled. CPU is affected and MD_CLEAR is ++ advertised in CPUID. ++ ++ vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not ++ advertised in CPUID. That is mainly for virtualization ++ scenarios where the host has the updated microcode but the ++ hypervisor does not expose MD_CLEAR in CPUID. 
It's a best ++ effort approach without guarantee. ++ ======= ============================================================ ++ ++If the CPU is affected and mds=off is not supplied on the kernel command ++line then the kernel selects the appropriate mitigation mode depending on ++the availability of the MD_CLEAR CPUID bit. ++ ++Mitigation points ++----------------- ++ ++1. Return to user space ++^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ When transitioning from kernel to user space the CPU buffers are flushed ++ on affected CPUs when the mitigation is not disabled on the kernel ++ command line. The migitation is enabled through the static key ++ mds_user_clear. ++ ++ The mitigation is invoked in prepare_exit_to_usermode() which covers ++ most of the kernel to user space transitions. There are a few exceptions ++ which are not invoking prepare_exit_to_usermode() on return to user ++ space. These exceptions use the paranoid exit code. ++ ++ - Non Maskable Interrupt (NMI): ++ ++ Access to sensible data like keys, credentials in the NMI context is ++ mostly theoretical: The CPU can do prefetching or execute a ++ misspeculated code path and thereby fetching data which might end up ++ leaking through a buffer. ++ ++ But for mounting other attacks the kernel stack address of the task is ++ already valuable information. So in full mitigation mode, the NMI is ++ mitigated on the return from do_nmi() to provide almost complete ++ coverage. ++ ++ - Double fault (#DF): ++ ++ A double fault is usually fatal, but the ESPFIX workaround, which can ++ be triggered from user space through modify_ldt(2) is a recoverable ++ double fault. #DF uses the paranoid exit path, so explicit mitigation ++ in the double fault handler is required. ++ ++ - Machine Check Exception (#MC): ++ ++ Another corner case is a #MC which hits between the CPU buffer clear ++ invocation and the actual return to user. As this still is in kernel ++ space it takes the paranoid exit path which does not clear the CPU ++ buffers. So the #MC handler repopulates the buffers to some ++ extent. Machine checks are not reliably controllable and the window is ++ extremly small so mitigation would just tick a checkbox that this ++ theoretical corner case is covered. To keep the amount of special ++ cases small, ignore #MC. ++ ++ - Debug Exception (#DB): ++ ++ This takes the paranoid exit path only when the INT1 breakpoint is in ++ kernel space. #DB on a user space address takes the regular exit path, ++ so no extra mitigation required. ++ ++ ++2. C-State transition ++^^^^^^^^^^^^^^^^^^^^^ ++ ++ When a CPU goes idle and enters a C-State the CPU buffers need to be ++ cleared on affected CPUs when SMT is active. This addresses the ++ repartitioning of the store buffer when one of the Hyper-Threads enters ++ a C-State. ++ ++ When SMT is inactive, i.e. either the CPU does not support it or all ++ sibling threads are offline CPU buffer clearing is not required. ++ ++ The idle clearing is enabled on CPUs which are only affected by MSBDS ++ and not by any other MDS variant. The other MDS variants cannot be ++ protected against cross Hyper-Thread attacks because the Fill Buffer and ++ the Load Ports are shared. So on CPUs affected by other variants, the ++ idle clearing would be a window dressing exercise and is therefore not ++ activated. ++ ++ The invocation is controlled by the static key mds_idle_clear which is ++ switched depending on the chosen mitigation mode and the SMT state of ++ the system. 
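Coming back to the mode selection above: whether MD_CLEAR is enumerated can also be checked from userspace. The sketch below is illustrative only; the bit position is inferred from the X86_FEATURE_MD_CLEAR definition in this patch (18*32+10, i.e. CPUID.(EAX=7,ECX=0):EDX bit 10).

/* Illustrative only: check whether CPUID enumerates MD_CLEAR. */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 7 not available");
		return 1;
	}

	printf("MD_CLEAR %s\n", (edx & (1u << 10)) ? "enumerated" : "not enumerated");
	return 0;
}

A guest only sees the bit if its hypervisor exposes it, which is the gap the 'vmwerv' mode above addresses on a best effort basis.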
++ ++ The buffer clear is only invoked before entering the C-State to prevent ++ that stale data from the idling CPU from spilling to the Hyper-Thread ++ sibling after the store buffer got repartitioned and all entries are ++ available to the non idle sibling. ++ ++ When coming out of idle the store buffer is partitioned again so each ++ sibling has half of it available. The back from idle CPU could be then ++ speculatively exposed to contents of the sibling. The buffers are ++ flushed either on exit to user space or on VMENTER so malicious code ++ in user space or the guest cannot speculatively access them. ++ ++ The mitigation is hooked into all variants of halt()/mwait(), but does ++ not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver ++ has been superseded by the intel_idle driver around 2010 and is ++ preferred on all affected CPUs which are expected to gain the MD_CLEAR ++ functionality in microcode. Aside of that the IO-Port mechanism is a ++ legacy interface which is only used on older systems which are either ++ not affected or do not receive microcode updates anymore. +diff --git a/Makefile b/Makefile +index a856457cab8f..a9de3c45a7ef 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 4 + PATCHLEVEL = 14 +-SUBLEVEL = 118 ++SUBLEVEL = 119 + EXTRAVERSION = + NAME = Petit Gorille + +diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c +index 48b50fb8dc4b..e9af5d9badf2 100644 +--- a/arch/powerpc/kernel/security.c ++++ b/arch/powerpc/kernel/security.c +@@ -56,7 +56,7 @@ void setup_barrier_nospec(void) + enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && + security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR); + +- if (!no_nospec) ++ if (!no_nospec && !cpu_mitigations_off()) + enable_barrier_nospec(enable); + } + +@@ -115,7 +115,7 @@ static int __init handle_nospectre_v2(char *p) + early_param("nospectre_v2", handle_nospectre_v2); + void setup_spectre_v2(void) + { +- if (no_spectrev2) ++ if (no_spectrev2 || cpu_mitigations_off()) + do_btb_flush_fixups(); + else + btb_flush_enabled = true; +@@ -299,7 +299,7 @@ void setup_stf_barrier(void) + + stf_enabled_flush_types = type; + +- if (!no_stf_barrier) ++ if (!no_stf_barrier && !cpu_mitigations_off()) + stf_barrier_enable(enable); + } + +diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c +index 0618aa61b26a..41b3b2787f23 100644 +--- a/arch/powerpc/kernel/setup_64.c ++++ b/arch/powerpc/kernel/setup_64.c +@@ -872,7 +872,7 @@ void setup_rfi_flush(enum l1d_flush_type types, bool enable) + + enabled_flush_types = types; + +- if (!no_rfi_flush) ++ if (!no_rfi_flush && !cpu_mitigations_off()) + rfi_flush_enable(enable); + } + +diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c +index d5eed651b5ab..83e597688562 100644 +--- a/arch/s390/kernel/nospec-branch.c ++++ b/arch/s390/kernel/nospec-branch.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + #include <linux/module.h> + #include <linux/device.h> ++#include <linux/cpu.h> + #include <asm/facility.h> + #include <asm/nospec-branch.h> + +@@ -55,8 +56,16 @@ static int __init nospectre_v2_setup_early(char *str) + } + early_param("nospectre_v2", nospectre_v2_setup_early); + ++ + void __init nospec_auto_detect(void) + { ++ if (cpu_mitigations_off()) { ++ /* ++ * Disable expolines and disable nobp. 
++ */ ++ if (IS_ENABLED(CC_USING_EXPOLINE)) ++ nospec_disable = 1; ++ __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + if (IS_ENABLED(CC_USING_EXPOLINE)) { + /* + * The kernel has been compiled with expolines. +diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c +index 60e21ccfb6d6..389800344f69 100644 +--- a/arch/x86/entry/common.c ++++ b/arch/x86/entry/common.c +@@ -31,6 +31,7 @@ + #include <asm/vdso.h> + #include <linux/uaccess.h> + #include <asm/cpufeature.h> ++#include <asm/nospec-branch.h> + + #define CREATE_TRACE_POINTS + #include <trace/events/syscalls.h> +@@ -213,6 +214,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) + #endif + + user_enter_irqoff(); ++ ++ mds_user_clear_cpu_buffers(); + } + + #define SYSCALL_EXIT_WORK_FLAGS \ +diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c +index 82ddee4ab25f..0b93f5519dda 100644 +--- a/arch/x86/events/intel/core.c ++++ b/arch/x86/events/intel/core.c +@@ -4111,11 +4111,11 @@ __init int intel_pmu_init(void) + name = "nehalem"; + break; + +- case INTEL_FAM6_ATOM_PINEVIEW: +- case INTEL_FAM6_ATOM_LINCROFT: +- case INTEL_FAM6_ATOM_PENWELL: +- case INTEL_FAM6_ATOM_CLOVERVIEW: +- case INTEL_FAM6_ATOM_CEDARVIEW: ++ case INTEL_FAM6_ATOM_BONNELL: ++ case INTEL_FAM6_ATOM_BONNELL_MID: ++ case INTEL_FAM6_ATOM_SALTWELL: ++ case INTEL_FAM6_ATOM_SALTWELL_MID: ++ case INTEL_FAM6_ATOM_SALTWELL_TABLET: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + +@@ -4128,9 +4128,11 @@ __init int intel_pmu_init(void) + name = "bonnell"; + break; + +- case INTEL_FAM6_ATOM_SILVERMONT1: +- case INTEL_FAM6_ATOM_SILVERMONT2: ++ case INTEL_FAM6_ATOM_SILVERMONT: ++ case INTEL_FAM6_ATOM_SILVERMONT_X: ++ case INTEL_FAM6_ATOM_SILVERMONT_MID: + case INTEL_FAM6_ATOM_AIRMONT: ++ case INTEL_FAM6_ATOM_AIRMONT_MID: + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, +@@ -4149,7 +4151,7 @@ __init int intel_pmu_init(void) + break; + + case INTEL_FAM6_ATOM_GOLDMONT: +- case INTEL_FAM6_ATOM_DENVERTON: ++ case INTEL_FAM6_ATOM_GOLDMONT_X: + memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, +@@ -4175,7 +4177,7 @@ __init int intel_pmu_init(void) + name = "goldmont"; + break; + +- case INTEL_FAM6_ATOM_GEMINI_LAKE: ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, +diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c +index 357e82dc0e2a..59521c71c98a 100644 +--- a/arch/x86/events/intel/cstate.c ++++ b/arch/x86/events/intel/cstate.c +@@ -543,8 +543,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { + + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), + +- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates), +- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates), ++ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), ++ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), +@@ -563,9 +563,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { + X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), +- 
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates), ++ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), + +- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates), ++ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), + { }, + }; + MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); +diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c +index 005908ee9333..d36a5fac6a18 100644 +--- a/arch/x86/events/intel/rapl.c ++++ b/arch/x86/events/intel/rapl.c +@@ -775,9 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), +- X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), ++ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), + +- X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init), ++ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), + {}, + }; + +diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c +index 81dd57280441..5eb0669d0795 100644 +--- a/arch/x86/events/msr.c ++++ b/arch/x86/events/msr.c +@@ -62,14 +62,14 @@ static bool test_intel(int idx) + case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL_X: + +- case INTEL_FAM6_ATOM_SILVERMONT1: +- case INTEL_FAM6_ATOM_SILVERMONT2: ++ case INTEL_FAM6_ATOM_SILVERMONT: ++ case INTEL_FAM6_ATOM_SILVERMONT_X: + case INTEL_FAM6_ATOM_AIRMONT: + + case INTEL_FAM6_ATOM_GOLDMONT: +- case INTEL_FAM6_ATOM_DENVERTON: ++ case INTEL_FAM6_ATOM_GOLDMONT_X: + +- case INTEL_FAM6_ATOM_GEMINI_LAKE: ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index e90940ecb436..48ef9ed8226d 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -340,6 +340,7 @@ + #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ + #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ ++#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ + #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ + #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ + #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ +@@ -377,5 +378,7 @@ + #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ + #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ + #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ ++#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ ++#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h +index 35a6bc4da8ad..038e4b63b56b 100644 +--- a/arch/x86/include/asm/intel-family.h ++++ b/arch/x86/include/asm/intel-family.h +@@ -51,19 +51,23 @@ + + /* "Small Core" Processors (Atom) */ + +-#define INTEL_FAM6_ATOM_PINEVIEW 0x1C +-#define INTEL_FAM6_ATOM_LINCROFT 0x26 +-#define INTEL_FAM6_ATOM_PENWELL 0x27 +-#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 +-#define 
INTEL_FAM6_ATOM_CEDARVIEW 0x36 +-#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ +-#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ +-#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ +-#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ +-#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ +-#define INTEL_FAM6_ATOM_GOLDMONT 0x5C +-#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ +-#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A ++#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ ++#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ ++ ++#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ ++#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ ++#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ ++ ++#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ ++#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */ ++#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ ++ ++#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ ++#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ ++ ++#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ ++#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ ++#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ + + /* Xeon Phi */ + +diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h +index 15450a675031..c99c66b41e53 100644 +--- a/arch/x86/include/asm/irqflags.h ++++ b/arch/x86/include/asm/irqflags.h +@@ -6,6 +6,8 @@ + + #ifndef __ASSEMBLY__ + ++#include <asm/nospec-branch.h> ++ + /* Provide __cpuidle; we can't safely include <linux/cpu.h> */ + #define __cpuidle __attribute__((__section__(".cpuidle.text"))) + +@@ -54,11 +56,13 @@ static inline void native_irq_enable(void) + + static inline __cpuidle void native_safe_halt(void) + { ++ mds_idle_clear_cpu_buffers(); + asm volatile("sti; hlt": : :"memory"); + } + + static inline __cpuidle void native_halt(void) + { ++ mds_idle_clear_cpu_buffers(); + asm volatile("hlt": : :"memory"); + } + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index b0df002c60df..7f1c8448d595 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -2,6 +2,8 @@ + #ifndef _ASM_X86_MSR_INDEX_H + #define _ASM_X86_MSR_INDEX_H + ++#include <linux/bits.h> ++ + /* + * CPU model specific register (MSR) numbers. + * +@@ -40,14 +42,14 @@ + /* Intel MSRs. 
Some also available on other CPUs */ + + #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ +-#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ ++#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ + #define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ +-#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ ++#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ + #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ +-#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ ++#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ + + #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ +-#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ ++#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ + + #define MSR_PPIN_CTL 0x0000004e + #define MSR_PPIN 0x0000004f +@@ -69,20 +71,25 @@ + #define MSR_MTRRcap 0x000000fe + + #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a +-#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ +-#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ +-#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ +-#define ARCH_CAP_SSB_NO (1 << 4) /* +- * Not susceptible to Speculative Store Bypass +- * attack, so no Speculative Store Bypass +- * control required. +- */ ++#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ ++#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ ++#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ ++#define ARCH_CAP_SSB_NO BIT(4) /* ++ * Not susceptible to Speculative Store Bypass ++ * attack, so no Speculative Store Bypass ++ * control required. ++ */ ++#define ARCH_CAP_MDS_NO BIT(5) /* ++ * Not susceptible to ++ * Microarchitectural Data ++ * Sampling (MDS) vulnerabilities. ++ */ + + #define MSR_IA32_FLUSH_CMD 0x0000010b +-#define L1D_FLUSH (1 << 0) /* +- * Writeback and invalidate the +- * L1 data cache. +- */ ++#define L1D_FLUSH BIT(0) /* ++ * Writeback and invalidate the ++ * L1 data cache. 
++ */ + + #define MSR_IA32_BBL_CR_CTL 0x00000119 + #define MSR_IA32_BBL_CR_CTL3 0x0000011e +diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h +index 39a2fb29378a..eb0f80ce8524 100644 +--- a/arch/x86/include/asm/mwait.h ++++ b/arch/x86/include/asm/mwait.h +@@ -6,6 +6,7 @@ + #include <linux/sched/idle.h> + + #include <asm/cpufeature.h> ++#include <asm/nospec-branch.h> + + #define MWAIT_SUBSTATE_MASK 0xf + #define MWAIT_CSTATE_MASK 0xf +@@ -40,6 +41,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx, + + static inline void __mwait(unsigned long eax, unsigned long ecx) + { ++ mds_idle_clear_cpu_buffers(); ++ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +@@ -74,6 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) + static inline void __mwaitx(unsigned long eax, unsigned long ebx, + unsigned long ecx) + { ++ /* No MDS buffer clear as this is AMD/HYGON only */ ++ + /* "mwaitx %eax, %ebx, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfb;" + :: "a" (eax), "b" (ebx), "c" (ecx)); +@@ -81,6 +86,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, + + static inline void __sti_mwait(unsigned long eax, unsigned long ecx) + { ++ mds_idle_clear_cpu_buffers(); ++ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index a633767419f2..f1ddf3a1f307 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -317,6 +317,56 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); + DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); + DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); + ++DECLARE_STATIC_KEY_FALSE(mds_user_clear); ++DECLARE_STATIC_KEY_FALSE(mds_idle_clear); ++ ++#include <asm/segment.h> ++ ++/** ++ * mds_clear_cpu_buffers - Mitigation for MDS vulnerability ++ * ++ * This uses the otherwise unused and obsolete VERW instruction in ++ * combination with microcode which triggers a CPU buffer flush when the ++ * instruction is executed. ++ */ ++static inline void mds_clear_cpu_buffers(void) ++{ ++ static const u16 ds = __KERNEL_DS; ++ ++ /* ++ * Has to be the memory-operand variant because only that ++ * guarantees the CPU buffer flush functionality according to ++ * documentation. The register-operand variant does not. ++ * Works with any segment selector, but a valid writable ++ * data segment is the fastest variant. ++ * ++ * "cc" clobber is required because VERW modifies ZF. 
++ */ ++ asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); ++} ++ ++/** ++ * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability ++ * ++ * Clear CPU buffers if the corresponding static key is enabled ++ */ ++static inline void mds_user_clear_cpu_buffers(void) ++{ ++ if (static_branch_likely(&mds_user_clear)) ++ mds_clear_cpu_buffers(); ++} ++ ++/** ++ * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability ++ * ++ * Clear CPU buffers if the corresponding static key is enabled ++ */ ++static inline void mds_idle_clear_cpu_buffers(void) ++{ ++ if (static_branch_likely(&mds_idle_clear)) ++ mds_clear_cpu_buffers(); ++} ++ + #endif /* __ASSEMBLY__ */ + + /* +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index b12c8d70dd33..d55a0adbcf27 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -988,4 +988,10 @@ enum l1tf_mitigations { + + extern enum l1tf_mitigations l1tf_mitigation; + ++enum mds_mitigations { ++ MDS_MITIGATION_OFF, ++ MDS_MITIGATION_FULL, ++ MDS_MITIGATION_VMWERV, ++}; ++ + #endif /* _ASM_X86_PROCESSOR_H */ +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 5567705e0601..2769e0f5c686 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -26,6 +26,7 @@ + #include <asm/vmx.h> + #include <asm/paravirt.h> + #include <asm/alternative.h> ++#include <asm/hypervisor.h> + #include <asm/pgtable.h> + #include <asm/set_memory.h> + #include <asm/intel-family.h> +@@ -34,6 +35,7 @@ + static void __init spectre_v2_select_mitigation(void); + static void __init ssb_select_mitigation(void); + static void __init l1tf_select_mitigation(void); ++static void __init mds_select_mitigation(void); + + /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ + u64 x86_spec_ctrl_base; +@@ -60,6 +62,13 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); + /* Control unconditional IBPB in switch_mm() */ + DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); + ++/* Control MDS CPU buffer clear before returning to user space */ ++DEFINE_STATIC_KEY_FALSE(mds_user_clear); ++EXPORT_SYMBOL_GPL(mds_user_clear); ++/* Control MDS CPU buffer clear before idling (halt, mwait) */ ++DEFINE_STATIC_KEY_FALSE(mds_idle_clear); ++EXPORT_SYMBOL_GPL(mds_idle_clear); ++ + void __init check_bugs(void) + { + identify_boot_cpu(); +@@ -98,6 +107,10 @@ void __init check_bugs(void) + + l1tf_select_mitigation(); + ++ mds_select_mitigation(); ++ ++ arch_smt_update(); ++ + #ifdef CONFIG_X86_32 + /* + * Check whether we are able to run this kernel safely on SMP. 
+@@ -203,6 +216,61 @@ static void x86_amd_ssb_disable(void) + wrmsrl(MSR_AMD64_LS_CFG, msrval); + } + ++#undef pr_fmt ++#define pr_fmt(fmt) "MDS: " fmt ++ ++/* Default mitigation for MDS-affected CPUs */ ++static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; ++static bool mds_nosmt __ro_after_init = false; ++ ++static const char * const mds_strings[] = { ++ [MDS_MITIGATION_OFF] = "Vulnerable", ++ [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", ++ [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", ++}; ++ ++static void __init mds_select_mitigation(void) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { ++ mds_mitigation = MDS_MITIGATION_OFF; ++ return; ++ } ++ ++ if (mds_mitigation == MDS_MITIGATION_FULL) { ++ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) ++ mds_mitigation = MDS_MITIGATION_VMWERV; ++ ++ static_branch_enable(&mds_user_clear); ++ ++ if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && ++ (mds_nosmt || cpu_mitigations_auto_nosmt())) ++ cpu_smt_disable(false); ++ } ++ ++ pr_info("%s\n", mds_strings[mds_mitigation]); ++} ++ ++static int __init mds_cmdline(char *str) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_MDS)) ++ return 0; ++ ++ if (!str) ++ return -EINVAL; ++ ++ if (!strcmp(str, "off")) ++ mds_mitigation = MDS_MITIGATION_OFF; ++ else if (!strcmp(str, "full")) ++ mds_mitigation = MDS_MITIGATION_FULL; ++ else if (!strcmp(str, "full,nosmt")) { ++ mds_mitigation = MDS_MITIGATION_FULL; ++ mds_nosmt = true; ++ } ++ ++ return 0; ++} ++early_param("mds", mds_cmdline); ++ + #undef pr_fmt + #define pr_fmt(fmt) "Spectre V2 : " fmt + +@@ -427,7 +495,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) + char arg[20]; + int ret, i; + +- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) ++ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || ++ cpu_mitigations_off()) + return SPECTRE_V2_CMD_NONE; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); +@@ -559,9 +628,6 @@ specv2_set_mode: + + /* Set up IBPB and STIBP depending on the general spectre V2 command */ + spectre_v2_user_select_mitigation(cmd); +- +- /* Enable STIBP if appropriate */ +- arch_smt_update(); + } + + static void update_stibp_msr(void * __unused) +@@ -595,6 +661,31 @@ static void update_indir_branch_cond(void) + static_branch_disable(&switch_to_cond_stibp); + } + ++#undef pr_fmt ++#define pr_fmt(fmt) fmt ++ ++/* Update the static key controlling the MDS CPU buffer clear in idle */ ++static void update_mds_branch_idle(void) ++{ ++ /* ++ * Enable the idle clearing if SMT is active on CPUs which are ++ * affected only by MSBDS and not any other MDS variant. ++ * ++ * The other variants cannot be mitigated when SMT is enabled, so ++ * clearing the buffers on idle just to prevent the Store Buffer ++ * repartitioning leak would be a window dressing exercise. ++ */ ++ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) ++ return; ++ ++ if (sched_smt_active()) ++ static_branch_enable(&mds_idle_clear); ++ else ++ static_branch_disable(&mds_idle_clear); ++} ++ ++#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" ++ + void arch_smt_update(void) + { + /* Enhanced IBRS implies STIBP. No update required. 
*/ +@@ -615,6 +706,17 @@ void arch_smt_update(void) + break; + } + ++ switch (mds_mitigation) { ++ case MDS_MITIGATION_FULL: ++ case MDS_MITIGATION_VMWERV: ++ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) ++ pr_warn_once(MDS_MSG_SMT); ++ update_mds_branch_idle(); ++ break; ++ case MDS_MITIGATION_OFF: ++ break; ++ } ++ + mutex_unlock(&spec_ctrl_mutex); + } + +@@ -656,7 +758,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) + char arg[20]; + int ret, i; + +- if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { ++ if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || ++ cpu_mitigations_off()) { + return SPEC_STORE_BYPASS_CMD_NONE; + } else { + ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", +@@ -977,6 +1080,11 @@ static void __init l1tf_select_mitigation(void) + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; + ++ if (cpu_mitigations_off()) ++ l1tf_mitigation = L1TF_MITIGATION_OFF; ++ else if (cpu_mitigations_auto_nosmt()) ++ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; ++ + override_cache_bits(&boot_cpu_data); + + switch (l1tf_mitigation) { +@@ -1005,7 +1113,7 @@ static void __init l1tf_select_mitigation(void) + pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", + half_pa); + pr_info("However, doing so will make a part of your RAM unusable.\n"); +- pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n"); ++ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); + return; + } + +@@ -1038,6 +1146,7 @@ static int __init l1tf_cmdline(char *str) + early_param("l1tf", l1tf_cmdline); + + #undef pr_fmt ++#define pr_fmt(fmt) fmt + + #ifdef CONFIG_SYSFS + +@@ -1076,6 +1185,23 @@ static ssize_t l1tf_show_state(char *buf) + } + #endif + ++static ssize_t mds_show_state(char *buf) ++{ ++ if (!hypervisor_is_type(X86_HYPER_NATIVE)) { ++ return sprintf(buf, "%s; SMT Host state unknown\n", ++ mds_strings[mds_mitigation]); ++ } ++ ++ if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { ++ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], ++ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : ++ sched_smt_active() ? "mitigated" : "disabled")); ++ } ++ ++ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], ++ sched_smt_active() ? 
"vulnerable" : "disabled"); ++} ++ + static char *stibp_state(void) + { + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) +@@ -1137,6 +1263,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr + if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) + return l1tf_show_state(buf); + break; ++ ++ case X86_BUG_MDS: ++ return mds_show_state(buf); ++ + default: + break; + } +@@ -1168,4 +1298,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b + { + return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); + } ++ ++ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return cpu_show_common(dev, attr, buf, X86_BUG_MDS); ++} + #endif +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 51e49f6fe8e1..ebe547b1ffce 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -899,85 +899,95 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) + c->x86_cache_bits = c->x86_phys_bits; + } + +-static const __initconst struct x86_cpu_id cpu_no_speculation[] = { +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, +- { X86_VENDOR_CENTAUR, 5 }, +- { X86_VENDOR_INTEL, 5 }, +- { X86_VENDOR_NSC, 5 }, +- { X86_VENDOR_ANY, 4 }, ++#define NO_SPECULATION BIT(0) ++#define NO_MELTDOWN BIT(1) ++#define NO_SSB BIT(2) ++#define NO_L1TF BIT(3) ++#define NO_MDS BIT(4) ++#define MSBDS_ONLY BIT(5) ++ ++#define VULNWL(_vendor, _family, _model, _whitelist) \ ++ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } ++ ++#define VULNWL_INTEL(model, whitelist) \ ++ VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) ++ ++#define VULNWL_AMD(family, whitelist) \ ++ VULNWL(AMD, family, X86_MODEL_ANY, whitelist) ++ ++static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { ++ VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION), ++ VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), ++ VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), ++ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), ++ ++ /* Intel Family 6 */ ++ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), ++ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), ++ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), ++ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), ++ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), ++ ++ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), ++ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY), ++ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY), ++ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), ++ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY), ++ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY), ++ ++ VULNWL_INTEL(CORE_YONAH, NO_SSB), ++ ++ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY), ++ ++ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF), ++ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF), ++ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF), ++ ++ /* AMD Family 0xf - 0x12 */ ++ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), ++ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), ++ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), ++ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | 
NO_MDS), ++ ++ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ ++ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), + {} + }; + +-static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { +- { X86_VENDOR_AMD }, +- {} +-}; +- +-static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, +- { X86_VENDOR_CENTAUR, 5, }, +- { X86_VENDOR_INTEL, 5, }, +- { X86_VENDOR_NSC, 5, }, +- { X86_VENDOR_AMD, 0x12, }, +- { X86_VENDOR_AMD, 0x11, }, +- { X86_VENDOR_AMD, 0x10, }, +- { X86_VENDOR_AMD, 0xf, }, +- { X86_VENDOR_ANY, 4, }, +- {} +-}; ++static bool __init cpu_matches(unsigned long which) ++{ ++ const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + +-static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { +- /* in addition to cpu_no_speculation */ +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, +- {} +-}; ++ return m && !!(m->driver_data & which); ++} + + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) + { + u64 ia32_cap = 0; + ++ if (cpu_matches(NO_SPECULATION)) ++ return; ++ ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V1); ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V2); ++ + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + +- if (!x86_match_cpu(cpu_no_spec_store_bypass) && +- !(ia32_cap & ARCH_CAP_SSB_NO) && ++ if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + +- if (x86_match_cpu(cpu_no_speculation)) +- return; +- +- setup_force_cpu_bug(X86_BUG_SPECTRE_V1); +- setup_force_cpu_bug(X86_BUG_SPECTRE_V2); +- + if (ia32_cap & ARCH_CAP_IBRS_ALL) + setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + +- if (x86_match_cpu(cpu_no_meltdown)) ++ if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { ++ setup_force_cpu_bug(X86_BUG_MDS); ++ if (cpu_matches(MSBDS_ONLY)) ++ setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); ++ } ++ ++ if (cpu_matches(NO_MELTDOWN)) + return; + + /* Rogue Data Cache Load? No! 
*/ +@@ -986,7 +996,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) + + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + +- if (x86_match_cpu(cpu_no_l1tf)) ++ if (cpu_matches(NO_L1TF)) + return; + + setup_force_cpu_bug(X86_BUG_L1TF); +diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c +index 35aafc95e4b8..d796a7f6a74a 100644 +--- a/arch/x86/kernel/nmi.c ++++ b/arch/x86/kernel/nmi.c +@@ -34,6 +34,7 @@ + #include <asm/x86_init.h> + #include <asm/reboot.h> + #include <asm/cache.h> ++#include <asm/nospec-branch.h> + + #define CREATE_TRACE_POINTS + #include <trace/events/nmi.h> +@@ -533,6 +534,9 @@ nmi_restart: + write_cr2(this_cpu_read(nmi_cr2)); + if (this_cpu_dec_return(nmi_state)) + goto nmi_restart; ++ ++ if (user_mode(regs)) ++ mds_user_clear_cpu_buffers(); + } + NOKPROBE_SYMBOL(do_nmi); + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index ed8d78fd4f8c..aa0022a3faf5 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -58,6 +58,7 @@ + #include <asm/alternative.h> + #include <asm/fpu/xstate.h> + #include <asm/trace/mpx.h> ++#include <asm/nospec-branch.h> + #include <asm/mpx.h> + #include <asm/vm86.h> + +@@ -385,6 +386,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + regs->ip = (unsigned long)general_protection; + regs->sp = (unsigned long)&gpregs->orig_ax; + ++ /* ++ * This situation can be triggered by userspace via ++ * modify_ldt(2) and the return does not take the regular ++ * user space exit, so a CPU buffer clear is required when ++ * MDS mitigation is enabled. ++ */ ++ mds_user_clear_cpu_buffers(); + return; + } + #endif +diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c +index 36d02484e384..5d681fe6d352 100644 +--- a/arch/x86/kernel/tsc.c ++++ b/arch/x86/kernel/tsc.c +@@ -620,7 +620,7 @@ unsigned long native_calibrate_tsc(void) + case INTEL_FAM6_KABYLAKE_DESKTOP: + crystal_khz = 24000; /* 24.0 MHz */ + break; +- case INTEL_FAM6_ATOM_DENVERTON: ++ case INTEL_FAM6_ATOM_GOLDMONT_X: + crystal_khz = 25000; /* 25.0 MHz */ + break; + case INTEL_FAM6_ATOM_GOLDMONT: +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index bbcd69c76d96..5c82b4bc4a68 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -368,7 +368,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + /* cpuid 0x80000008.ebx */ + const u32 kvm_cpuid_8000_0008_ebx_x86_features = + F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | +- F(AMD_SSB_NO); ++ F(AMD_SSB_NO) | F(AMD_STIBP); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_cpuid_C000_0001_edx_x86_features = +@@ -396,7 +396,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | +- F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); ++ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | ++ F(MD_CLEAR); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 90b7eee6d0f9..9b2486e8ec00 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -9766,8 +9766,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + + vmx->__launched = vmx->loaded_vmcs->launched; + ++ /* L1D Flush includes CPU buffer clear to mitigate MDS */ + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); ++ else if (static_branch_unlikely(&mds_user_clear)) ++ mds_clear_cpu_buffers(); + 
+ asm( + /* Store host registers */ +@@ -10121,8 +10124,8 @@ free_vcpu: + return ERR_PTR(err); + } + +-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" ++#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" ++#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" + + static int vmx_vm_init(struct kvm *kvm) + { +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index 60c48f5d6b0e..33c6ee9aebbd 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -35,6 +35,7 @@ + #include <linux/spinlock.h> + #include <linux/mm.h> + #include <linux/uaccess.h> ++#include <linux/cpu.h> + + #include <asm/cpufeature.h> + #include <asm/hypervisor.h> +@@ -91,7 +92,8 @@ void __init pti_check_boottime_disable(void) + goto autosel; + } + +- if (cmdline_find_option_bool(boot_command_line, "nopti")) { ++ if (cmdline_find_option_bool(boot_command_line, "nopti") || ++ cpu_mitigations_off()) { + pti_print_if_insecure("disabled on command line."); + return; + } +diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c +index d49d3be81953..ecb5866aaf84 100644 +--- a/arch/x86/platform/atom/punit_atom_debug.c ++++ b/arch/x86/platform/atom/punit_atom_debug.c +@@ -154,8 +154,8 @@ static void punit_dbgfs_unregister(void) + (kernel_ulong_t)&drv_data } + + static const struct x86_cpu_id intel_punit_cpu_ids[] = { +- ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt), +- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng), + ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), + {} + }; +diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c +index 5a0483e7bf66..31dce781364c 100644 +--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c ++++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c +@@ -68,7 +68,7 @@ static struct bt_sfi_data tng_bt_sfi_data __initdata = { + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata } + + static const struct x86_cpu_id bt_sfi_cpu_ids[] = { +- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, tng_bt_sfi_data), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, tng_bt_sfi_data), + {} + }; + +diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c +index a56d3f352765..51592dd45b06 100644 +--- a/drivers/acpi/acpi_lpss.c ++++ b/drivers/acpi/acpi_lpss.c +@@ -291,7 +291,7 @@ static const struct lpss_device_desc bsw_spi_dev_desc = { + #define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, } + + static const struct x86_cpu_id lpss_cpu_ids[] = { +- ICPU(INTEL_FAM6_ATOM_SILVERMONT1), /* Valleyview, Bay Trail */ ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT), /* Valleyview, Bay Trail */ + ICPU(INTEL_FAM6_ATOM_AIRMONT), /* Braswell, Cherry Trail */ + {} + }; +diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c +index b4fbb9929482..96b93f4ff06c 100644 +--- 
a/drivers/acpi/x86/utils.c ++++ b/drivers/acpi/x86/utils.c +@@ -54,7 +54,7 @@ static const struct always_present_id always_present_ids[] = { + * Bay / Cherry Trail PWM directly poked by GPU driver in win10, + * but Linux uses a separate PWM driver, harmless if not used. + */ +- ENTRY("80860F09", "1", ICPU(INTEL_FAM6_ATOM_SILVERMONT1), {}), ++ ENTRY("80860F09", "1", ICPU(INTEL_FAM6_ATOM_SILVERMONT), {}), + ENTRY("80862288", "1", ICPU(INTEL_FAM6_ATOM_AIRMONT), {}), + /* + * The INT0002 device is necessary to clear wakeup interrupt sources +diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c +index 93758b528d8f..32b52e6bd13b 100644 +--- a/drivers/base/cpu.c ++++ b/drivers/base/cpu.c +@@ -533,11 +533,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev, + return sprintf(buf, "Not affected\n"); + } + ++ssize_t __weak cpu_show_mds(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); + static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); + static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); + static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); + static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); ++static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); + + static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_meltdown.attr, +@@ -545,6 +552,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_spectre_v2.attr, + &dev_attr_spec_store_bypass.attr, + &dev_attr_l1tf.attr, ++ &dev_attr_mds.attr, + NULL + }; + +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index 5ebefa17d195..7a5662425b29 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -1625,7 +1625,7 @@ static const struct pstate_funcs bxt_funcs = { + static const struct x86_cpu_id intel_pstate_cpu_ids[] = { + ICPU(INTEL_FAM6_SANDYBRIDGE, core_funcs), + ICPU(INTEL_FAM6_SANDYBRIDGE_X, core_funcs), +- ICPU(INTEL_FAM6_ATOM_SILVERMONT1, silvermont_funcs), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT, silvermont_funcs), + ICPU(INTEL_FAM6_IVYBRIDGE, core_funcs), + ICPU(INTEL_FAM6_HASWELL_CORE, core_funcs), + ICPU(INTEL_FAM6_BROADWELL_CORE, core_funcs), +@@ -1642,7 +1642,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = { + ICPU(INTEL_FAM6_XEON_PHI_KNL, knl_funcs), + ICPU(INTEL_FAM6_XEON_PHI_KNM, knl_funcs), + ICPU(INTEL_FAM6_ATOM_GOLDMONT, bxt_funcs), +- ICPU(INTEL_FAM6_ATOM_GEMINI_LAKE, bxt_funcs), ++ ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS, bxt_funcs), + {} + }; + MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); +diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c +index 4395c84cdcbf..7f9bb9d9fcdc 100644 +--- a/drivers/edac/pnd2_edac.c ++++ b/drivers/edac/pnd2_edac.c +@@ -1539,7 +1539,7 @@ static struct dunit_ops dnv_ops = { + + static const struct x86_cpu_id pnd2_cpuids[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT, 0, (kernel_ulong_t)&apl_ops }, +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON, 0, (kernel_ulong_t)&dnv_ops }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X, 0, (kernel_ulong_t)&dnv_ops }, + { } + }; + MODULE_DEVICE_TABLE(x86cpu, pnd2_cpuids); +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 16249b0953ff..31f54a334b58 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -1070,14 +1070,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + 
ICPU(INTEL_FAM6_WESTMERE, idle_cpu_nehalem), + ICPU(INTEL_FAM6_WESTMERE_EP, idle_cpu_nehalem), + ICPU(INTEL_FAM6_NEHALEM_EX, idle_cpu_nehalem), +- ICPU(INTEL_FAM6_ATOM_PINEVIEW, idle_cpu_atom), +- ICPU(INTEL_FAM6_ATOM_LINCROFT, idle_cpu_lincroft), ++ ICPU(INTEL_FAM6_ATOM_BONNELL, idle_cpu_atom), ++ ICPU(INTEL_FAM6_ATOM_BONNELL_MID, idle_cpu_lincroft), + ICPU(INTEL_FAM6_WESTMERE_EX, idle_cpu_nehalem), + ICPU(INTEL_FAM6_SANDYBRIDGE, idle_cpu_snb), + ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), +- ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), +- ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), +- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), ++ ICPU(INTEL_FAM6_ATOM_SALTWELL, idle_cpu_atom), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT, idle_cpu_byt), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, idle_cpu_tangier), + ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), + ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), + ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), +@@ -1085,7 +1085,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + ICPU(INTEL_FAM6_HASWELL_X, idle_cpu_hsw), + ICPU(INTEL_FAM6_HASWELL_ULT, idle_cpu_hsw), + ICPU(INTEL_FAM6_HASWELL_GT3E, idle_cpu_hsw), +- ICPU(INTEL_FAM6_ATOM_SILVERMONT2, idle_cpu_avn), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT_X, idle_cpu_avn), + ICPU(INTEL_FAM6_BROADWELL_CORE, idle_cpu_bdw), + ICPU(INTEL_FAM6_BROADWELL_GT3E, idle_cpu_bdw), + ICPU(INTEL_FAM6_BROADWELL_X, idle_cpu_bdw), +@@ -1098,8 +1098,8 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), + ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl), + ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), +- ICPU(INTEL_FAM6_ATOM_GEMINI_LAKE, idle_cpu_bxt), +- ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), ++ ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS, idle_cpu_bxt), ++ ICPU(INTEL_FAM6_ATOM_GOLDMONT_X, idle_cpu_dnv), + {} + }; + +@@ -1316,7 +1316,7 @@ static void intel_idle_state_table_update(void) + ivt_idle_state_table_update(); + break; + case INTEL_FAM6_ATOM_GOLDMONT: +- case INTEL_FAM6_ATOM_GEMINI_LAKE: ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + bxt_idle_state_table_update(); + break; + case INTEL_FAM6_SKYLAKE_DESKTOP: +diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c +index 08ae0ff13513..48cf430b84bf 100644 +--- a/drivers/mmc/host/sdhci-acpi.c ++++ b/drivers/mmc/host/sdhci-acpi.c +@@ -128,7 +128,7 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = { + static bool sdhci_acpi_byt(void) + { + static const struct x86_cpu_id byt[] = { +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, + {} + }; + +diff --git a/drivers/pci/pci-mid.c b/drivers/pci/pci-mid.c +index a4ac940c7696..65c85f219bc3 100644 +--- a/drivers/pci/pci-mid.c ++++ b/drivers/pci/pci-mid.c +@@ -65,8 +65,8 @@ static const struct pci_platform_pm_ops mid_pci_platform_pm = { + * arch/x86/platform/intel-mid/pwr.c. 
+ */ + static const struct x86_cpu_id lpss_cpu_ids[] = { +- ICPU(INTEL_FAM6_ATOM_PENWELL), +- ICPU(INTEL_FAM6_ATOM_MERRIFIELD), ++ ICPU(INTEL_FAM6_ATOM_SALTWELL_MID), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID), + {} + }; + +diff --git a/drivers/platform/x86/intel_int0002_vgpio.c b/drivers/platform/x86/intel_int0002_vgpio.c +index 92dc230ef5b2..08107731afe0 100644 +--- a/drivers/platform/x86/intel_int0002_vgpio.c ++++ b/drivers/platform/x86/intel_int0002_vgpio.c +@@ -60,7 +60,7 @@ static const struct x86_cpu_id int0002_cpu_ids[] = { + /* + * Limit ourselves to Cherry Trail for now, until testing shows we + * need to handle the INT0002 device on Baytrail too. +- * ICPU(INTEL_FAM6_ATOM_SILVERMONT1), * Valleyview, Bay Trail * ++ * ICPU(INTEL_FAM6_ATOM_SILVERMONT), * Valleyview, Bay Trail * + */ + ICPU(INTEL_FAM6_ATOM_AIRMONT), /* Braswell, Cherry Trail */ + {} +diff --git a/drivers/platform/x86/intel_mid_powerbtn.c b/drivers/platform/x86/intel_mid_powerbtn.c +index d79fbf924b13..5ad44204a9c3 100644 +--- a/drivers/platform/x86/intel_mid_powerbtn.c ++++ b/drivers/platform/x86/intel_mid_powerbtn.c +@@ -125,8 +125,8 @@ static const struct mid_pb_ddata mrfld_ddata = { + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata } + + static const struct x86_cpu_id mid_pb_cpu_ids[] = { +- ICPU(INTEL_FAM6_ATOM_PENWELL, mfld_ddata), +- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, mrfld_ddata), ++ ICPU(INTEL_FAM6_ATOM_SALTWELL_MID, mfld_ddata), ++ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, mrfld_ddata), + {} + }; + +diff --git a/drivers/platform/x86/intel_telemetry_debugfs.c b/drivers/platform/x86/intel_telemetry_debugfs.c +index 401bdc7a9d94..995d158fb8d2 100644 +--- a/drivers/platform/x86/intel_telemetry_debugfs.c ++++ b/drivers/platform/x86/intel_telemetry_debugfs.c +@@ -331,7 +331,7 @@ static struct telemetry_debugfs_conf telem_apl_debugfs_conf = { + + static const struct x86_cpu_id telemetry_debugfs_cpu_ids[] = { + TELEM_DEBUGFS_CPU(INTEL_FAM6_ATOM_GOLDMONT, telem_apl_debugfs_conf), +- TELEM_DEBUGFS_CPU(INTEL_FAM6_ATOM_GEMINI_LAKE, telem_apl_debugfs_conf), ++ TELEM_DEBUGFS_CPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS, telem_apl_debugfs_conf), + {} + }; + +diff --git a/drivers/platform/x86/intel_telemetry_pltdrv.c b/drivers/platform/x86/intel_telemetry_pltdrv.c +index e0424d5a795a..a77313edbe48 100644 +--- a/drivers/platform/x86/intel_telemetry_pltdrv.c ++++ b/drivers/platform/x86/intel_telemetry_pltdrv.c +@@ -198,7 +198,7 @@ static struct telemetry_plt_config telem_glk_config = { + + static const struct x86_cpu_id telemetry_cpu_ids[] = { + TELEM_CPU(INTEL_FAM6_ATOM_GOLDMONT, telem_apl_config), +- TELEM_CPU(INTEL_FAM6_ATOM_GEMINI_LAKE, telem_glk_config), ++ TELEM_CPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS, telem_glk_config), + {} + }; + +diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c +index d1694f1def72..54ddd78924dd 100644 +--- a/drivers/powercap/intel_rapl.c ++++ b/drivers/powercap/intel_rapl.c +@@ -1161,13 +1161,13 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { + RAPL_CPU(INTEL_FAM6_KABYLAKE_MOBILE, rapl_defaults_core), + RAPL_CPU(INTEL_FAM6_KABYLAKE_DESKTOP, rapl_defaults_core), + +- RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1, rapl_defaults_byt), ++ RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT, rapl_defaults_byt), + RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT, rapl_defaults_cht), +- RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD, rapl_defaults_tng), +- RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD, rapl_defaults_ann), ++ RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT_MID,rapl_defaults_tng), ++ RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT_MID, 
rapl_defaults_ann), + RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT, rapl_defaults_core), +- RAPL_CPU(INTEL_FAM6_ATOM_GEMINI_LAKE, rapl_defaults_core), +- RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), ++ RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS, rapl_defaults_core), ++ RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT_X, rapl_defaults_core), + + RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server), + RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM, rapl_defaults_hsw_server), +diff --git a/drivers/thermal/intel_soc_dts_thermal.c b/drivers/thermal/intel_soc_dts_thermal.c +index c27868b2c6af..ce2722edd307 100644 +--- a/drivers/thermal/intel_soc_dts_thermal.c ++++ b/drivers/thermal/intel_soc_dts_thermal.c +@@ -43,7 +43,7 @@ static irqreturn_t soc_irq_thread_fn(int irq, void *dev_data) + } + + static const struct x86_cpu_id soc_thermal_ids[] = { +- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1, 0, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, 0, + BYT_SOC_DTS_APIC_IRQ}, + {} + }; +diff --git a/include/linux/bitops.h b/include/linux/bitops.h +index d03c5dd6185d..43373e41f2f4 100644 +--- a/include/linux/bitops.h ++++ b/include/linux/bitops.h +@@ -2,29 +2,9 @@ + #ifndef _LINUX_BITOPS_H + #define _LINUX_BITOPS_H + #include <asm/types.h> ++#include <linux/bits.h> + +-#ifdef __KERNEL__ +-#define BIT(nr) (1UL << (nr)) +-#define BIT_ULL(nr) (1ULL << (nr)) +-#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +-#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +-#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) +-#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) +-#define BITS_PER_BYTE 8 + #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) +-#endif +- +-/* +- * Create a contiguous bitmask starting at bit position @l and ending at +- * position @h. For example +- * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. +- */ +-#define GENMASK(h, l) \ +- (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) +- +-#define GENMASK_ULL(h, l) \ +- (((~0ULL) - (1ULL << (l)) + 1) & \ +- (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + + extern unsigned int __sw_hweight8(unsigned int w); + extern unsigned int __sw_hweight16(unsigned int w); +diff --git a/include/linux/bits.h b/include/linux/bits.h +new file mode 100644 +index 000000000000..2b7b532c1d51 +--- /dev/null ++++ b/include/linux/bits.h +@@ -0,0 +1,26 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef __LINUX_BITS_H ++#define __LINUX_BITS_H ++#include <asm/bitsperlong.h> ++ ++#define BIT(nr) (1UL << (nr)) ++#define BIT_ULL(nr) (1ULL << (nr)) ++#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) ++#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) ++#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) ++#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) ++#define BITS_PER_BYTE 8 ++ ++/* ++ * Create a contiguous bitmask starting at bit position @l and ending at ++ * position @h. For example ++ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. 
++ */ ++#define GENMASK(h, l) \ ++ (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) ++ ++#define GENMASK_ULL(h, l) \ ++ (((~0ULL) - (1ULL << (l)) + 1) & \ ++ (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) ++ ++#endif /* __LINUX_BITS_H */ +diff --git a/include/linux/cpu.h b/include/linux/cpu.h +index c7712e042aba..9573b5b0fc6f 100644 +--- a/include/linux/cpu.h ++++ b/include/linux/cpu.h +@@ -57,6 +57,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf); + extern ssize_t cpu_show_l1tf(struct device *dev, + struct device_attribute *attr, char *buf); ++extern ssize_t cpu_show_mds(struct device *dev, ++ struct device_attribute *attr, char *buf); + + extern __printf(4, 5) + struct device *cpu_device_create(struct device *parent, void *drvdata, +@@ -195,4 +197,28 @@ static inline void cpu_smt_disable(bool force) { } + static inline void cpu_smt_check_topology(void) { } + #endif + ++/* ++ * These are used for a global "mitigations=" cmdline option for toggling ++ * optional CPU mitigations. ++ */ ++enum cpu_mitigations { ++ CPU_MITIGATIONS_OFF, ++ CPU_MITIGATIONS_AUTO, ++ CPU_MITIGATIONS_AUTO_NOSMT, ++}; ++ ++extern enum cpu_mitigations cpu_mitigations; ++ ++/* mitigations=off */ ++static inline bool cpu_mitigations_off(void) ++{ ++ return cpu_mitigations == CPU_MITIGATIONS_OFF; ++} ++ ++/* mitigations=auto,nosmt */ ++static inline bool cpu_mitigations_auto_nosmt(void) ++{ ++ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; ++} ++ + #endif /* _LINUX_CPU_H_ */ +diff --git a/kernel/cpu.c b/kernel/cpu.c +index 8c350dd81581..6503ca8d59a7 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -2297,3 +2297,18 @@ void __init boot_cpu_hotplug_init(void) + #endif + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); + } ++ ++enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; ++ ++static int __init mitigations_parse_cmdline(char *arg) ++{ ++ if (!strcmp(arg, "off")) ++ cpu_mitigations = CPU_MITIGATIONS_OFF; ++ else if (!strcmp(arg, "auto")) ++ cpu_mitigations = CPU_MITIGATIONS_AUTO; ++ else if (!strcmp(arg, "auto,nosmt")) ++ cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; ++ ++ return 0; ++} ++early_param("mitigations", mitigations_parse_cmdline); +diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile +index a9bc914a8fe8..78e26f782f45 100644 +--- a/tools/power/x86/turbostat/Makefile ++++ b/tools/power/x86/turbostat/Makefile +@@ -9,7 +9,7 @@ ifeq ("$(origin O)", "command line") + endif + + turbostat : turbostat.c +-CFLAGS += -Wall ++CFLAGS += -Wall -I../../../include + CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' + CFLAGS += -DINTEL_FAMILY_HEADER='"../../../../arch/x86/include/asm/intel-family.h"' + +diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c +index d1b2348db0f9..3e5f8b3db272 100644 +--- a/tools/power/x86/turbostat/turbostat.c ++++ b/tools/power/x86/turbostat/turbostat.c +@@ -1883,7 +1883,7 @@ int has_turbo_ratio_group_limits(int family, int model) + switch (model) { + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_SKYLAKE_X: +- case INTEL_FAM6_ATOM_DENVERTON: ++ case INTEL_FAM6_ATOM_GOLDMONT_X: + return 1; + } + return 0; +@@ -2745,9 +2745,9 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) + pkg_cstate_limits = skx_pkg_cstate_limits; + has_misc_feature_control = 1; + break; +- case INTEL_FAM6_ATOM_SILVERMONT1: /* BYT */ ++ case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ + no_MSR_MISC_PWR_MGMT = 1; 
+- case INTEL_FAM6_ATOM_SILVERMONT2: /* AVN */ ++ case INTEL_FAM6_ATOM_SILVERMONT_X: /* AVN */ + pkg_cstate_limits = slv_pkg_cstate_limits; + break; + case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ +@@ -2759,8 +2759,8 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) + pkg_cstate_limits = phi_pkg_cstate_limits; + break; + case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ +- case INTEL_FAM6_ATOM_GEMINI_LAKE: +- case INTEL_FAM6_ATOM_DENVERTON: /* DNV */ ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: ++ case INTEL_FAM6_ATOM_GOLDMONT_X: /* DNV */ + pkg_cstate_limits = bxt_pkg_cstate_limits; + break; + default: +@@ -2789,9 +2789,9 @@ int has_slv_msrs(unsigned int family, unsigned int model) + return 0; + + switch (model) { +- case INTEL_FAM6_ATOM_SILVERMONT1: +- case INTEL_FAM6_ATOM_MERRIFIELD: +- case INTEL_FAM6_ATOM_MOOREFIELD: ++ case INTEL_FAM6_ATOM_SILVERMONT: ++ case INTEL_FAM6_ATOM_SILVERMONT_MID: ++ case INTEL_FAM6_ATOM_AIRMONT_MID: + return 1; + } + return 0; +@@ -2803,7 +2803,7 @@ int is_dnv(unsigned int family, unsigned int model) + return 0; + + switch (model) { +- case INTEL_FAM6_ATOM_DENVERTON: ++ case INTEL_FAM6_ATOM_GOLDMONT_X: + return 1; + } + return 0; +@@ -3319,8 +3319,8 @@ double get_tdp(unsigned int model) + return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; + + switch (model) { +- case INTEL_FAM6_ATOM_SILVERMONT1: +- case INTEL_FAM6_ATOM_SILVERMONT2: ++ case INTEL_FAM6_ATOM_SILVERMONT: ++ case INTEL_FAM6_ATOM_SILVERMONT_X: + return 30.0; + default: + return 135.0; +@@ -3386,7 +3386,7 @@ void rapl_probe(unsigned int family, unsigned int model) + } + break; + case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ +- case INTEL_FAM6_ATOM_GEMINI_LAKE: ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO; + if (rapl_joules) + BIC_PRESENT(BIC_Pkg_J); +@@ -3444,8 +3444,8 @@ void rapl_probe(unsigned int family, unsigned int model) + BIC_PRESENT(BIC_RAMWatt); + } + break; +- case INTEL_FAM6_ATOM_SILVERMONT1: /* BYT */ +- case INTEL_FAM6_ATOM_SILVERMONT2: /* AVN */ ++ case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ ++ case INTEL_FAM6_ATOM_SILVERMONT_X: /* AVN */ + do_rapl = RAPL_PKG | RAPL_CORES; + if (rapl_joules) { + BIC_PRESENT(BIC_Pkg_J); +@@ -3455,7 +3455,7 @@ void rapl_probe(unsigned int family, unsigned int model) + BIC_PRESENT(BIC_CorWatt); + } + break; +- case INTEL_FAM6_ATOM_DENVERTON: /* DNV */ ++ case INTEL_FAM6_ATOM_GOLDMONT_X: /* DNV */ + do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS; + BIC_PRESENT(BIC_PKG__); + BIC_PRESENT(BIC_RAM__); +@@ -3478,7 +3478,7 @@ void rapl_probe(unsigned int family, unsigned int model) + return; + + rapl_power_units = 1.0 / (1 << (msr & 0xF)); +- if (model == INTEL_FAM6_ATOM_SILVERMONT1) ++ if (model == INTEL_FAM6_ATOM_SILVERMONT) + rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000; + else + rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); +@@ -3728,8 +3728,8 @@ int has_snb_msrs(unsigned int family, unsigned int model) + case INTEL_FAM6_KABYLAKE_DESKTOP: /* KBL */ + case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ +- case INTEL_FAM6_ATOM_GEMINI_LAKE: +- case INTEL_FAM6_ATOM_DENVERTON: /* DNV */ ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: ++ case INTEL_FAM6_ATOM_GOLDMONT_X: /* DNV */ + return 1; + } + return 0; +@@ -3760,7 +3760,7 @@ int has_hsw_msrs(unsigned int family, unsigned int model) + case INTEL_FAM6_KABYLAKE_MOBILE: /* KBL */ + case INTEL_FAM6_KABYLAKE_DESKTOP: /* KBL */ + case 
INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ +- case INTEL_FAM6_ATOM_GEMINI_LAKE: ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + return 1; + } + return 0; +@@ -3794,8 +3794,8 @@ int is_slm(unsigned int family, unsigned int model) + if (!genuine_intel) + return 0; + switch (model) { +- case INTEL_FAM6_ATOM_SILVERMONT1: /* BYT */ +- case INTEL_FAM6_ATOM_SILVERMONT2: /* AVN */ ++ case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ ++ case INTEL_FAM6_ATOM_SILVERMONT_X: /* AVN */ + return 1; + } + return 0; +@@ -4153,11 +4153,11 @@ void process_cpuid() + crystal_hz = 24000000; /* 24.0 MHz */ + break; + case INTEL_FAM6_SKYLAKE_X: /* SKX */ +- case INTEL_FAM6_ATOM_DENVERTON: /* DNV */ ++ case INTEL_FAM6_ATOM_GOLDMONT_X: /* DNV */ + crystal_hz = 25000000; /* 25.0 MHz */ + break; + case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ +- case INTEL_FAM6_ATOM_GEMINI_LAKE: ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + crystal_hz = 19200000; /* 19.2 MHz */ + break; + default: +diff --git a/tools/power/x86/x86_energy_perf_policy/Makefile b/tools/power/x86/x86_energy_perf_policy/Makefile +index 2447b1bbaacf..f60883c574cc 100644 +--- a/tools/power/x86/x86_energy_perf_policy/Makefile ++++ b/tools/power/x86/x86_energy_perf_policy/Makefile +@@ -9,7 +9,7 @@ ifeq ("$(origin O)", "command line") + endif + + x86_energy_perf_policy : x86_energy_perf_policy.c +-CFLAGS += -Wall ++CFLAGS += -Wall -I../../../include + CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' + + %: %.c |