author | Mike Pagano <mpagano@gentoo.org> | 2018-02-07 19:40:27 -0500
committer | Mike Pagano <mpagano@gentoo.org> | 2018-11-14 09:00:39 -0500
commit | bf700be01b4e88d2b151c924c8a3e1be7a47be9d (patch)
tree | af43005304675dc5a3a81159f9c7fa5eff0cc84d /1017_linux-4.14.18.patch
parent | Linux patch 4.14.17 (diff)
download | linux-patches-bf700be01b4e88d2b151c924c8a3e1be7a47be9d.tar.gz linux-patches-bf700be01b4e88d2b151c924c8a3e1be7a47be9d.tar.bz2 linux-patches-bf700be01b4e88d2b151c924c8a3e1be7a47be9d.zip
Linux patch 4.14.18
Signed-off-by: Mike Pagano <mpagano@gentoo.org>
Diffstat (limited to '1017_linux-4.14.18.patch')
-rw-r--r-- | 1017_linux-4.14.18.patch | 3790
1 files changed, 3790 insertions, 0 deletions
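
This stable update backports the upstream Spectre/Meltdown mitigation work into 4.14: Documentation/speculation.txt and the array_index_nospec() helper, the x86 IBRS/IBPB/STIBP feature bits and MSRs, retpoline RSB clearing, and the KVM SPEC_CTRL/PRED_CMD plumbing. The central bounds-clipping pattern is documented in the speculation.txt hunk below; the following standalone C sketch shows the same idea with illustrative names and plain C arithmetic in place of the kernel's cmp/sbb asm helper:

    #include <stddef.h>
    #include <stdio.h>

    /*
     * Illustrative stand-in for array_index_nospec(): clamp 'index' to
     * [0, size) through a data dependency rather than a branch, so a
     * mispredicted bounds check cannot steer a later dependent load.
     * The kernel's x86 array_index_mask_nospec() builds the same mask
     * with a cmp/sbb asm sequence; plain C is used here, and a compiler
     * is in principle free to reintroduce a branch.
     */
    static size_t clamp_index_nospec(size_t index, size_t size)
    {
        /* mask is all ones when index < size, zero otherwise */
        size_t mask = (size_t)0 - (size_t)(index < size);

        return index & mask;
    }

    static int load_array(const int *array, size_t nelems, size_t index)
    {
        if (index >= nelems)
            return 0;

        /* Re-derive the index under the mask before the dependent load. */
        index = clamp_index_nospec(index, nelems);
        return array[index];
    }

    int main(void)
    {
        int data[4] = { 1, 2, 3, 4 };

        printf("%d\n", load_array(data, 4, 2));  /* in bounds: prints 3 */
        printf("%d\n", load_array(data, 4, 99)); /* out of bounds: prints 0 */
        return 0;
    }

The explicit clamp helps because, even if the CPU speculates past the bounds check, the dependent load can only observe an index already forced into range.
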
diff --git a/1017_linux-4.14.18.patch b/1017_linux-4.14.18.patch new file mode 100644 index 00000000..07fbf451 --- /dev/null +++ b/1017_linux-4.14.18.patch @@ -0,0 +1,3790 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 8122b5f98ea1..c76afdcafbef 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2718,8 +2718,6 @@ + norandmaps Don't use address space randomization. Equivalent to + echo 0 > /proc/sys/kernel/randomize_va_space + +- noreplace-paravirt [X86,IA-64,PV_OPS] Don't patch paravirt_ops +- + noreplace-smp [X86-32,SMP] Don't replace SMP instructions + with UP alternatives + +diff --git a/Documentation/speculation.txt b/Documentation/speculation.txt +new file mode 100644 +index 000000000000..e9e6cbae2841 +--- /dev/null ++++ b/Documentation/speculation.txt +@@ -0,0 +1,90 @@ ++This document explains potential effects of speculation, and how undesirable ++effects can be mitigated portably using common APIs. ++ ++=========== ++Speculation ++=========== ++ ++To improve performance and minimize average latencies, many contemporary CPUs ++employ speculative execution techniques such as branch prediction, performing ++work which may be discarded at a later stage. ++ ++Typically speculative execution cannot be observed from architectural state, ++such as the contents of registers. However, in some cases it is possible to ++observe its impact on microarchitectural state, such as the presence or ++absence of data in caches. Such state may form side-channels which can be ++observed to extract secret information. ++ ++For example, in the presence of branch prediction, it is possible for bounds ++checks to be ignored by code which is speculatively executed. Consider the ++following code: ++ ++ int load_array(int *array, unsigned int index) ++ { ++ if (index >= MAX_ARRAY_ELEMS) ++ return 0; ++ else ++ return array[index]; ++ } ++ ++Which, on arm64, may be compiled to an assembly sequence such as: ++ ++ CMP <index>, #MAX_ARRAY_ELEMS ++ B.LT less ++ MOV <returnval>, #0 ++ RET ++ less: ++ LDR <returnval>, [<array>, <index>] ++ RET ++ ++It is possible that a CPU mis-predicts the conditional branch, and ++speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This ++value will subsequently be discarded, but the speculated load may affect ++microarchitectural state which can be subsequently measured. ++ ++More complex sequences involving multiple dependent memory accesses may ++result in sensitive information being leaked. Consider the following ++code, building on the prior example: ++ ++ int load_dependent_arrays(int *arr1, int *arr2, int index) ++ { ++ int val1, val2, ++ ++ val1 = load_array(arr1, index); ++ val2 = load_array(arr2, val1); ++ ++ return val2; ++ } ++ ++Under speculation, the first call to load_array() may return the value ++of an out-of-bounds address, while the second call will influence ++microarchitectural state dependent on this value. This may provide an ++arbitrary read primitive. ++ ++==================================== ++Mitigating speculation side-channels ++==================================== ++ ++The kernel provides a generic API to ensure that bounds checks are ++respected even under speculation. Architectures which are affected by ++speculation-based side-channels are expected to implement these ++primitives. 
++ ++The array_index_nospec() helper in <linux/nospec.h> can be used to ++prevent information from being leaked via side-channels. ++ ++A call to array_index_nospec(index, size) returns a sanitized index ++value that is bounded to [0, size) even under cpu speculation ++conditions. ++ ++This can be used to protect the earlier load_array() example: ++ ++ int load_array(int *array, unsigned int index) ++ { ++ if (index >= MAX_ARRAY_ELEMS) ++ return 0; ++ else { ++ index = array_index_nospec(index, MAX_ARRAY_ELEMS); ++ return array[index]; ++ } ++ } +diff --git a/Makefile b/Makefile +index 7ed993896dd5..a69e5da9ed86 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 4 + PATCHLEVEL = 14 +-SUBLEVEL = 17 ++SUBLEVEL = 18 + EXTRAVERSION = + NAME = Petit Gorille + +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index cb782ac1c35d..fe418226df7f 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -164,6 +164,7 @@ config PPC + select GENERIC_CLOCKEVENTS_BROADCAST if SMP + select GENERIC_CMOS_UPDATE + select GENERIC_CPU_AUTOPROBE ++ select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64 + select GENERIC_IRQ_SHOW + select GENERIC_IRQ_SHOW_LEVEL + select GENERIC_SMP_IDLE_THREAD +diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c +index 935059cb9e40..9527a4c6cbc2 100644 +--- a/arch/powerpc/kernel/setup_64.c ++++ b/arch/powerpc/kernel/setup_64.c +@@ -38,6 +38,7 @@ + #include <linux/memory.h> + #include <linux/nmi.h> + ++#include <asm/debugfs.h> + #include <asm/io.h> + #include <asm/kdump.h> + #include <asm/prom.h> +@@ -884,4 +885,41 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) + if (!no_rfi_flush) + rfi_flush_enable(enable); + } ++ ++#ifdef CONFIG_DEBUG_FS ++static int rfi_flush_set(void *data, u64 val) ++{ ++ if (val == 1) ++ rfi_flush_enable(true); ++ else if (val == 0) ++ rfi_flush_enable(false); ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int rfi_flush_get(void *data, u64 *val) ++{ ++ *val = rfi_flush ? 1 : 0; ++ return 0; ++} ++ ++DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); ++ ++static __init int rfi_flush_debugfs_init(void) ++{ ++ debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); ++ return 0; ++} ++device_initcall(rfi_flush_debugfs_init); ++#endif ++ ++ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ if (rfi_flush) ++ return sprintf(buf, "Mitigation: RFI Flush\n"); ++ ++ return sprintf(buf, "Vulnerable\n"); ++} + #endif /* CONFIG_PPC_BOOK3S_64 */ +diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c +index 03505ffbe1b6..60e21ccfb6d6 100644 +--- a/arch/x86/entry/common.c ++++ b/arch/x86/entry/common.c +@@ -21,6 +21,7 @@ + #include <linux/export.h> + #include <linux/context_tracking.h> + #include <linux/user-return-notifier.h> ++#include <linux/nospec.h> + #include <linux/uprobes.h> + #include <linux/livepatch.h> + #include <linux/syscalls.h> +@@ -208,7 +209,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) + * special case only applies after poking regs and before the + * very next return to user mode. + */ +- current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED); ++ ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); + #endif + + user_enter_irqoff(); +@@ -284,7 +285,8 @@ __visible void do_syscall_64(struct pt_regs *regs) + * regs->orig_ax, which changes the behavior of some syscalls. 
+ */ + if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { +- regs->ax = sys_call_table[nr & __SYSCALL_MASK]( ++ nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls); ++ regs->ax = sys_call_table[nr]( + regs->di, regs->si, regs->dx, + regs->r10, regs->r8, regs->r9); + } +@@ -306,7 +308,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) + unsigned int nr = (unsigned int)regs->orig_ax; + + #ifdef CONFIG_IA32_EMULATION +- current->thread.status |= TS_COMPAT; ++ ti->status |= TS_COMPAT; + #endif + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { +@@ -320,6 +322,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) + } + + if (likely(nr < IA32_NR_syscalls)) { ++ nr = array_index_nospec(nr, IA32_NR_syscalls); + /* + * It's possible that a 32-bit syscall implementation + * takes a 64-bit parameter but nonetheless assumes that +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 60c4c342316c..2a35b1e0fb90 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -252,7 +252,8 @@ ENTRY(__switch_to_asm) + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +- FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW ++ /* Clobbers %ebx */ ++ FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + #endif + + /* restore callee-saved registers */ +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index be6b66464f6a..16e2d72e79a0 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -232,91 +232,20 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ +- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ +- UNWIND_HINT_REGS extra=0 +- +- TRACE_IRQS_OFF +- +- /* +- * If we need to do entry work or if we guess we'll need to do +- * exit work, go straight to the slow path. +- */ +- movq PER_CPU_VAR(current_task), %r11 +- testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) +- jnz entry_SYSCALL64_slow_path +- +-entry_SYSCALL_64_fastpath: +- /* +- * Easy case: enable interrupts and issue the syscall. If the syscall +- * needs pt_regs, we'll call a stub that disables interrupts again +- * and jumps to the slow path. +- */ +- TRACE_IRQS_ON +- ENABLE_INTERRUPTS(CLBR_NONE) +-#if __SYSCALL_MASK == ~0 +- cmpq $__NR_syscall_max, %rax +-#else +- andl $__SYSCALL_MASK, %eax +- cmpl $__NR_syscall_max, %eax +-#endif +- ja 1f /* return -ENOSYS (already in pt_regs->ax) */ +- movq %r10, %rcx +- +- /* +- * This call instruction is handled specially in stub_ptregs_64. +- * It might end up jumping to the slow path. If it jumps, RAX +- * and all argument registers are clobbered. +- */ +-#ifdef CONFIG_RETPOLINE +- movq sys_call_table(, %rax, 8), %rax +- call __x86_indirect_thunk_rax +-#else +- call *sys_call_table(, %rax, 8) +-#endif +-.Lentry_SYSCALL_64_after_fastpath_call: +- +- movq %rax, RAX(%rsp) +-1: ++ pushq %rbx /* pt_regs->rbx */ ++ pushq %rbp /* pt_regs->rbp */ ++ pushq %r12 /* pt_regs->r12 */ ++ pushq %r13 /* pt_regs->r13 */ ++ pushq %r14 /* pt_regs->r14 */ ++ pushq %r15 /* pt_regs->r15 */ ++ UNWIND_HINT_REGS + +- /* +- * If we get here, then we know that pt_regs is clean for SYSRET64. +- * If we see that no exit work is required (which we are required +- * to check with IRQs off), then we can go straight to SYSRET64. 
+- */ +- DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF +- movq PER_CPU_VAR(current_task), %r11 +- testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) +- jnz 1f +- +- LOCKDEP_SYS_EXIT +- TRACE_IRQS_ON /* user mode is traced as IRQs on */ +- movq RIP(%rsp), %rcx +- movq EFLAGS(%rsp), %r11 +- addq $6*8, %rsp /* skip extra regs -- they were preserved */ +- UNWIND_HINT_EMPTY +- jmp .Lpop_c_regs_except_rcx_r11_and_sysret + +-1: +- /* +- * The fast path looked good when we started, but something changed +- * along the way and we need to switch to the slow path. Calling +- * raise(3) will trigger this, for example. IRQs are off. +- */ +- TRACE_IRQS_ON +- ENABLE_INTERRUPTS(CLBR_ANY) +- SAVE_EXTRA_REGS +- movq %rsp, %rdi +- call syscall_return_slowpath /* returns with IRQs disabled */ +- jmp return_from_SYSCALL_64 +- +-entry_SYSCALL64_slow_path: + /* IRQs are off. */ +- SAVE_EXTRA_REGS + movq %rsp, %rdi + call do_syscall_64 /* returns with IRQs disabled */ + +-return_from_SYSCALL_64: + TRACE_IRQS_IRETQ /* we're about to change IF */ + + /* +@@ -389,7 +318,6 @@ syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + UNWIND_HINT_EMPTY + POP_EXTRA_REGS +-.Lpop_c_regs_except_rcx_r11_and_sysret: + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 +@@ -420,47 +348,6 @@ syscall_return_via_sysret: + USERGS_SYSRET64 + END(entry_SYSCALL_64) + +-ENTRY(stub_ptregs_64) +- /* +- * Syscalls marked as needing ptregs land here. +- * If we are on the fast path, we need to save the extra regs, +- * which we achieve by trying again on the slow path. If we are on +- * the slow path, the extra regs are already saved. +- * +- * RAX stores a pointer to the C function implementing the syscall. +- * IRQs are on. +- */ +- cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) +- jne 1f +- +- /* +- * Called from fast path -- disable IRQs again, pop return address +- * and jump to slow path +- */ +- DISABLE_INTERRUPTS(CLBR_ANY) +- TRACE_IRQS_OFF +- popq %rax +- UNWIND_HINT_REGS extra=0 +- jmp entry_SYSCALL64_slow_path +- +-1: +- JMP_NOSPEC %rax /* Called from C */ +-END(stub_ptregs_64) +- +-.macro ptregs_stub func +-ENTRY(ptregs_\func) +- UNWIND_HINT_FUNC +- leaq \func(%rip), %rax +- jmp stub_ptregs_64 +-END(ptregs_\func) +-.endm +- +-/* Instantiate ptregs_stub for each ptregs-using syscall */ +-#define __SYSCALL_64_QUAL_(sym) +-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym +-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) +-#include <asm/syscalls_64.h> +- + /* + * %rdi: prev task + * %rsi: next task +@@ -495,7 +382,8 @@ ENTRY(__switch_to_asm) + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. 
+ */ +- FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW ++ /* Clobbers %rbx */ ++ FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + #endif + + /* restore callee-saved registers */ +diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c +index 9c09775e589d..c176d2fab1da 100644 +--- a/arch/x86/entry/syscall_64.c ++++ b/arch/x86/entry/syscall_64.c +@@ -7,14 +7,11 @@ + #include <asm/asm-offsets.h> + #include <asm/syscall.h> + +-#define __SYSCALL_64_QUAL_(sym) sym +-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym +- +-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); ++#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); + #include <asm/syscalls_64.h> + #undef __SYSCALL_64 + +-#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), ++#define __SYSCALL_64(nr, sym, qual) [nr] = sym, + + extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); + +diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h +index 0927cdc4f946..4d111616524b 100644 +--- a/arch/x86/include/asm/asm-prototypes.h ++++ b/arch/x86/include/asm/asm-prototypes.h +@@ -38,5 +38,7 @@ INDIRECT_THUNK(dx) + INDIRECT_THUNK(si) + INDIRECT_THUNK(di) + INDIRECT_THUNK(bp) +-INDIRECT_THUNK(sp) ++asmlinkage void __fill_rsb(void); ++asmlinkage void __clear_rsb(void); ++ + #endif /* CONFIG_RETPOLINE */ +diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h +index 01727dbc294a..1e7c955b6303 100644 +--- a/arch/x86/include/asm/barrier.h ++++ b/arch/x86/include/asm/barrier.h +@@ -24,6 +24,34 @@ + #define wmb() asm volatile("sfence" ::: "memory") + #endif + ++/** ++ * array_index_mask_nospec() - generate a mask that is ~0UL when the ++ * bounds check succeeds and 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * Returns: ++ * 0 - (index < size) ++ */ ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ unsigned long mask; ++ ++ asm ("cmp %1,%2; sbb %0,%0;" ++ :"=r" (mask) ++ :"r"(size),"r" (index) ++ :"cc"); ++ return mask; ++} ++ ++/* Override the default implementation from linux/nospec.h. */ ++#define array_index_mask_nospec array_index_mask_nospec ++ ++/* Prevent speculative execution past this barrier. 
*/ ++#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ ++ "lfence", X86_FEATURE_LFENCE_RDTSC) ++ + #ifdef CONFIG_X86_PPRO_FENCE + #define dma_rmb() rmb() + #else +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h +index ea9a7dde62e5..70eddb3922ff 100644 +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ -29,6 +29,7 @@ enum cpuid_leafs + CPUID_8000_000A_EDX, + CPUID_7_ECX, + CPUID_8000_0007_EBX, ++ CPUID_7_EDX, + }; + + #ifdef CONFIG_X86_FEATURE_NAMES +@@ -79,8 +80,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ ++ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + REQUIRED_MASK_CHECK || \ +- BUILD_BUG_ON_ZERO(NCAPINTS != 18)) ++ BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + + #define DISABLED_MASK_BIT_SET(feature_bit) \ + ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ +@@ -101,8 +103,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ ++ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + DISABLED_MASK_CHECK || \ +- BUILD_BUG_ON_ZERO(NCAPINTS != 18)) ++ BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + + #define cpu_has(c, bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 25b9375c1484..73b5fff159a4 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -13,7 +13,7 @@ + /* + * Defines x86 CPU feature bits + */ +-#define NCAPINTS 18 /* N 32-bit words worth of info */ ++#define NCAPINTS 19 /* N 32-bit words worth of info */ + #define NBUGINTS 1 /* N 32-bit bug flags */ + + /* +@@ -203,14 +203,14 @@ + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ + #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ +-#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +-#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ ++#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ ++#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ + #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ +-#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ +-#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + + #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ +-#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ ++#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ ++ ++#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ + + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +@@ -271,6 +271,9 @@ + #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ + #define X86_FEATURE_IRPERF 
(13*32+ 1) /* Instructions Retired Count */ + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ ++#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ ++#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ ++#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +@@ -319,6 +322,13 @@ + #define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ + #define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ + ++/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ ++#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ ++#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ ++#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ ++#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ ++#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ ++ + /* + * BUG word(s) + */ +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h +index e428e16dd822..c6a3af198294 100644 +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -71,6 +71,7 @@ + #define DISABLED_MASK15 0 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57) + #define DISABLED_MASK17 0 +-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) ++#define DISABLED_MASK18 0 ++#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) + + #endif /* _ASM_X86_DISABLED_FEATURES_H */ +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 64c4a30e0d39..e203169931c7 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -137,8 +137,10 @@ enum fixed_addresses { + + extern void reserve_top_address(unsigned long reserve); + +-#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) ++#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) ++#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) ++#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) ++#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE) + + extern int fixmaps_set; + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index fa11fb1fa570..eb83ff1bae8f 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -39,6 +39,13 @@ + + /* Intel MSRs. 
Some also available on other CPUs */ + ++#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ ++#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ ++#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ ++ ++#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ ++#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ ++ + #define MSR_PPIN_CTL 0x0000004e + #define MSR_PPIN 0x0000004f + +@@ -57,6 +64,11 @@ + #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) + + #define MSR_MTRRcap 0x000000fe ++ ++#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a ++#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ ++#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ ++ + #define MSR_IA32_BBL_CR_CTL 0x00000119 + #define MSR_IA32_BBL_CR_CTL3 0x0000011e + +diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h +index 07962f5f6fba..30df295f6d94 100644 +--- a/arch/x86/include/asm/msr.h ++++ b/arch/x86/include/asm/msr.h +@@ -214,8 +214,7 @@ static __always_inline unsigned long long rdtsc_ordered(void) + * that some other imaginary CPU is updating continuously with a + * time stamp. + */ +- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, +- "lfence", X86_FEATURE_LFENCE_RDTSC); ++ barrier_nospec(); + return rdtsc(); + } + +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index 4ad41087ce0e..4d57894635f2 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -1,56 +1,12 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + +-#ifndef __NOSPEC_BRANCH_H__ +-#define __NOSPEC_BRANCH_H__ ++#ifndef _ASM_X86_NOSPEC_BRANCH_H_ ++#define _ASM_X86_NOSPEC_BRANCH_H_ + + #include <asm/alternative.h> + #include <asm/alternative-asm.h> + #include <asm/cpufeatures.h> + +-/* +- * Fill the CPU return stack buffer. +- * +- * Each entry in the RSB, if used for a speculative 'ret', contains an +- * infinite 'pause; lfence; jmp' loop to capture speculative execution. +- * +- * This is required in various cases for retpoline and IBRS-based +- * mitigations for the Spectre variant 2 vulnerability. Sometimes to +- * eliminate potentially bogus entries from the RSB, and sometimes +- * purely to ensure that it doesn't get empty, which on some CPUs would +- * allow predictions from other (unwanted!) sources to be used. +- * +- * We define a CPP macro such that it can be used from both .S files and +- * inline assembly. It's possible to do a .macro and then include that +- * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. +- */ +- +-#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +-#define RSB_FILL_LOOPS 16 /* To avoid underflow */ +- +-/* +- * Google experimented with loop-unrolling and this turned out to be +- * the optimal version — two calls, each with their own speculation +- * trap should their return address end up getting used, in a loop. +- */ +-#define __FILL_RETURN_BUFFER(reg, nr, sp) \ +- mov $(nr/2), reg; \ +-771: \ +- call 772f; \ +-773: /* speculation trap */ \ +- pause; \ +- lfence; \ +- jmp 773b; \ +-772: \ +- call 774f; \ +-775: /* speculation trap */ \ +- pause; \ +- lfence; \ +- jmp 775b; \ +-774: \ +- dec reg; \ +- jnz 771b; \ +- add $(BITS_PER_LONG/8) * nr, sp; +- + #ifdef __ASSEMBLY__ + + /* +@@ -121,17 +77,10 @@ + #endif + .endm + +- /* +- * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP +- * monstrosity above, manually. 
+- */ +-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ++/* This clobbers the BX register */ ++.macro FILL_RETURN_BUFFER nr:req ftr:req + #ifdef CONFIG_RETPOLINE +- ANNOTATE_NOSPEC_ALTERNATIVE +- ALTERNATIVE "jmp .Lskip_rsb_\@", \ +- __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ +- \ftr +-.Lskip_rsb_\@: ++ ALTERNATIVE "", "call __clear_rsb", \ftr + #endif + .endm + +@@ -201,22 +150,25 @@ extern char __indirect_thunk_end[]; + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future +- * CPUs with IBRS_ATT *might* it be avoided. ++ * CPUs with IBRS_ALL *might* it be avoided. + */ + static inline void vmexit_fill_RSB(void) + { + #ifdef CONFIG_RETPOLINE +- unsigned long loops; +- +- asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE +- ALTERNATIVE("jmp 910f", +- __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), +- X86_FEATURE_RETPOLINE) +- "910:" +- : "=r" (loops), ASM_CALL_CONSTRAINT +- : : "memory" ); ++ alternative_input("", ++ "call __fill_rsb", ++ X86_FEATURE_RETPOLINE, ++ ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory")); + #endif + } + ++static inline void indirect_branch_prediction_barrier(void) ++{ ++ alternative_input("", ++ "call __ibp_barrier", ++ X86_FEATURE_USE_IBPB, ++ ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory")); ++} ++ + #endif /* __ASSEMBLY__ */ +-#endif /* __NOSPEC_BRANCH_H__ */ ++#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ +diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h +index ce245b0cdfca..0777e18a1d23 100644 +--- a/arch/x86/include/asm/pgtable_32_types.h ++++ b/arch/x86/include/asm/pgtable_32_types.h +@@ -44,8 +44,9 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ + */ + #define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) + +-#define CPU_ENTRY_AREA_BASE \ +- ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) ++#define CPU_ENTRY_AREA_BASE \ ++ ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ ++ & PMD_MASK) + + #define PKMAP_BASE \ + ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 9c18da64daa9..c57c6e77c29f 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -459,8 +459,6 @@ struct thread_struct { + unsigned short gsindex; + #endif + +- u32 status; /* thread synchronous flags */ +- + #ifdef CONFIG_X86_64 + unsigned long fsbase; + unsigned long gsbase; +@@ -970,4 +968,7 @@ bool xen_set_default_idle(void); + + void stop_this_cpu(void *dummy); + void df_debug(struct pt_regs *regs, long error_code); ++ ++void __ibp_barrier(void); ++ + #endif /* _ASM_X86_PROCESSOR_H */ +diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h +index d91ba04dd007..fb3a6de7440b 100644 +--- a/arch/x86/include/asm/required-features.h ++++ b/arch/x86/include/asm/required-features.h +@@ -106,6 +106,7 @@ + #define REQUIRED_MASK15 0 + #define REQUIRED_MASK16 (NEED_LA57) + #define REQUIRED_MASK17 0 +-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) ++#define REQUIRED_MASK18 0 ++#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) + + #endif /* _ASM_X86_REQUIRED_FEATURES_H */ +diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h +index e3c95e8e61c5..03eedc21246d 100644 +--- a/arch/x86/include/asm/syscall.h ++++ 
b/arch/x86/include/asm/syscall.h +@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task, + * TS_COMPAT is set for 32-bit syscall entries and then + * remains set until we return to user mode. + */ +- if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) ++ if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. +@@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task, + unsigned long *args) + { + # ifdef CONFIG_IA32_EMULATION +- if (task->thread.status & TS_COMPAT) ++ if (task->thread_info.status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; +@@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task, + const unsigned long *args) + { + # ifdef CONFIG_IA32_EMULATION +- if (task->thread.status & TS_COMPAT) ++ if (task->thread_info.status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index 00223333821a..eda3b6823ca4 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -55,6 +55,7 @@ struct task_struct; + + struct thread_info { + unsigned long flags; /* low level flags */ ++ u32 status; /* thread synchronous flags */ + }; + + #define INIT_THREAD_INFO(tsk) \ +@@ -221,7 +222,7 @@ static inline int arch_within_stack_frames(const void * const stack, + #define in_ia32_syscall() true + #else + #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \ +- current->thread.status & TS_COMPAT) ++ current_thread_info()->status & TS_COMPAT) + #endif + + /* +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 3effd3c994af..4405c4b308e8 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -174,6 +174,8 @@ struct tlb_state { + struct mm_struct *loaded_mm; + u16 loaded_mm_asid; + u16 next_asid; ++ /* last user mm's ctx id */ ++ u64 last_ctx_id; + + /* + * We can be in one of several states: +diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h +index 574dff4d2913..aae77eb8491c 100644 +--- a/arch/x86/include/asm/uaccess.h ++++ b/arch/x86/include/asm/uaccess.h +@@ -124,6 +124,11 @@ extern int __get_user_bad(void); + + #define __uaccess_begin() stac() + #define __uaccess_end() clac() ++#define __uaccess_begin_nospec() \ ++({ \ ++ stac(); \ ++ barrier_nospec(); \ ++}) + + /* + * This is a type: either unsigned long, if the argument fits into +@@ -445,7 +450,7 @@ do { \ + ({ \ + int __gu_err; \ + __inttype(*(ptr)) __gu_val; \ +- __uaccess_begin(); \ ++ __uaccess_begin_nospec(); \ + __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ +@@ -487,6 +492,10 @@ struct __large_struct { unsigned long buf[100]; }; + __uaccess_begin(); \ + barrier(); + ++#define uaccess_try_nospec do { \ ++ current->thread.uaccess_err = 0; \ ++ __uaccess_begin_nospec(); \ ++ + #define uaccess_catch(err) \ + __uaccess_end(); \ + (err) |= (current->thread.uaccess_err ? 
-EFAULT : 0); \ +@@ -548,7 +557,7 @@ struct __large_struct { unsigned long buf[100]; }; + * get_user_ex(...); + * } get_user_catch(err) + */ +-#define get_user_try uaccess_try ++#define get_user_try uaccess_try_nospec + #define get_user_catch(err) uaccess_catch(err) + + #define get_user_ex(x, ptr) do { \ +@@ -582,7 +591,7 @@ extern void __cmpxchg_wrong_size(void) + __typeof__(ptr) __uval = (uval); \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ +- __uaccess_begin(); \ ++ __uaccess_begin_nospec(); \ + switch (size) { \ + case 1: \ + { \ +diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h +index 72950401b223..ba2dc1930630 100644 +--- a/arch/x86/include/asm/uaccess_32.h ++++ b/arch/x86/include/asm/uaccess_32.h +@@ -29,21 +29,21 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n) + switch (n) { + case 1: + ret = 0; +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u8 *)to, from, ret, + "b", "b", "=q", 1); + __uaccess_end(); + return ret; + case 2: + ret = 0; +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u16 *)to, from, ret, + "w", "w", "=r", 2); + __uaccess_end(); + return ret; + case 4: + ret = 0; +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u32 *)to, from, ret, + "l", "k", "=r", 4); + __uaccess_end(); +diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h +index f07ef3c575db..62546b3a398e 100644 +--- a/arch/x86/include/asm/uaccess_64.h ++++ b/arch/x86/include/asm/uaccess_64.h +@@ -55,31 +55,31 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size) + return copy_user_generic(dst, (__force void *)src, size); + switch (size) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src, + ret, "b", "b", "=q", 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src, + ret, "w", "w", "=r", 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src, + ret, "l", "k", "=r", 4); + __uaccess_end(); + return ret; + case 8: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 8); + __uaccess_end(); + return ret; + case 10: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 10); + if (likely(!ret)) +@@ -89,7 +89,7 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size) + __uaccess_end(); + return ret; + case 16: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 16); + if (likely(!ret)) +diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c +index e0b97e4d1db5..21be0193d9dc 100644 +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str) + } + __setup("noreplace-smp", setup_noreplace_smp); + +-#ifdef CONFIG_PARAVIRT +-static int __initdata_or_module noreplace_paravirt = 0; +- +-static int __init setup_noreplace_paravirt(char *str) +-{ +- noreplace_paravirt = 1; +- return 1; +-} +-__setup("noreplace-paravirt", setup_noreplace_paravirt); +-#endif +- + #define DPRINTK(fmt, args...) 
\ + do { \ + if (debug_alternative) \ +@@ -298,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) + tgt_rip = next_rip + o_dspl; + n_dspl = tgt_rip - orig_insn; + +- DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); ++ DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); + + if (tgt_rip - orig_insn >= 0) { + if (n_dspl - 2 <= 127) +@@ -355,7 +344,7 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins + add_nops(instr + (a->instrlen - a->padlen), a->padlen); + local_irq_restore(flags); + +- DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", ++ DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", + instr, a->instrlen - a->padlen, a->padlen); + } + +@@ -376,7 +365,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, + u8 *instr, *replacement; + u8 insnbuf[MAX_PATCH_LEN]; + +- DPRINTK("alt table %p -> %p", start, end); ++ DPRINTK("alt table %px, -> %px", start, end); + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite previously scanned alternative code. +@@ -400,14 +389,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, + continue; + } + +- DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", ++ DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d", + a->cpuid >> 5, + a->cpuid & 0x1f, + instr, a->instrlen, + replacement, a->replacementlen, a->padlen); + +- DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); +- DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); ++ DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); ++ DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + + memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; +@@ -433,7 +422,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, + a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } +- DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); ++ DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr); + + text_poke_early(instr, insnbuf, insnbuf_sz); + } +@@ -599,9 +588,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *p; + char insnbuf[MAX_PATCH_LEN]; + +- if (noreplace_paravirt) +- return; +- + for (p = start; p < end; p++) { + unsigned int used; + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 390b3dc3d438..71949bf2de5a 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -11,6 +11,7 @@ + #include <linux/init.h> + #include <linux/utsname.h> + #include <linux/cpu.h> ++#include <linux/module.h> + + #include <asm/nospec-branch.h> + #include <asm/cmdline.h> +@@ -90,20 +91,41 @@ static const char *spectre_v2_strings[] = { + }; + + #undef pr_fmt +-#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt ++#define pr_fmt(fmt) "Spectre V2 : " fmt + + static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + ++#ifdef RETPOLINE ++static bool spectre_v2_bad_module; ++ ++bool retpoline_module_ok(bool has_retpoline) ++{ ++ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) ++ return true; ++ ++ pr_err("System may be vulnerable to spectre v2\n"); ++ spectre_v2_bad_module = true; ++ return false; ++} ++ ++static inline const char *spectre_v2_module_string(void) ++{ ++ return spectre_v2_bad_module ? 
" - vulnerable module loaded" : ""; ++} ++#else ++static inline const char *spectre_v2_module_string(void) { return ""; } ++#endif ++ + static void __init spec2_print_if_insecure(const char *reason) + { + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +- pr_info("%s\n", reason); ++ pr_info("%s selected on command line.\n", reason); + } + + static void __init spec2_print_if_secure(const char *reason) + { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +- pr_info("%s\n", reason); ++ pr_info("%s selected on command line.\n", reason); + } + + static inline bool retp_compiler(void) +@@ -118,42 +140,68 @@ static inline bool match_option(const char *arg, int arglen, const char *opt) + return len == arglen && !strncmp(arg, opt, len); + } + ++static const struct { ++ const char *option; ++ enum spectre_v2_mitigation_cmd cmd; ++ bool secure; ++} mitigation_options[] = { ++ { "off", SPECTRE_V2_CMD_NONE, false }, ++ { "on", SPECTRE_V2_CMD_FORCE, true }, ++ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, ++ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, ++ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, ++ { "auto", SPECTRE_V2_CMD_AUTO, false }, ++}; ++ + static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) + { + char arg[20]; +- int ret; +- +- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, +- sizeof(arg)); +- if (ret > 0) { +- if (match_option(arg, ret, "off")) { +- goto disable; +- } else if (match_option(arg, ret, "on")) { +- spec2_print_if_secure("force enabled on command line."); +- return SPECTRE_V2_CMD_FORCE; +- } else if (match_option(arg, ret, "retpoline")) { +- spec2_print_if_insecure("retpoline selected on command line."); +- return SPECTRE_V2_CMD_RETPOLINE; +- } else if (match_option(arg, ret, "retpoline,amd")) { +- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { +- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); +- return SPECTRE_V2_CMD_AUTO; +- } +- spec2_print_if_insecure("AMD retpoline selected on command line."); +- return SPECTRE_V2_CMD_RETPOLINE_AMD; +- } else if (match_option(arg, ret, "retpoline,generic")) { +- spec2_print_if_insecure("generic retpoline selected on command line."); +- return SPECTRE_V2_CMD_RETPOLINE_GENERIC; +- } else if (match_option(arg, ret, "auto")) { ++ int ret, i; ++ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; ++ ++ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) ++ return SPECTRE_V2_CMD_NONE; ++ else { ++ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, ++ sizeof(arg)); ++ if (ret < 0) ++ return SPECTRE_V2_CMD_AUTO; ++ ++ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { ++ if (!match_option(arg, ret, mitigation_options[i].option)) ++ continue; ++ cmd = mitigation_options[i].cmd; ++ break; ++ } ++ ++ if (i >= ARRAY_SIZE(mitigation_options)) { ++ pr_err("unknown option (%s). Switching to AUTO select\n", ++ mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + } + +- if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) ++ if ((cmd == SPECTRE_V2_CMD_RETPOLINE || ++ cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || ++ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && ++ !IS_ENABLED(CONFIG_RETPOLINE)) { ++ pr_err("%s selected but not compiled in. 
Switching to AUTO select\n", ++ mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; +-disable: +- spec2_print_if_insecure("disabled on command line."); +- return SPECTRE_V2_CMD_NONE; ++ } ++ ++ if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && ++ boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { ++ pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); ++ return SPECTRE_V2_CMD_AUTO; ++ } ++ ++ if (mitigation_options[i].secure) ++ spec2_print_if_secure(mitigation_options[i].option); ++ else ++ spec2_print_if_insecure(mitigation_options[i].option); ++ ++ return cmd; + } + + /* Check for Skylake-like CPUs (for RSB handling) */ +@@ -191,10 +239,10 @@ static void __init spectre_v2_select_mitigation(void) + return; + + case SPECTRE_V2_CMD_FORCE: +- /* FALLTRHU */ + case SPECTRE_V2_CMD_AUTO: +- goto retpoline_auto; +- ++ if (IS_ENABLED(CONFIG_RETPOLINE)) ++ goto retpoline_auto; ++ break; + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; +@@ -249,6 +297,12 @@ static void __init spectre_v2_select_mitigation(void) + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } ++ ++ /* Initialize Indirect Branch Prediction Barrier if supported */ ++ if (boot_cpu_has(X86_FEATURE_IBPB)) { ++ setup_force_cpu_cap(X86_FEATURE_USE_IBPB); ++ pr_info("Enabling Indirect Branch Prediction Barrier\n"); ++ } + } + + #undef pr_fmt +@@ -269,7 +323,7 @@ ssize_t cpu_show_spectre_v1(struct device *dev, + { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); +- return sprintf(buf, "Vulnerable\n"); ++ return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + } + + ssize_t cpu_show_spectre_v2(struct device *dev, +@@ -278,6 +332,14 @@ ssize_t cpu_show_spectre_v2(struct device *dev, + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + +- return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); ++ return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ++ boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", ++ spectre_v2_module_string()); + } + #endif ++ ++void __ibp_barrier(void) ++{ ++ __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0); ++} ++EXPORT_SYMBOL_GPL(__ibp_barrier); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 372ba3fb400f..92b66e21bae5 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -47,6 +47,8 @@ + #include <asm/pat.h> + #include <asm/microcode.h> + #include <asm/microcode_intel.h> ++#include <asm/intel-family.h> ++#include <asm/cpu_device_id.h> + + #ifdef CONFIG_X86_LOCAL_APIC + #include <asm/uv/uv.h> +@@ -724,6 +726,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) + } + } + ++static void init_speculation_control(struct cpuinfo_x86 *c) ++{ ++ /* ++ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, ++ * and they also have a different bit for STIBP support. Also, ++ * a hypervisor might have set the individual AMD bits even on ++ * Intel CPUs, for finer-grained selection of what's available. ++ * ++ * We use the AMD bits in 0x8000_0008 EBX as the generic hardware ++ * features, which are visible in /proc/cpuinfo and used by the ++ * kernel. So set those accordingly from the Intel bits. 
++ */ ++ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { ++ set_cpu_cap(c, X86_FEATURE_IBRS); ++ set_cpu_cap(c, X86_FEATURE_IBPB); ++ } ++ if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) ++ set_cpu_cap(c, X86_FEATURE_STIBP); ++} ++ + void get_cpu_cap(struct cpuinfo_x86 *c) + { + u32 eax, ebx, ecx, edx; +@@ -745,6 +767,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_0_EBX] = ebx; + c->x86_capability[CPUID_7_ECX] = ecx; ++ c->x86_capability[CPUID_7_EDX] = edx; + } + + /* Extended state features: level 0x0000000d */ +@@ -817,6 +840,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + + init_scattered_cpuid_features(c); ++ init_speculation_control(c); + + /* + * Clear/Set all flags overridden by options, after probe. +@@ -852,6 +876,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) + #endif + } + ++static const __initconst struct x86_cpu_id cpu_no_speculation[] = { ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, ++ { X86_VENDOR_CENTAUR, 5 }, ++ { X86_VENDOR_INTEL, 5 }, ++ { X86_VENDOR_NSC, 5 }, ++ { X86_VENDOR_ANY, 4 }, ++ {} ++}; ++ ++static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { ++ { X86_VENDOR_AMD }, ++ {} ++}; ++ ++static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) ++{ ++ u64 ia32_cap = 0; ++ ++ if (x86_match_cpu(cpu_no_meltdown)) ++ return false; ++ ++ if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); ++ ++ /* Rogue Data Cache Load? No! */ ++ if (ia32_cap & ARCH_CAP_RDCL_NO) ++ return false; ++ ++ return true; ++} ++ + /* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, +@@ -899,11 +958,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + +- if (c->x86_vendor != X86_VENDOR_AMD) +- setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); +- +- setup_force_cpu_bug(X86_BUG_SPECTRE_V1); +- setup_force_cpu_bug(X86_BUG_SPECTRE_V2); ++ if (!x86_match_cpu(cpu_no_speculation)) { ++ if (cpu_vulnerable_to_meltdown(c)) ++ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V1); ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V2); ++ } + + fpu__init_system(c); + +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index b720dacac051..4cf4f8cbc69d 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) + ELF_HWCAP2 |= HWCAP2_RING3MWAIT; + } + ++/* ++ * Early microcode releases for the Spectre v2 mitigation were broken. 
++ * Information taken from; ++ * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf ++ * - https://kb.vmware.com/s/article/52345 ++ * - Microcode revisions observed in the wild ++ * - Release note from 20180108 microcode release ++ */ ++struct sku_microcode { ++ u8 model; ++ u8 stepping; ++ u32 microcode; ++}; ++static const struct sku_microcode spectre_bad_microcodes[] = { ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 }, ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 }, ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 }, ++ { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 }, ++ { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 }, ++ { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, ++ { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, ++ { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 }, ++ { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 }, ++ { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, ++ { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, ++ { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, ++ { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, ++ { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, ++ { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, ++ { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, ++ { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, ++ { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, ++ { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, ++ { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, ++ /* Updated in the 20180108 release; blacklist until we know otherwise */ ++ { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 }, ++ /* Observed in the wild */ ++ { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, ++ { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, ++}; ++ ++static bool bad_spectre_microcode(struct cpuinfo_x86 *c) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { ++ if (c->x86_model == spectre_bad_microcodes[i].model && ++ c->x86_mask == spectre_bad_microcodes[i].stepping) ++ return (c->microcode <= spectre_bad_microcodes[i].microcode); ++ } ++ return false; ++} ++ + static void early_init_intel(struct cpuinfo_x86 *c) + { + u64 misc_enable; +@@ -122,6 +175,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) + c->microcode = intel_get_microcode_revision(); + ++ /* Now if any of them are set, check the blacklist and clear the lot */ ++ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || ++ cpu_has(c, X86_FEATURE_INTEL_STIBP) || ++ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || ++ cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { ++ pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); ++ setup_clear_cpu_cap(X86_FEATURE_IBRS); ++ setup_clear_cpu_cap(X86_FEATURE_IBPB); ++ setup_clear_cpu_cap(X86_FEATURE_STIBP); ++ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); ++ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); ++ } ++ + /* + * Atom erratum AAE44/AAF40/AAG38/AAH41: + * +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index d0e69769abfd..df11f5d604be 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -21,8 +21,6 @@ struct cpuid_bit { + static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, +- { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, +- { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, 
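
The intel.c hunk above blacklists early Spectre v2 microcode by (model, stepping) pair, treating any running revision at or below the listed value as broken and clearing the Speculation Control feature bits. A standalone sketch of that check follows, with a shortened table and family-6 model numbers written out in place of the INTEL_FAM6_* constants; the values are shown for illustration only:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Standalone illustration of bad_spectre_microcode() above: a SKU is
     * considered broken when its running microcode revision is at or
     * below the last known-bad revision recorded for that
     * (model, stepping) pair. Only a few entries from the patch's
     * spectre_bad_microcodes[] table are reproduced here.
     */
    struct sku_microcode {
        uint8_t  model;
        uint8_t  stepping;
        uint32_t microcode;  /* last known-bad revision */
    };

    static const struct sku_microcode bad_ucode[] = {
        { 0x9e /* KABYLAKE_DESKTOP */, 0x0b, 0x84 },
        { 0x55 /* SKYLAKE_X */,        0x04, 0x0200003c },
        { 0x3f /* HASWELL_X */,        0x02, 0x3b },
    };

    static bool spectre_microcode_is_bad(uint8_t model, uint8_t stepping,
                                         uint32_t revision)
    {
        for (size_t i = 0; i < sizeof(bad_ucode) / sizeof(bad_ucode[0]); i++) {
            if (bad_ucode[i].model == model &&
                bad_ucode[i].stepping == stepping)
                return revision <= bad_ucode[i].microcode;
        }
        return false;
    }

    int main(void)
    {
        /* Skylake-X stepping 4 still on the blacklisted 0x0200003c revision */
        if (spectre_microcode_is_bad(0x55, 0x04, 0x0200003c))
            printf("broken microcode: clear IBRS/IBPB/STIBP/SPEC_CTRL bits\n");
        else
            printf("microcode ok\n");
        return 0;
    }
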
+diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index c75466232016..9eb448c7859d 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -557,7 +557,7 @@ static void __set_personality_x32(void) + * Pretend to come from a x32 execve. + */ + task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; +- current->thread.status &= ~TS_COMPAT; ++ current_thread_info()->status &= ~TS_COMPAT; + #endif + } + +@@ -571,7 +571,7 @@ static void __set_personality_ia32(void) + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + task_pt_regs(current)->orig_ax = __NR_ia32_execve; +- current->thread.status |= TS_COMPAT; ++ current_thread_info()->status |= TS_COMPAT; + #endif + } + +diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c +index f37d18124648..ed5c4cdf0a34 100644 +--- a/arch/x86/kernel/ptrace.c ++++ b/arch/x86/kernel/ptrace.c +@@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) + */ + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) +- child->thread.status |= TS_I386_REGS_POKED; ++ child->thread_info.status |= TS_I386_REGS_POKED; + break; + + case offsetof(struct user32, regs.eflags): +diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c +index b9e00e8f1c9b..4cdc0b27ec82 100644 +--- a/arch/x86/kernel/signal.c ++++ b/arch/x86/kernel/signal.c +@@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) + * than the tracee. + */ + #ifdef CONFIG_IA32_EMULATION +- if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) ++ if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + return __NR_ia32_restart_syscall; + #endif + #ifdef CONFIG_X86_X32_ABI +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index 0099e10eb045..13f5d4217e4f 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -67,9 +67,7 @@ u64 kvm_supported_xcr0(void) + + #define F(x) bit(X86_FEATURE_##x) + +-/* These are scattered features in cpufeatures.h. 
*/ +-#define KVM_CPUID_BIT_AVX512_4VNNIW 2 +-#define KVM_CPUID_BIT_AVX512_4FMAPS 3 ++/* For scattered features from cpufeatures.h; we currently expose none */ + #define KF(x) bit(KVM_CPUID_BIT_##x) + + int kvm_update_cpuid(struct kvm_vcpu *vcpu) +@@ -367,6 +365,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + ++ /* cpuid 0x80000008.ebx */ ++ const u32 kvm_cpuid_8000_0008_ebx_x86_features = ++ F(IBPB) | F(IBRS); ++ + /* cpuid 0xC0000001.edx */ + const u32 kvm_cpuid_C000_0001_edx_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | +@@ -392,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = +- KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS); ++ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | ++ F(ARCH_CAPABILITIES); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); +@@ -477,7 +480,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; +- entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); ++ cpuid_mask(&entry->edx, CPUID_7_EDX); + } else { + entry->ebx = 0; + entry->ecx = 0; +@@ -627,7 +630,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); +- entry->ebx = entry->edx = 0; ++ entry->edx = 0; ++ /* IBRS and IBPB aren't necessarily present in hardware cpuid */ ++ if (boot_cpu_has(X86_FEATURE_IBPB)) ++ entry->ebx |= F(IBPB); ++ if (boot_cpu_has(X86_FEATURE_IBRS)) ++ entry->ebx |= F(IBRS); ++ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; ++ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + break; + } + case 0x80000019: +diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h +index c2cea6651279..9a327d5b6d1f 100644 +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = { + [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, + [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, + [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, ++ [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + }; + + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) +diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c +index eca6a89f2326..fab073b19528 100644 +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -25,6 +25,7 @@ + #include <asm/kvm_emulate.h> + #include <linux/stringify.h> + #include <asm/debugreg.h> ++#include <asm/nospec-branch.h> + + #include "x86.h" + #include "tss.h" +@@ -1021,8 +1022,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) + void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); + + flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; +- asm("push %[flags]; popf; call *%[fastop]" +- : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); ++ asm("push %[flags]; popf; " CALL_NOSPEC ++ : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags)); + return rc; + } + +@@ -5350,9 +5351,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) + if (!(ctxt->d & ByteOp)) + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + +- asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" ++ asm("push 
%[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n" + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), +- [fastop]"+S"(fop), ASM_CALL_CONSTRAINT ++ [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT + : "c"(ctxt->src2.val)); + + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 6a8284f72328..e0bc3ad0f6cd 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -184,6 +184,8 @@ struct vcpu_svm { + u64 gs_base; + } host; + ++ u64 spec_ctrl; ++ + u32 *msrpm; + + ulong nmi_iret_rip; +@@ -249,6 +251,8 @@ static const struct svm_direct_access_msrs { + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, + #endif ++ { .index = MSR_IA32_SPEC_CTRL, .always = false }, ++ { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, + { .index = MSR_IA32_LASTINTFROMIP, .always = false }, +@@ -529,6 +533,7 @@ struct svm_cpu_data { + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; ++ struct vmcb *current_vmcb; + }; + + static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +@@ -880,6 +885,25 @@ static bool valid_msr_intercept(u32 index) + return false; + } + ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) ++{ ++ u8 bit_write; ++ unsigned long tmp; ++ u32 offset; ++ u32 *msrpm; ++ ++ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: ++ to_svm(vcpu)->msrpm; ++ ++ offset = svm_msrpm_offset(msr); ++ bit_write = 2 * (msr & 0x0f) + 1; ++ tmp = msrpm[offset]; ++ ++ BUG_ON(offset == MSR_INVALID); ++ ++ return !!test_bit(bit_write, &tmp); ++} ++ + static void set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) + { +@@ -1585,6 +1609,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) + u32 dummy; + u32 eax = 1; + ++ svm->spec_ctrl = 0; ++ + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; +@@ -1706,11 +1732,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); ++ /* ++ * The vmcb page can be recycled, causing a false negative in ++ * svm_vcpu_load(). So do a full IBPB now. 
++ */ ++ indirect_branch_prediction_barrier(); + } + + static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + { + struct vcpu_svm *svm = to_svm(vcpu); ++ struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + int i; + + if (unlikely(cpu != vcpu->cpu)) { +@@ -1739,6 +1771,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + if (static_cpu_has(X86_FEATURE_RDTSCP)) + wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + ++ if (sd->current_vmcb != svm->vmcb) { ++ sd->current_vmcb = svm->vmcb; ++ indirect_branch_prediction_barrier(); ++ } + avic_vcpu_load(vcpu, cpu); + } + +@@ -3579,6 +3615,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_VM_CR: + msr_info->data = svm->nested.vm_cr_msr; + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) ++ return 1; ++ ++ msr_info->data = svm->spec_ctrl; ++ break; + case MSR_IA32_UCODE_REV: + msr_info->data = 0x01000065; + break; +@@ -3670,6 +3713,49 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) ++ return 1; ++ ++ /* The STIBP bit doesn't fault even if it's not advertised */ ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ return 1; ++ ++ svm->spec_ctrl = data; ++ ++ if (!data) ++ break; ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_svm_vmrun_msrpm. ++ * We update the L1 MSR bit as well since it will end up ++ * touching the MSR anyway now. ++ */ ++ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); ++ break; ++ case MSR_IA32_PRED_CMD: ++ if (!msr->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBPB)) ++ return 1; ++ ++ if (data & ~PRED_CMD_IBPB) ++ return 1; ++ ++ if (!data) ++ break; ++ ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); ++ if (is_guest_mode(vcpu)) ++ break; ++ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); ++ break; + case MSR_STAR: + svm->vmcb->save.star = data; + break; +@@ -4922,6 +5008,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + + local_irq_enable(); + ++ /* ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there ++ * is no need to worry about the conditional branch over the wrmsr ++ * being speculatively taken. ++ */ ++ if (svm->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ + asm volatile ( + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" +@@ -5014,6 +5109,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + #endif + ); + ++ /* ++ * We do not use IBRS in the kernel. If this vCPU has used the ++ * SPEC_CTRL MSR it may have left it on; save the value and ++ * turn it off. This is much more efficient than blindly adding ++ * it to the atomic save/restore list. Especially as the former ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM. ++ * ++ * For non-nested case: ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ * ++ * For nested case: ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to ++ * save it. 
++ */ ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ ++ if (svm->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index a45063a9219c..0ae4b1a86168 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -34,6 +34,7 @@ + #include <linux/tboot.h> + #include <linux/hrtimer.h> + #include <linux/frame.h> ++#include <linux/nospec.h> + #include "kvm_cache_regs.h" + #include "x86.h" + +@@ -108,6 +109,14 @@ static u64 __read_mostly host_xss; + static bool __read_mostly enable_pml = 1; + module_param_named(pml, enable_pml, bool, S_IRUGO); + ++#define MSR_TYPE_R 1 ++#define MSR_TYPE_W 2 ++#define MSR_TYPE_RW 3 ++ ++#define MSR_BITMAP_MODE_X2APIC 1 ++#define MSR_BITMAP_MODE_X2APIC_APICV 2 ++#define MSR_BITMAP_MODE_LM 4 ++ + #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + + /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +@@ -182,7 +191,6 @@ module_param(ple_window_max, int, S_IRUGO); + extern const ulong vmx_return; + + #define NR_AUTOLOAD_MSRS 8 +-#define VMCS02_POOL_SIZE 1 + + struct vmcs { + u32 revision_id; +@@ -207,6 +215,7 @@ struct loaded_vmcs { + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; ++ unsigned long *msr_bitmap; + struct list_head loaded_vmcss_on_cpu_link; + }; + +@@ -223,7 +232,7 @@ struct shared_msr_entry { + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. +- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the ++ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). +@@ -406,13 +415,6 @@ struct __packed vmcs12 { + */ + #define VMCS12_SIZE 0x1000 + +-/* Used to remember the last vmcs02 used for some recently used vmcs12s */ +-struct vmcs02_list { +- struct list_head list; +- gpa_t vmptr; +- struct loaded_vmcs vmcs02; +-}; +- + /* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. +@@ -437,15 +439,15 @@ struct nested_vmx { + */ + bool sync_shadow_vmcs; + +- /* vmcs02_list cache of VMCSs recently used to run L2 guests */ +- struct list_head vmcs02_pool; +- int vmcs02_num; + bool change_vmcs01_virtual_x2apic_mode; + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; ++ ++ struct loaded_vmcs vmcs02; ++ + /* +- * Guest pages referred to in vmcs02 with host-physical pointers, so +- * we must keep them pinned while L2 runs. ++ * Guest pages referred to in the vmcs02 with host-physical ++ * pointers, so we must keep them pinned while L2 runs. 
+ */ + struct page *apic_access_page; + struct page *virtual_apic_page; +@@ -454,8 +456,6 @@ struct nested_vmx { + bool pi_pending; + u16 posted_intr_nv; + +- unsigned long *msr_bitmap; +- + struct hrtimer preemption_timer; + bool preemption_timer_expired; + +@@ -570,6 +570,7 @@ struct vcpu_vmx { + struct kvm_vcpu vcpu; + unsigned long host_rsp; + u8 fail; ++ u8 msr_bitmap_mode; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; +@@ -581,6 +582,10 @@ struct vcpu_vmx { + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; + #endif ++ ++ u64 arch_capabilities; ++ u64 spec_ctrl; ++ + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + u32 secondary_exec_control; +@@ -887,21 +892,18 @@ static const unsigned short vmcs_field_to_offset_table[] = { + + static inline short vmcs_field_to_offset(unsigned long field) + { +- BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); ++ const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); ++ unsigned short offset; + +- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) ++ BUILD_BUG_ON(size > SHRT_MAX); ++ if (field >= size) + return -ENOENT; + +- /* +- * FIXME: Mitigation for CVE-2017-5753. To be replaced with a +- * generic mechanism. +- */ +- asm("lfence"); +- +- if (vmcs_field_to_offset_table[field] == 0) ++ field = array_index_nospec(field, size); ++ offset = vmcs_field_to_offset_table[field]; ++ if (offset == 0) + return -ENOENT; +- +- return vmcs_field_to_offset_table[field]; ++ return offset; + } + + static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) +@@ -927,6 +929,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); + static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); + static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code); ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type); + + static DEFINE_PER_CPU(struct vmcs *, vmxarea); + static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +@@ -946,12 +951,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + enum { + VMX_IO_BITMAP_A, + VMX_IO_BITMAP_B, +- VMX_MSR_BITMAP_LEGACY, +- VMX_MSR_BITMAP_LONGMODE, +- VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, +- VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV, +- VMX_MSR_BITMAP_LEGACY_X2APIC, +- VMX_MSR_BITMAP_LONGMODE_X2APIC, + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +@@ -961,12 +960,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + + #define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) + #define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) +-#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) +-#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) +-#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) +-#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV]) +-#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC]) +-#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC]) + #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) + #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) + +@@ -1913,6 +1906,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) + vmcs_write32(EXCEPTION_BITMAP, eb); + } + ++/* ++ * Check if MSR is intercepted for currently loaded MSR bitmap. 
++ */ ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ unsigned long *msr_bitmap; ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return true; ++ ++ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; ++ ++ if (msr <= 0x1fff) { ++ return !!test_bit(msr, msr_bitmap + 0x800 / f); ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f); ++ } ++ ++ return true; ++} ++ ++/* ++ * Check if MSR is intercepted for L01 MSR bitmap. ++ */ ++static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ unsigned long *msr_bitmap; ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return true; ++ ++ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; ++ ++ if (msr <= 0x1fff) { ++ return !!test_bit(msr, msr_bitmap + 0x800 / f); ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f); ++ } ++ ++ return true; ++} ++ + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) + { +@@ -2291,6 +2330,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); ++ indirect_branch_prediction_barrier(); + } + + if (!already_loaded) { +@@ -2567,36 +2607,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) + vmx->guest_msrs[from] = tmp; + } + +-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) +-{ +- unsigned long *msr_bitmap; +- +- if (is_guest_mode(vcpu)) +- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; +- else if (cpu_has_secondary_exec_ctrls() && +- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { +- if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv; +- else +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv; +- } else { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic; +- else +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic; +- } +- } else { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode; +- else +- msr_bitmap = vmx_msr_bitmap_legacy; +- } +- +- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); +-} +- + /* + * Set up the vmcs to automatically save and restore system + * msrs. 
Don't touch the 64-bit msrs if the guest is in legacy +@@ -2637,7 +2647,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) + vmx->save_nmsrs = save_nmsrs; + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(&vmx->vcpu); ++ vmx_update_msr_bitmap(&vmx->vcpu); + } + + /* +@@ -3273,6 +3283,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_IA32_TSC: + msr_info->data = guest_read_tsc(vcpu); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) ++ return 1; ++ ++ msr_info->data = to_vmx(vcpu)->spec_ctrl; ++ break; ++ case MSR_IA32_ARCH_CAPABILITIES: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) ++ return 1; ++ msr_info->data = to_vmx(vcpu)->arch_capabilities; ++ break; + case MSR_IA32_SYSENTER_CS: + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + break; +@@ -3380,6 +3404,70 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) ++ return 1; ++ ++ /* The STIBP bit doesn't fault even if it's not advertised */ ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ return 1; ++ ++ vmx->spec_ctrl = data; ++ ++ if (!data) ++ break; ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_vmx_merge_msr_bitmap. We should not touch the ++ * vmcs02.msr_bitmap here since it gets completely overwritten ++ * in the merging. We update the vmcs01 here for L1 as well ++ * since it will end up touching the MSR anyway now. ++ */ ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, ++ MSR_IA32_SPEC_CTRL, ++ MSR_TYPE_RW); ++ break; ++ case MSR_IA32_PRED_CMD: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) && ++ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) ++ return 1; ++ ++ if (data & ~PRED_CMD_IBPB) ++ return 1; ++ ++ if (!data) ++ break; ++ ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_vmx_merge_msr_bitmap. We should not touch the ++ * vmcs02.msr_bitmap here since it gets completely overwritten ++ * in the merging. 
++ */ ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, ++ MSR_TYPE_W); ++ break; ++ case MSR_IA32_ARCH_CAPABILITIES: ++ if (!msr_info->host_initiated) ++ return 1; ++ vmx->arch_capabilities = data; ++ break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) +@@ -3822,11 +3910,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) + return vmcs; + } + +-static struct vmcs *alloc_vmcs(void) +-{ +- return alloc_vmcs_cpu(raw_smp_processor_id()); +-} +- + static void free_vmcs(struct vmcs *vmcs) + { + free_pages((unsigned long)vmcs, vmcs_config.order); +@@ -3842,9 +3925,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) + loaded_vmcs_clear(loaded_vmcs); + free_vmcs(loaded_vmcs->vmcs); + loaded_vmcs->vmcs = NULL; ++ if (loaded_vmcs->msr_bitmap) ++ free_page((unsigned long)loaded_vmcs->msr_bitmap); + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); + } + ++static struct vmcs *alloc_vmcs(void) ++{ ++ return alloc_vmcs_cpu(raw_smp_processor_id()); ++} ++ ++static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) ++{ ++ loaded_vmcs->vmcs = alloc_vmcs(); ++ if (!loaded_vmcs->vmcs) ++ return -ENOMEM; ++ ++ loaded_vmcs->shadow_vmcs = NULL; ++ loaded_vmcs_init(loaded_vmcs); ++ ++ if (cpu_has_vmx_msr_bitmap()) { ++ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); ++ if (!loaded_vmcs->msr_bitmap) ++ goto out_vmcs; ++ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); ++ } ++ return 0; ++ ++out_vmcs: ++ free_loaded_vmcs(loaded_vmcs); ++ return -ENOMEM; ++} ++ + static void free_kvm_area(void) + { + int cpu; +@@ -4917,10 +5029,8 @@ static void free_vpid(int vpid) + spin_unlock(&vmx_vpid_lock); + } + +-#define MSR_TYPE_R 1 +-#define MSR_TYPE_W 2 +-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +- u32 msr, int type) ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type) + { + int f = sizeof(unsigned long); + +@@ -4954,6 +5064,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + } + } + ++static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type) ++{ ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return; ++ ++ /* ++ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals ++ * have the write-low and read-high bitmap offsets the wrong way round. ++ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. ++ */ ++ if (msr <= 0x1fff) { ++ if (type & MSR_TYPE_R) ++ /* read-low */ ++ __set_bit(msr, msr_bitmap + 0x000 / f); ++ ++ if (type & MSR_TYPE_W) ++ /* write-low */ ++ __set_bit(msr, msr_bitmap + 0x800 / f); ++ ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ if (type & MSR_TYPE_R) ++ /* read-high */ ++ __set_bit(msr, msr_bitmap + 0x400 / f); ++ ++ if (type & MSR_TYPE_W) ++ /* write-high */ ++ __set_bit(msr, msr_bitmap + 0xc00 / f); ++ ++ } ++} ++ ++static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type, bool value) ++{ ++ if (value) ++ vmx_enable_intercept_for_msr(msr_bitmap, msr, type); ++ else ++ vmx_disable_intercept_for_msr(msr_bitmap, msr, type); ++} ++ + /* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. 
+@@ -5000,30 +5154,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + } + } + +-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) ++static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) + { +- if (!longmode_only) +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +- msr, MSR_TYPE_R | MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +- msr, MSR_TYPE_R | MSR_TYPE_W); ++ u8 mode = 0; ++ ++ if (cpu_has_secondary_exec_ctrls() && ++ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { ++ mode |= MSR_BITMAP_MODE_X2APIC; ++ if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) ++ mode |= MSR_BITMAP_MODE_X2APIC_APICV; ++ } ++ ++ if (is_long_mode(vcpu)) ++ mode |= MSR_BITMAP_MODE_LM; ++ ++ return mode; + } + +-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) ++#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) ++ ++static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, ++ u8 mode) + { +- if (apicv_active) { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, +- msr, type); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, +- msr, type); +- } else { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, type); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, type); ++ int msr; ++ ++ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { ++ unsigned word = msr / BITS_PER_LONG; ++ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; ++ msr_bitmap[word + (0x800 / sizeof(long))] = ~0; ++ } ++ ++ if (mode & MSR_BITMAP_MODE_X2APIC) { ++ /* ++ * TPR reads and writes can be virtualized even if virtual interrupt ++ * delivery is not in use. 
++ */ ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); ++ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { ++ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); ++ } + } + } + ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) ++{ ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; ++ u8 mode = vmx_msr_bitmap_mode(vcpu); ++ u8 changed = mode ^ vmx->msr_bitmap_mode; ++ ++ if (!changed) ++ return; ++ ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, ++ !(mode & MSR_BITMAP_MODE_LM)); ++ ++ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) ++ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); ++ ++ vmx->msr_bitmap_mode = mode; ++} ++ + static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) + { + return enable_apicv; +@@ -5269,7 +5463,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) + } + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + } + + static u32 vmx_exec_control(struct vcpu_vmx *vmx) +@@ -5456,7 +5650,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + } + if (cpu_has_vmx_msr_bitmap()) +- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); ++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + +@@ -5534,6 +5728,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) + ++vmx->nmsrs; + } + ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); + + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); + +@@ -5564,6 +5760,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) + u64 cr0; + + vmx->rmode.vm86_active = 0; ++ vmx->spec_ctrl = 0; + + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + kvm_set_cr8(vcpu, 0); +@@ -6739,7 +6936,7 @@ void vmx_enable_tdp(void) + + static __init int hardware_setup(void) + { +- int r = -ENOMEM, i, msr; ++ int r = -ENOMEM, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + +@@ -6760,9 +6957,6 @@ static __init int hardware_setup(void) + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + +- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); +- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); +- + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; + goto out; +@@ -6825,42 +7019,8 @@ static __init int hardware_setup(void) + kvm_tsc_scaling_ratio_frac_bits = 48; + } + +- vmx_disable_intercept_for_msr(MSR_FS_BASE, false); +- vmx_disable_intercept_for_msr(MSR_GS_BASE, false); +- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); +- +- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, +- vmx_msr_bitmap_legacy, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv, +- vmx_msr_bitmap_longmode, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_legacy_x2apic, +- vmx_msr_bitmap_legacy, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_longmode_x2apic, +- vmx_msr_bitmap_longmode, PAGE_SIZE); +- + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + +- for (msr = 0x800; msr <= 0x8ff; msr++) { +- if (msr == 0x839 /* TMCCT */) +- 
continue; +- vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); +- } +- +- /* +- * TPR reads and writes can be virtualized even if virtual interrupt +- * delivery is not in use. +- */ +- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); +- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); +- +- /* EOI */ +- vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); +- /* SELF-IPI */ +- vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); +- + if (enable_ept) + vmx_enable_tdp(); + else +@@ -6963,94 +7123,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu) + return handle_nop(vcpu); + } + +-/* +- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. +- * We could reuse a single VMCS for all the L2 guests, but we also want the +- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this +- * allows keeping them loaded on the processor, and in the future will allow +- * optimizations where prepare_vmcs02 doesn't need to set all the fields on +- * every entry if they never change. +- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE +- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. +- * +- * The following functions allocate and free a vmcs02 in this pool. +- */ +- +-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ +-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) +-{ +- struct vmcs02_list *item; +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) +- if (item->vmptr == vmx->nested.current_vmptr) { +- list_move(&item->list, &vmx->nested.vmcs02_pool); +- return &item->vmcs02; +- } +- +- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { +- /* Recycle the least recently used VMCS. */ +- item = list_last_entry(&vmx->nested.vmcs02_pool, +- struct vmcs02_list, list); +- item->vmptr = vmx->nested.current_vmptr; +- list_move(&item->list, &vmx->nested.vmcs02_pool); +- return &item->vmcs02; +- } +- +- /* Create a new VMCS */ +- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL); +- if (!item) +- return NULL; +- item->vmcs02.vmcs = alloc_vmcs(); +- item->vmcs02.shadow_vmcs = NULL; +- if (!item->vmcs02.vmcs) { +- kfree(item); +- return NULL; +- } +- loaded_vmcs_init(&item->vmcs02); +- item->vmptr = vmx->nested.current_vmptr; +- list_add(&(item->list), &(vmx->nested.vmcs02_pool)); +- vmx->nested.vmcs02_num++; +- return &item->vmcs02; +-} +- +-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ +-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) +-{ +- struct vmcs02_list *item; +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) +- if (item->vmptr == vmptr) { +- free_loaded_vmcs(&item->vmcs02); +- list_del(&item->list); +- kfree(item); +- vmx->nested.vmcs02_num--; +- return; +- } +-} +- +-/* +- * Free all VMCSs saved for this vcpu, except the one pointed by +- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs +- * must be &vmx->vmcs01. +- */ +-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) +-{ +- struct vmcs02_list *item, *n; +- +- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); +- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { +- /* +- * Something will leak if the above WARN triggers. Better than +- * a use-after-free. 
+- */ +- if (vmx->loaded_vmcs == &item->vmcs02) +- continue; +- +- free_loaded_vmcs(&item->vmcs02); +- list_del(&item->list); +- kfree(item); +- vmx->nested.vmcs02_num--; +- } +-} +- + /* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified +@@ -7231,13 +7303,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs *shadow_vmcs; ++ int r; + +- if (cpu_has_vmx_msr_bitmap()) { +- vmx->nested.msr_bitmap = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx->nested.msr_bitmap) +- goto out_msr_bitmap; +- } ++ r = alloc_loaded_vmcs(&vmx->nested.vmcs02); ++ if (r < 0) ++ goto out_vmcs02; + + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) +@@ -7254,9 +7324,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) + vmx->vmcs01.shadow_vmcs = shadow_vmcs; + } + +- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); +- vmx->nested.vmcs02_num = 0; +- + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; +@@ -7268,9 +7335,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) + kfree(vmx->nested.cached_vmcs12); + + out_cached_vmcs12: +- free_page((unsigned long)vmx->nested.msr_bitmap); ++ free_loaded_vmcs(&vmx->nested.vmcs02); + +-out_msr_bitmap: ++out_vmcs02: + return -ENOMEM; + } + +@@ -7412,10 +7479,6 @@ static void free_nested(struct vcpu_vmx *vmx) + free_vpid(vmx->nested.vpid02); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; +- if (vmx->nested.msr_bitmap) { +- free_page((unsigned long)vmx->nested.msr_bitmap); +- vmx->nested.msr_bitmap = NULL; +- } + if (enable_shadow_vmcs) { + vmx_disable_shadow_vmcs(vmx); + vmcs_clear(vmx->vmcs01.shadow_vmcs); +@@ -7423,7 +7486,7 @@ static void free_nested(struct vcpu_vmx *vmx) + vmx->vmcs01.shadow_vmcs = NULL; + } + kfree(vmx->nested.cached_vmcs12); +- /* Unpin physical memory we referred to in current vmcs02 */ ++ /* Unpin physical memory we referred to in the vmcs02 */ + if (vmx->nested.apic_access_page) { + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; +@@ -7439,7 +7502,7 @@ static void free_nested(struct vcpu_vmx *vmx) + vmx->nested.pi_desc = NULL; + } + +- nested_free_all_saved_vmcss(vmx); ++ free_loaded_vmcs(&vmx->nested.vmcs02); + } + + /* Emulate the VMXOFF instruction */ +@@ -7482,8 +7545,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) + vmptr + offsetof(struct vmcs12, launch_state), + &zero, sizeof(zero)); + +- nested_free_vmcs02(vmx, vmptr); +- + nested_vmx_succeed(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } +@@ -8395,10 +8456,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) + + /* + * The host physical addresses of some pages of guest memory +- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU +- * may write to these pages via their host physical address while +- * L2 is running, bypassing any address-translation-based dirty +- * tracking (e.g. EPT write protection). ++ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC ++ * Page). The CPU may write to these pages via their host ++ * physical address while L2 is running, bypassing any ++ * address-translation-based dirty tracking (e.g. EPT write ++ * protection). 
+ * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. +@@ -8932,7 +8994,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + } + + static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +@@ -9118,14 +9180,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) + #endif + "pushf\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" +- "call *%[entry]\n\t" ++ CALL_NOSPEC + : + #ifdef CONFIG_X86_64 + [sp]"=&r"(tmp), + #endif + ASM_CALL_CONSTRAINT + : +- [entry]"r"(entry), ++ THUNK_TARGET(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); +@@ -9362,6 +9424,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + + vmx_arm_hv_timer(vcpu); + ++ /* ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there ++ * is no need to worry about the conditional branch over the wrmsr ++ * being speculatively taken. ++ */ ++ if (vmx->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ + vmx->__launched = vmx->loaded_vmcs->launched; + asm( + /* Store host registers */ +@@ -9480,6 +9551,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + #endif + ); + ++ /* ++ * We do not use IBRS in the kernel. If this vCPU has used the ++ * SPEC_CTRL MSR it may have left it on; save the value and ++ * turn it off. This is much more efficient than blindly adding ++ * it to the atomic save/restore list. Especially as the former ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM. ++ * ++ * For non-nested case: ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ * ++ * For nested case: ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to ++ * save it. 
++ */ ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ ++ if (vmx->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +@@ -9594,6 +9686,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) + { + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); ++ unsigned long *msr_bitmap; + int cpu; + + if (!vmx) +@@ -9626,13 +9719,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) + if (!vmx->guest_msrs) + goto free_pml; + +- vmx->loaded_vmcs = &vmx->vmcs01; +- vmx->loaded_vmcs->vmcs = alloc_vmcs(); +- vmx->loaded_vmcs->shadow_vmcs = NULL; +- if (!vmx->loaded_vmcs->vmcs) ++ err = alloc_loaded_vmcs(&vmx->vmcs01); ++ if (err < 0) + goto free_msrs; +- loaded_vmcs_init(vmx->loaded_vmcs); + ++ msr_bitmap = vmx->vmcs01.msr_bitmap; ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); ++ vmx->msr_bitmap_mode = 0; ++ ++ vmx->loaded_vmcs = &vmx->vmcs01; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; +@@ -10101,10 +10201,25 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + int msr; + struct page *page; + unsigned long *msr_bitmap_l1; +- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; ++ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; ++ /* ++ * pred_cmd & spec_ctrl are trying to verify two things: ++ * ++ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This ++ * ensures that we do not accidentally generate an L02 MSR bitmap ++ * from the L12 MSR bitmap that is too permissive. ++ * 2. That L1 or L2s have actually used the MSR. This avoids ++ * unnecessarily merging of the bitmap if the MSR is unused. This ++ * works properly because we only update the L01 MSR bitmap lazily. ++ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only ++ * updated to reflect this when L1 (or its L2s) actually write to ++ * the MSR. ++ */ ++ bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); ++ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + +- /* This shortcut is ok because we support only x2APIC MSRs so far. 
*/ +- if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) ++ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && ++ !pred_cmd && !spec_ctrl) + return false; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); +@@ -10137,6 +10252,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + MSR_TYPE_W); + } + } ++ ++ if (spec_ctrl) ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, ++ MSR_IA32_SPEC_CTRL, ++ MSR_TYPE_R | MSR_TYPE_W); ++ ++ if (pred_cmd) ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, ++ MSR_IA32_PRED_CMD, ++ MSR_TYPE_W); ++ + kunmap(page); + kvm_release_page_clean(page); + +@@ -10678,6 +10806,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); + ++ if (cpu_has_vmx_msr_bitmap()) ++ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); ++ + if (enable_vpid) { + /* + * There is no direct mapping between vpid02 and vpid12, the +@@ -10894,20 +11025,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); +- struct loaded_vmcs *vmcs02; + u32 msr_entry_idx; + u32 exit_qual; + +- vmcs02 = nested_get_current_vmcs02(vmx); +- if (!vmcs02) +- return -ENOMEM; +- + enter_guest_mode(vcpu); + + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + +- vmx_switch_vmcs(vcpu, vmcs02); ++ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); + vmx_segment_cache_clear(vmx); + + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { +@@ -11476,7 +11602,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) +@@ -11522,10 +11648,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + vm_exit_controls_reset_shadow(vmx); + vmx_segment_cache_clear(vmx); + +- /* if no vmcs02 cache requested, remove the one we used */ +- if (VMCS02_POOL_SIZE == 0) +- nested_free_vmcs02(vmx, vmx->nested.current_vmptr); +- + /* Update any VMCS fields that might have changed while L2 ran */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 8c28023a43b1..f97358423f9c 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -1006,6 +1006,7 @@ static u32 msrs_to_save[] = { + #endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, ++ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES + }; + + static unsigned num_msrs_to_save; +diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile +index d435c89875c1..d0a3170e6804 100644 +--- a/arch/x86/lib/Makefile ++++ b/arch/x86/lib/Makefile +@@ -27,6 +27,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o + lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o + lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o + lib-$(CONFIG_RETPOLINE) += retpoline.o ++OBJECT_FILES_NON_STANDARD_retpoline.o :=y + + obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o + +diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S +index c97d935a29e8..49b167f73215 100644 +--- a/arch/x86/lib/getuser.S ++++ 
b/arch/x86/lib/getuser.S +@@ -40,6 +40,8 @@ ENTRY(__get_user_1) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 1: movzbl (%_ASM_AX),%edx + xor %eax,%eax +@@ -54,6 +56,8 @@ ENTRY(__get_user_2) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 2: movzwl -1(%_ASM_AX),%edx + xor %eax,%eax +@@ -68,6 +72,8 @@ ENTRY(__get_user_4) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 3: movl -3(%_ASM_AX),%edx + xor %eax,%eax +@@ -83,6 +89,8 @@ ENTRY(__get_user_8) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 4: movq -7(%_ASM_AX),%rdx + xor %eax,%eax +@@ -94,6 +102,8 @@ ENTRY(__get_user_8) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user_8 ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 4: movl -7(%_ASM_AX),%edx + 5: movl -3(%_ASM_AX),%ecx +diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S +index dfb2ba91b670..480edc3a5e03 100644 +--- a/arch/x86/lib/retpoline.S ++++ b/arch/x86/lib/retpoline.S +@@ -7,6 +7,7 @@ + #include <asm/alternative-asm.h> + #include <asm/export.h> + #include <asm/nospec-branch.h> ++#include <asm/bitsperlong.h> + + .macro THUNK reg + .section .text.__x86.indirect_thunk +@@ -36,7 +37,6 @@ GENERATE_THUNK(_ASM_DX) + GENERATE_THUNK(_ASM_SI) + GENERATE_THUNK(_ASM_DI) + GENERATE_THUNK(_ASM_BP) +-GENERATE_THUNK(_ASM_SP) + #ifdef CONFIG_64BIT + GENERATE_THUNK(r8) + GENERATE_THUNK(r9) +@@ -47,3 +47,58 @@ GENERATE_THUNK(r13) + GENERATE_THUNK(r14) + GENERATE_THUNK(r15) + #endif ++ ++/* ++ * Fill the CPU return stack buffer. ++ * ++ * Each entry in the RSB, if used for a speculative 'ret', contains an ++ * infinite 'pause; lfence; jmp' loop to capture speculative execution. ++ * ++ * This is required in various cases for retpoline and IBRS-based ++ * mitigations for the Spectre variant 2 vulnerability. Sometimes to ++ * eliminate potentially bogus entries from the RSB, and sometimes ++ * purely to ensure that it doesn't get empty, which on some CPUs would ++ * allow predictions from other (unwanted!) sources to be used. ++ * ++ * Google experimented with loop-unrolling and this turned out to be ++ * the optimal version - two calls, each with their own speculation ++ * trap should their return address end up getting used, in a loop. 
++ */ ++.macro STUFF_RSB nr:req sp:req ++ mov $(\nr / 2), %_ASM_BX ++ .align 16 ++771: ++ call 772f ++773: /* speculation trap */ ++ pause ++ lfence ++ jmp 773b ++ .align 16 ++772: ++ call 774f ++775: /* speculation trap */ ++ pause ++ lfence ++ jmp 775b ++ .align 16 ++774: ++ dec %_ASM_BX ++ jnz 771b ++ add $((BITS_PER_LONG/8) * \nr), \sp ++.endm ++ ++#define RSB_FILL_LOOPS 16 /* To avoid underflow */ ++ ++ENTRY(__fill_rsb) ++ STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP ++ ret ++END(__fill_rsb) ++EXPORT_SYMBOL_GPL(__fill_rsb) ++ ++#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ ++ ++ENTRY(__clear_rsb) ++ STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP ++ ret ++END(__clear_rsb) ++EXPORT_SYMBOL_GPL(__clear_rsb) +diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c +index 1b377f734e64..7add8ba06887 100644 +--- a/arch/x86/lib/usercopy_32.c ++++ b/arch/x86/lib/usercopy_32.c +@@ -331,12 +331,12 @@ do { \ + + unsigned long __copy_user_ll(void *to, const void *from, unsigned long n) + { +- stac(); ++ __uaccess_begin_nospec(); + if (movsl_is_ok(to, from, n)) + __copy_user(to, from, n); + else + n = __copy_user_intel(to, from, n); +- clac(); ++ __uaccess_end(); + return n; + } + EXPORT_SYMBOL(__copy_user_ll); +@@ -344,7 +344,7 @@ EXPORT_SYMBOL(__copy_user_ll); + unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, + unsigned long n) + { +- stac(); ++ __uaccess_begin_nospec(); + #ifdef CONFIG_X86_INTEL_USERCOPY + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) + n = __copy_user_intel_nocache(to, from, n); +@@ -353,7 +353,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr + #else + __copy_user(to, from, n); + #endif +- clac(); ++ __uaccess_end(); + return n; + } + EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 5bfe61a5e8e3..012d02624848 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -6,13 +6,14 @@ + #include <linux/interrupt.h> + #include <linux/export.h> + #include <linux/cpu.h> ++#include <linux/debugfs.h> + + #include <asm/tlbflush.h> + #include <asm/mmu_context.h> ++#include <asm/nospec-branch.h> + #include <asm/cache.h> + #include <asm/apic.h> + #include <asm/uv/uv.h> +-#include <linux/debugfs.h> + + /* + * TLB flushing, formerly SMP-only +@@ -247,6 +248,27 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + } else { + u16 new_asid; + bool need_flush; ++ u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); ++ ++ /* ++ * Avoid user/user BTB poisoning by flushing the branch ++ * predictor when switching between processes. This stops ++ * one process from doing Spectre-v2 attacks on another. ++ * ++ * As an optimization, flush indirect branches only when ++ * switching into processes that disable dumping. This ++ * protects high value processes like gpg, without having ++ * too high performance overhead. IBPB is *expensive*! ++ * ++ * This will not flush branches when switching into kernel ++ * threads. It will also not flush if we switch to idle ++ * thread and back to the same process. It will flush if we ++ * switch to a different non-dumpable process. 
++ */ ++ if (tsk && tsk->mm && ++ tsk->mm->context.ctx_id != last_ctx_id && ++ get_dumpable(tsk->mm) != SUID_DUMP_USER) ++ indirect_branch_prediction_barrier(); + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* +@@ -292,6 +314,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + } + ++ /* ++ * Record last user mm's context id, so we can avoid ++ * flushing branch buffer with IBPB if we switch back ++ * to the same user. ++ */ ++ if (next != &init_mm) ++ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); ++ + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + } +@@ -369,6 +399,7 @@ void initialize_tlbstate_and_flush(void) + write_cr3(build_cr3(mm->pgd, 0)); + + /* Reinitialize tlbstate. */ ++ this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); +diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c +index a9020f82eea7..58403052514f 100644 +--- a/drivers/auxdisplay/img-ascii-lcd.c ++++ b/drivers/auxdisplay/img-ascii-lcd.c +@@ -443,3 +443,7 @@ static struct platform_driver img_ascii_lcd_driver = { + .remove = img_ascii_lcd_remove, + }; + module_platform_driver(img_ascii_lcd_driver); ++ ++MODULE_DESCRIPTION("Imagination Technologies ASCII LCD Display"); ++MODULE_AUTHOR("Paul Burton <paul.burton@mips.com>"); ++MODULE_LICENSE("GPL"); +diff --git a/drivers/fpga/fpga-region.c b/drivers/fpga/fpga-region.c +index d9ab7c75b14f..e0c73ceba2ed 100644 +--- a/drivers/fpga/fpga-region.c ++++ b/drivers/fpga/fpga-region.c +@@ -147,6 +147,7 @@ static struct fpga_manager *fpga_region_get_manager(struct fpga_region *region) + mgr_node = of_parse_phandle(np, "fpga-mgr", 0); + if (mgr_node) { + mgr = of_fpga_mgr_get(mgr_node); ++ of_node_put(mgr_node); + of_node_put(np); + return mgr; + } +@@ -192,10 +193,13 @@ static int fpga_region_get_bridges(struct fpga_region *region, + parent_br = region_np->parent; + + /* If overlay has a list of bridges, use it. */ +- if (of_parse_phandle(overlay, "fpga-bridges", 0)) ++ br = of_parse_phandle(overlay, "fpga-bridges", 0); ++ if (br) { ++ of_node_put(br); + np = overlay; +- else ++ } else { + np = region_np; ++ } + + for (i = 0; ; i++) { + br = of_parse_phandle(np, "fpga-bridges", i); +@@ -203,12 +207,15 @@ static int fpga_region_get_bridges(struct fpga_region *region, + break; + + /* If parent bridge is in list, skip it. 
*/ +- if (br == parent_br) ++ if (br == parent_br) { ++ of_node_put(br); + continue; ++ } + + /* If node is a bridge, get it and add to list */ + ret = fpga_bridge_get_to_list(br, region->info, + ®ion->bridge_list); ++ of_node_put(br); + + /* If any of the bridges are in use, give up */ + if (ret == -EBUSY) { +diff --git a/drivers/iio/accel/kxsd9-i2c.c b/drivers/iio/accel/kxsd9-i2c.c +index 98fbb628d5bd..38411e1c155b 100644 +--- a/drivers/iio/accel/kxsd9-i2c.c ++++ b/drivers/iio/accel/kxsd9-i2c.c +@@ -63,3 +63,6 @@ static struct i2c_driver kxsd9_i2c_driver = { + .id_table = kxsd9_i2c_id, + }; + module_i2c_driver(kxsd9_i2c_driver); ++ ++MODULE_LICENSE("GPL v2"); ++MODULE_DESCRIPTION("KXSD9 accelerometer I2C interface"); +diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c +index 47d24ae5462f..fe3d7826783c 100644 +--- a/drivers/iio/adc/qcom-vadc-common.c ++++ b/drivers/iio/adc/qcom-vadc-common.c +@@ -5,6 +5,7 @@ + #include <linux/math64.h> + #include <linux/log2.h> + #include <linux/err.h> ++#include <linux/module.h> + + #include "qcom-vadc-common.h" + +@@ -229,3 +230,6 @@ int qcom_vadc_decimation_from_dt(u32 value) + return __ffs64(value / VADC_DECIMATION_MIN); + } + EXPORT_SYMBOL(qcom_vadc_decimation_from_dt); ++ ++MODULE_LICENSE("GPL v2"); ++MODULE_DESCRIPTION("Qualcomm ADC common functionality"); +diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c +index 866aa3ce1ac9..6cf0006d4c8d 100644 +--- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c ++++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c +@@ -436,3 +436,7 @@ int pxa2xx_pinctrl_exit(struct platform_device *pdev) + return 0; + } + EXPORT_SYMBOL_GPL(pxa2xx_pinctrl_exit); ++ ++MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>"); ++MODULE_DESCRIPTION("Marvell PXA2xx pinctrl driver"); ++MODULE_LICENSE("GPL v2"); +diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c +index 3a14cccbd7ff..7948acf14601 100644 +--- a/drivers/tty/serial/serial_core.c ++++ b/drivers/tty/serial/serial_core.c +@@ -987,6 +987,8 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port, + } + } else { + retval = uart_startup(tty, state, 1); ++ if (retval == 0) ++ tty_port_set_initialized(port, true); + if (retval > 0) + retval = 0; + } +diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h +index 1c65817673db..41615f38bcff 100644 +--- a/include/linux/fdtable.h ++++ b/include/linux/fdtable.h +@@ -10,6 +10,7 @@ + #include <linux/compiler.h> + #include <linux/spinlock.h> + #include <linux/rcupdate.h> ++#include <linux/nospec.h> + #include <linux/types.h> + #include <linux/init.h> + #include <linux/fs.h> +@@ -82,8 +83,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i + { + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + +- if (fd < fdt->max_fds) ++ if (fd < fdt->max_fds) { ++ fd = array_index_nospec(fd, fdt->max_fds); + return rcu_dereference_raw(fdt->fd[fd]); ++ } + return NULL; + } + +diff --git a/include/linux/init.h b/include/linux/init.h +index f38b993edacb..943139a563e3 100644 +--- a/include/linux/init.h ++++ b/include/linux/init.h +@@ -5,6 +5,13 @@ + #include <linux/compiler.h> + #include <linux/types.h> + ++/* Built-in __init functions needn't be compiled with retpoline */ ++#if defined(RETPOLINE) && !defined(MODULE) ++#define __noretpoline __attribute__((indirect_branch("keep"))) ++#else ++#define __noretpoline ++#endif ++ + /* These macros are used to mark some functions or + * initialized data (doesn't 
apply to uninitialized data) + * as `initialization' functions. The kernel can take this +@@ -40,7 +47,7 @@ + + /* These are for everybody (although not all archs will actually + discard it in modules) */ +-#define __init __section(.init.text) __cold __inittrace __latent_entropy ++#define __init __section(.init.text) __cold __inittrace __latent_entropy __noretpoline + #define __initdata __section(.init.data) + #define __initconst __section(.init.rodata) + #define __exitdata __section(.exit.data) +diff --git a/include/linux/module.h b/include/linux/module.h +index fe5aa3736707..b1cc541f2ddf 100644 +--- a/include/linux/module.h ++++ b/include/linux/module.h +@@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr, + static inline void module_bug_cleanup(struct module *mod) {} + #endif /* CONFIG_GENERIC_BUG */ + ++#ifdef RETPOLINE ++extern bool retpoline_module_ok(bool has_retpoline); ++#else ++static inline bool retpoline_module_ok(bool has_retpoline) ++{ ++ return true; ++} ++#endif ++ + #ifdef CONFIG_MODULE_SIG + static inline bool module_sig_ok(struct module *module) + { +diff --git a/include/linux/nospec.h b/include/linux/nospec.h +new file mode 100644 +index 000000000000..b99bced39ac2 +--- /dev/null ++++ b/include/linux/nospec.h +@@ -0,0 +1,72 @@ ++// SPDX-License-Identifier: GPL-2.0 ++// Copyright(c) 2018 Linus Torvalds. All rights reserved. ++// Copyright(c) 2018 Alexei Starovoitov. All rights reserved. ++// Copyright(c) 2018 Intel Corporation. All rights reserved. ++ ++#ifndef _LINUX_NOSPEC_H ++#define _LINUX_NOSPEC_H ++ ++/** ++ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * When @index is out of bounds (@index >= @size), the sign bit will be ++ * set. Extend the sign bit to all bits and invert, giving a result of ++ * zero for an out of bounds index, or ~0 if within bounds [0, @size). ++ */ ++#ifndef array_index_mask_nospec ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ /* ++ * Warn developers about inappropriate array_index_nospec() usage. ++ * ++ * Even if the CPU speculates past the WARN_ONCE branch, the ++ * sign bit of @index is taken into account when generating the ++ * mask. ++ * ++ * This warning is compiled out when the compiler can infer that ++ * @index and @size are less than LONG_MAX. ++ */ ++ if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX, ++ "array_index_nospec() limited to range of [0, LONG_MAX]\n")) ++ return 0; ++ ++ /* ++ * Always calculate and emit the mask even if the compiler ++ * thinks the mask is not needed. The compiler does not take ++ * into account the value of @index under speculation. ++ */ ++ OPTIMIZER_HIDE_VAR(index); ++ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); ++} ++#endif ++ ++/* ++ * array_index_nospec - sanitize an array index after a bounds check ++ * ++ * For a code sequence like: ++ * ++ * if (index < size) { ++ * index = array_index_nospec(index, size); ++ * val = array[index]; ++ * } ++ * ++ * ...if the CPU speculates past the bounds check then ++ * array_index_nospec() will clamp the index within the range of [0, ++ * size). 
++ */ ++#define array_index_nospec(index, size) \ ++({ \ ++ typeof(index) _i = (index); \ ++ typeof(size) _s = (size); \ ++ unsigned long _mask = array_index_mask_nospec(_i, _s); \ ++ \ ++ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ ++ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ ++ \ ++ _i &= _mask; \ ++ _i; \ ++}) ++#endif /* _LINUX_NOSPEC_H */ +diff --git a/kernel/module.c b/kernel/module.c +index de66ec825992..690c0651c40f 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) + } + #endif /* CONFIG_LIVEPATCH */ + ++static void check_modinfo_retpoline(struct module *mod, struct load_info *info) ++{ ++ if (retpoline_module_ok(get_modinfo(info, "retpoline"))) ++ return; ++ ++ pr_warn("%s: loading module not compiled with retpoline compiler.\n", ++ mod->name); ++} ++ + /* Sets info->hdr and info->len. */ + static int copy_module_from_user(const void __user *umod, unsigned long len, + struct load_info *info) +@@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } + ++ check_modinfo_retpoline(mod, info); ++ + if (get_modinfo(info, "staging")) { + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); + pr_warn("%s: module is from the staging directory, the quality " +diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c +index d396cb61a280..81bef0676e1d 100644 +--- a/net/wireless/nl80211.c ++++ b/net/wireless/nl80211.c +@@ -16,6 +16,7 @@ + #include <linux/nl80211.h> + #include <linux/rtnetlink.h> + #include <linux/netlink.h> ++#include <linux/nospec.h> + #include <linux/etherdevice.h> + #include <net/net_namespace.h> + #include <net/genetlink.h> +@@ -2056,20 +2057,22 @@ static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = { + static int parse_txq_params(struct nlattr *tb[], + struct ieee80211_txq_params *txq_params) + { ++ u8 ac; ++ + if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] || + !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] || + !tb[NL80211_TXQ_ATTR_AIFS]) + return -EINVAL; + +- txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]); ++ ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]); + txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]); + txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]); + txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]); + txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]); + +- if (txq_params->ac >= NL80211_NUM_ACS) ++ if (ac >= NL80211_NUM_ACS) + return -EINVAL; +- ++ txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS); + return 0; + } + +diff --git a/scripts/faddr2line b/scripts/faddr2line +index 39e07d8574dd..7721d5b2b0c0 100755 +--- a/scripts/faddr2line ++++ b/scripts/faddr2line +@@ -44,10 +44,10 @@ + set -o errexit + set -o nounset + +-READELF="${CROSS_COMPILE}readelf" +-ADDR2LINE="${CROSS_COMPILE}addr2line" +-SIZE="${CROSS_COMPILE}size" +-NM="${CROSS_COMPILE}nm" ++READELF="${CROSS_COMPILE:-}readelf" ++ADDR2LINE="${CROSS_COMPILE:-}addr2line" ++SIZE="${CROSS_COMPILE:-}size" ++NM="${CROSS_COMPILE:-}nm" + + command -v awk >/dev/null 2>&1 || die "awk isn't installed" + command -v ${READELF} >/dev/null 2>&1 || die "readelf isn't installed" +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index 98314b400a95..54deaa1066cf 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -2165,6 +2165,14 @@ static void add_intree_flag(struct buffer *b, int 
is_intree) + buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n"); + } + ++/* Cannot check for assembler */ ++static void add_retpoline(struct buffer *b) ++{ ++ buf_printf(b, "\n#ifdef RETPOLINE\n"); ++ buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n"); ++ buf_printf(b, "#endif\n"); ++} ++ + static void add_staging_flag(struct buffer *b, const char *name) + { + static const char *staging_dir = "drivers/staging"; +@@ -2506,6 +2514,7 @@ int main(int argc, char **argv) + err |= check_modname_len(mod); + add_header(&buf, mod); + add_intree_flag(&buf, !external_module); ++ add_retpoline(&buf); + add_staging_flag(&buf, mod->name); + err |= add_versions(&buf, mod); + add_depends(&buf, mod, modules); +diff --git a/sound/soc/codecs/pcm512x-spi.c b/sound/soc/codecs/pcm512x-spi.c +index 712ed6598c48..ebdf9bd5a64c 100644 +--- a/sound/soc/codecs/pcm512x-spi.c ++++ b/sound/soc/codecs/pcm512x-spi.c +@@ -70,3 +70,7 @@ static struct spi_driver pcm512x_spi_driver = { + }; + + module_spi_driver(pcm512x_spi_driver); ++ ++MODULE_DESCRIPTION("ASoC PCM512x codec driver - SPI"); ++MODULE_AUTHOR("Mark Brown <broonie@kernel.org>"); ++MODULE_LICENSE("GPL v2"); +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index f40d46e24bcc..9cd028aa1509 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -543,18 +543,14 @@ static int add_call_destinations(struct objtool_file *file) + dest_off = insn->offset + insn->len + insn->immediate; + insn->call_dest = find_symbol_by_offset(insn->sec, + dest_off); +- /* +- * FIXME: Thanks to retpolines, it's now considered +- * normal for a function to call within itself. So +- * disable this warning for now. +- */ +-#if 0 +- if (!insn->call_dest) { +- WARN_FUNC("can't find call dest symbol at offset 0x%lx", +- insn->sec, insn->offset, dest_off); ++ ++ if (!insn->call_dest && !insn->ignore) { ++ WARN_FUNC("unsupported intra-function call", ++ insn->sec, insn->offset); ++ WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE."); + return -1; + } +-#endif ++ + } else if (rela->sym->type == STT_SECTION) { + insn->call_dest = find_symbol_by_offset(rela->sym->sec, + rela->addend+4); +@@ -598,7 +594,7 @@ static int handle_group_alt(struct objtool_file *file, + struct instruction *orig_insn, + struct instruction **new_insn) + { +- struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump; ++ struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump = NULL; + unsigned long dest_off; + + last_orig_insn = NULL; +@@ -614,28 +610,30 @@ static int handle_group_alt(struct objtool_file *file, + last_orig_insn = insn; + } + +- if (!next_insn_same_sec(file, last_orig_insn)) { +- WARN("%s: don't know how to handle alternatives at end of section", +- special_alt->orig_sec->name); +- return -1; +- } +- +- fake_jump = malloc(sizeof(*fake_jump)); +- if (!fake_jump) { +- WARN("malloc failed"); +- return -1; ++ if (next_insn_same_sec(file, last_orig_insn)) { ++ fake_jump = malloc(sizeof(*fake_jump)); ++ if (!fake_jump) { ++ WARN("malloc failed"); ++ return -1; ++ } ++ memset(fake_jump, 0, sizeof(*fake_jump)); ++ INIT_LIST_HEAD(&fake_jump->alts); ++ clear_insn_state(&fake_jump->state); ++ ++ fake_jump->sec = special_alt->new_sec; ++ fake_jump->offset = -1; ++ fake_jump->type = INSN_JUMP_UNCONDITIONAL; ++ fake_jump->jump_dest = list_next_entry(last_orig_insn, list); ++ fake_jump->ignore = true; + } +- memset(fake_jump, 0, sizeof(*fake_jump)); +- INIT_LIST_HEAD(&fake_jump->alts); +- 
clear_insn_state(&fake_jump->state); +- +- fake_jump->sec = special_alt->new_sec; +- fake_jump->offset = -1; +- fake_jump->type = INSN_JUMP_UNCONDITIONAL; +- fake_jump->jump_dest = list_next_entry(last_orig_insn, list); +- fake_jump->ignore = true; + + if (!special_alt->new_len) { ++ if (!fake_jump) { ++ WARN("%s: empty alternative at end of section", ++ special_alt->orig_sec->name); ++ return -1; ++ } ++ + *new_insn = fake_jump; + return 0; + } +@@ -648,6 +646,8 @@ static int handle_group_alt(struct objtool_file *file, + + last_new_insn = insn; + ++ insn->ignore = orig_insn->ignore_alts; ++ + if (insn->type != INSN_JUMP_CONDITIONAL && + insn->type != INSN_JUMP_UNCONDITIONAL) + continue; +@@ -656,8 +656,14 @@ static int handle_group_alt(struct objtool_file *file, + continue; + + dest_off = insn->offset + insn->len + insn->immediate; +- if (dest_off == special_alt->new_off + special_alt->new_len) ++ if (dest_off == special_alt->new_off + special_alt->new_len) { ++ if (!fake_jump) { ++ WARN("%s: alternative jump to end of section", ++ special_alt->orig_sec->name); ++ return -1; ++ } + insn->jump_dest = fake_jump; ++ } + + if (!insn->jump_dest) { + WARN_FUNC("can't find alternative jump destination", +@@ -672,7 +678,8 @@ static int handle_group_alt(struct objtool_file *file, + return -1; + } + +- list_add(&fake_jump->list, &last_new_insn->list); ++ if (fake_jump) ++ list_add(&fake_jump->list, &last_new_insn->list); + + return 0; + } +@@ -729,10 +736,6 @@ static int add_special_section_alts(struct objtool_file *file) + goto out; + } + +- /* Ignore retpoline alternatives. */ +- if (orig_insn->ignore_alts) +- continue; +- + new_insn = NULL; + if (!special_alt->group || special_alt->new_len) { + new_insn = find_insn(file, special_alt->new_sec, +@@ -1089,11 +1092,11 @@ static int decode_sections(struct objtool_file *file) + if (ret) + return ret; + +- ret = add_call_destinations(file); ++ ret = add_special_section_alts(file); + if (ret) + return ret; + +- ret = add_special_section_alts(file); ++ ret = add_call_destinations(file); + if (ret) + return ret; + +@@ -1720,10 +1723,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + + insn->visited = true; + +- list_for_each_entry(alt, &insn->alts, list) { +- ret = validate_branch(file, alt->insn, state); +- if (ret) +- return 1; ++ if (!insn->ignore_alts) { ++ list_for_each_entry(alt, &insn->alts, list) { ++ ret = validate_branch(file, alt->insn, state); ++ if (ret) ++ return 1; ++ } + } + + switch (insn->type) { +diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c +index e61fe703197b..18384d9be4e1 100644 +--- a/tools/objtool/orc_gen.c ++++ b/tools/objtool/orc_gen.c +@@ -98,6 +98,11 @@ static int create_orc_entry(struct section *u_sec, struct section *ip_relasec, + struct orc_entry *orc; + struct rela *rela; + ++ if (!insn_sec->sym) { ++ WARN("missing symbol for section %s", insn_sec->name); ++ return -1; ++ } ++ + /* populate ORC data */ + orc = (struct orc_entry *)u_sec->data->d_buf + idx; + memcpy(orc, o, sizeof(*orc)); |
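
A minimal userspace sketch of the nospec pattern introduced above: the branchless mask from the generic array_index_mask_nospec() fallback in include/linux/nospec.h, applied after an architectural bounds check the way the nl80211 hunk rewrites parse_txq_params(). The helper names (array_index_mask_nospec_demo, clamp_index), the NUM_ACS table size and the main() driver are illustrative stand-ins, not kernel code; only the mask expression itself follows the patch, and like the kernel's version it relies on an arithmetic right shift of a negative long.

        #include <stdio.h>

        #define BITS_PER_LONG (sizeof(unsigned long) * 8)

        /*
         * Same construction as the generic fallback added by this patch:
         * evaluates to ~0UL when index is within [0, size) and to 0UL
         * otherwise, using only arithmetic (no conditional branch), so
         * the result does not depend on branch prediction.
         */
        static unsigned long array_index_mask_nospec_demo(unsigned long index,
                                                          unsigned long size)
        {
                return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
        }

        /* Userspace stand-in for array_index_nospec(): clamps the index
         * to 0 if a CPU were to speculate past the bounds check with
         * index >= size. */
        static unsigned long clamp_index(unsigned long index, unsigned long size)
        {
                return index & array_index_mask_nospec_demo(index, size);
        }

        int main(void)
        {
                enum { NUM_ACS = 4 };   /* illustrative, standing in for NL80211_NUM_ACS */
                int txq[NUM_ACS] = { 10, 20, 30, 40 };
                unsigned long ac;

                for (ac = 0; ac < 6; ac++) {
                        if (ac >= NUM_ACS) {
                                printf("ac=%lu rejected by bounds check\n", ac);
                                continue;
                        }
                        /* As in parse_txq_params(): sanitize after the check,
                         * then use only the sanitized value for the access. */
                        unsigned long safe = clamp_index(ac, NUM_ACS);
                        printf("ac=%lu -> txq[%lu] = %d\n", ac, safe, txq[safe]);
                }
                return 0;
        }

The point of clamping in addition to checking is that the mask is computed from the data itself rather than from a predicted branch, so even a speculatively executed out-of-bounds access can only ever reach element 0 of the array instead of attacker-chosen memory.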