author | Mike Pagano <mpagano@gentoo.org> | 2018-12-05 14:42:14 -0500 |
---|---|---|
committer | Mike Pagano <mpagano@gentoo.org> | 2018-12-05 14:42:14 -0500 |
commit | a1249a08fe1aead9f7e3e0c0438a14d3c1487981 (patch) | |
tree | 39715dbb2dca6cad88e31ed7c5e001b34132f3e8 | |
parent | proj/linux-patches: Update existing patch for 4.14.85 (diff) | |
download | linux-patches-a1249a08fe1aead9f7e3e0c0438a14d3c1487981.tar.gz linux-patches-a1249a08fe1aead9f7e3e0c0438a14d3c1487981.tar.bz2 linux-patches-a1249a08fe1aead9f7e3e0c0438a14d3c1487981.zip |
proj/linux-patches: Linux patch 4.14.86 (4.14-93)
Signed-off-by: Mike Pagano <mpagano@gentoo.org>
-rw-r--r-- | 0000_README | 14 |
-rw-r--r-- | 1085_linux-4.14.86.patch | 7052 |
2 files changed, 7061 insertions, 5 deletions
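
The bulk of this release is the upstream backport of the Spectre v2 user-space mitigation work: the `spectre_v2_user=` boot option, conditional STIBP/IBPB in the context-switch paths, and the `PR_SPEC_INDIRECT_BRANCH` prctl documented in the `Documentation/userspace-api/spec_ctrl.rst` hunk below. As a minimal sketch of how a user-space task would exercise that new prctl — the `PR_*` constants are the ones listed in the patched spec_ctrl.rst; the error handling and output around them are illustrative only:

```c
/*
 * Minimal sketch of the PR_SPEC_INDIRECT_BRANCH prctl interface documented
 * in the spec_ctrl.rst hunk of this patch. The PR_SPEC_* constants come
 * from <linux/prctl.h> and are only present once this backport (or a newer
 * kernel's headers) is installed.
 */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	/* Query the current indirect branch speculation state of this task. */
	int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
	if (state < 0) {
		perror("PR_GET_SPECULATION_CTRL");
		return 1;
	}
	printf("indirect branch speculation state: 0x%x\n", (unsigned)state);

	/*
	 * Ask the kernel to disable indirect branch speculation (STIBP) for
	 * this task; per the patch, the setting is inherited on fork().
	 * Depending on the spectre_v2_user= mode this may be refused (EPERM)
	 * or already be in effect.
	 */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
		  PR_SPEC_DISABLE, 0, 0) < 0)
		perror("PR_SET_SPECULATION_CTRL");

	return 0;
}
```

Whether these calls take effect depends on the `spectre_v2_user=` mode described in the kernel-parameters.txt hunk: the prctl and seccomp modes toggle TIF_SPEC_IB per task, strict mode forces STIBP on regardless, and under the seccomp modes sandboxed threads additionally get PR_SPEC_FORCE_DISABLE applied for them (see arch_seccomp_spec_mitigate() in the bugs.c hunk).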
diff --git a/0000_README b/0000_README index b328a3b6..b0b15a3d 100644 --- a/0000_README +++ b/0000_README @@ -363,26 +363,30 @@ Patch: 1079_linux-4.14.80.patch From: http://www.kernel.org Desc: Linux 4.14.80 -Patch: 1080-4.14.81.patch +Patch: 1080_4.14.81.patch From: http://www.kernel.org Desc: Linux 4.14.81 -Patch: 1081-4.14.82.patch +Patch: 1081_4.14.82.patch From: http://www.kernel.org Desc: Linux 4.14.82 -Patch: 1082-4.14.83.patch +Patch: 1082_4.14.83.patch From: http://www.kernel.org Desc: Linux 4.14.83 -Patch: 1083-4.14.84.patch +Patch: 1083_4.14.84.patch From: http://www.kernel.org Desc: Linux 4.14.84 -Patch: 1084-4.14.85.patch +Patch: 1084_4.14.85.patch From: http://www.kernel.org Desc: Linux 4.14.85 +Patch: 1085_4.14.86.patch +From: http://www.kernel.org +Desc: Linux 4.14.86 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1085_linux-4.14.86.patch b/1085_linux-4.14.86.patch new file mode 100644 index 00000000..c1ec4d9e --- /dev/null +++ b/1085_linux-4.14.86.patch @@ -0,0 +1,7052 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 99a08722124d..5f3d58142600 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3994,9 +3994,13 @@ + + spectre_v2= [X86] Control mitigation of Spectre variant 2 + (indirect branch speculation) vulnerability. ++ The default operation protects the kernel from ++ user space attacks. + +- on - unconditionally enable +- off - unconditionally disable ++ on - unconditionally enable, implies ++ spectre_v2_user=on ++ off - unconditionally disable, implies ++ spectre_v2_user=off + auto - kernel detects whether your CPU model is + vulnerable + +@@ -4006,6 +4010,12 @@ + CONFIG_RETPOLINE configuration option, and the + compiler with which the kernel was built. + ++ Selecting 'on' will also enable the mitigation ++ against user space to user space task attacks. ++ ++ Selecting 'off' will disable both the kernel and ++ the user space protections. ++ + Specific mitigations can also be selected manually: + + retpoline - replace indirect branches +@@ -4015,6 +4025,48 @@ + Not specifying this option is equivalent to + spectre_v2=auto. + ++ spectre_v2_user= ++ [X86] Control mitigation of Spectre variant 2 ++ (indirect branch speculation) vulnerability between ++ user space tasks ++ ++ on - Unconditionally enable mitigations. Is ++ enforced by spectre_v2=on ++ ++ off - Unconditionally disable mitigations. Is ++ enforced by spectre_v2=off ++ ++ prctl - Indirect branch speculation is enabled, ++ but mitigation can be enabled via prctl ++ per thread. The mitigation control state ++ is inherited on fork. ++ ++ prctl,ibpb ++ - Like "prctl" above, but only STIBP is ++ controlled per thread. IBPB is issued ++ always when switching between different user ++ space processes. ++ ++ seccomp ++ - Same as "prctl" above, but all seccomp ++ threads will enable the mitigation unless ++ they explicitly opt out. ++ ++ seccomp,ibpb ++ - Like "seccomp" above, but only STIBP is ++ controlled per thread. IBPB is issued ++ always when switching between different ++ user space processes. ++ ++ auto - Kernel selects the mitigation depending on ++ the available CPU features and vulnerability. ++ ++ Default mitigation: ++ If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl" ++ ++ Not specifying this option is equivalent to ++ spectre_v2_user=auto. 
++ + spec_store_bypass_disable= + [HW] Control Speculative Store Bypass (SSB) Disable mitigation + (Speculative Store Bypass vulnerability) +diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst +index 32f3d55c54b7..c4dbe6f7cdae 100644 +--- a/Documentation/userspace-api/spec_ctrl.rst ++++ b/Documentation/userspace-api/spec_ctrl.rst +@@ -92,3 +92,12 @@ Speculation misfeature controls + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); ++ ++- PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes ++ (Mitigate Spectre V2 style attacks against user processes) ++ ++ Invocations: ++ * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0); ++ * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); ++ * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); ++ * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); +diff --git a/Makefile b/Makefile +index 58a248264090..572bd98d2344 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 4 + PATCHLEVEL = 14 +-SUBLEVEL = 85 ++SUBLEVEL = 86 + EXTRAVERSION = + NAME = Petit Gorille + +diff --git a/arch/arm/boot/dts/rk3288-veyron.dtsi b/arch/arm/boot/dts/rk3288-veyron.dtsi +index 6e5bd8974f22..679b839bb2eb 100644 +--- a/arch/arm/boot/dts/rk3288-veyron.dtsi ++++ b/arch/arm/boot/dts/rk3288-veyron.dtsi +@@ -47,7 +47,11 @@ + #include "rk3288.dtsi" + + / { +- memory@0 { ++ /* ++ * The default coreboot on veyron devices ignores memory@0 nodes ++ * and would instead create another memory node. ++ */ ++ memory { + device_type = "memory"; + reg = <0x0 0x0 0x0 0x80000000>; + }; +diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts +index 9a7486058455..eea7f8f070cf 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts ++++ b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts +@@ -130,7 +130,7 @@ + }; + + &pcie0 { +- ep-gpios = <&gpio4 RK_PC6 GPIO_ACTIVE_LOW>; ++ ep-gpios = <&gpio4 RK_PC6 GPIO_ACTIVE_HIGH>; + num-lanes = <4>; + pinctrl-names = "default"; + pinctrl-0 = <&pcie_clkreqn_cpm>; +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 2af0af33362a..4f393eb9745f 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -440,10 +440,6 @@ config RETPOLINE + branches. Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. + +- Without compiler support, at least indirect branches in assembler +- code are eliminated. Since this includes the syscall entry path, +- it is not entirely pointless. +- + config INTEL_RDT + bool "Intel Resource Director Technology support" + default n +@@ -959,13 +955,7 @@ config NR_CPUS + approximately eight kilobytes to the kernel image. + + config SCHED_SMT +- bool "SMT (Hyperthreading) scheduler support" +- depends on SMP +- ---help--- +- SMT scheduler support improves the CPU scheduler's decision making +- when dealing with Intel Pentium 4 chips with HyperThreading at a +- cost of slightly increased overhead in some places. If unsure say +- N here. 
++ def_bool y if SMP + + config SCHED_MC + def_bool y +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 1c4d012550ec..ce3658dd98e8 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -241,9 +241,10 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables + + # Avoid indirect branches in kernel to deal with Spectre + ifdef CONFIG_RETPOLINE +-ifneq ($(RETPOLINE_CFLAGS),) +- KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE ++ifeq ($(RETPOLINE_CFLAGS),) ++ $(error You are building kernel with non-retpoline compiler, please update your compiler.) + endif ++ KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) + endif + + archscripts: scripts_basic +diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c +index e5097dc85a06..7d12b0d1f359 100644 +--- a/arch/x86/events/core.c ++++ b/arch/x86/events/core.c +@@ -438,26 +438,6 @@ int x86_setup_perfctr(struct perf_event *event) + if (config == -1LL) + return -EINVAL; + +- /* +- * Branch tracing: +- */ +- if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && +- !attr->freq && hwc->sample_period == 1) { +- /* BTS is not supported by this architecture. */ +- if (!x86_pmu.bts_active) +- return -EOPNOTSUPP; +- +- /* BTS is currently only allowed for user-mode. */ +- if (!attr->exclude_kernel) +- return -EOPNOTSUPP; +- +- /* disallow bts if conflicting events are present */ +- if (x86_add_exclusive(x86_lbr_exclusive_lbr)) +- return -EBUSY; +- +- event->destroy = hw_perf_lbr_event_destroy; +- } +- + hwc->config |= config; + + return 0; +diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c +index 228732654cfe..7bb80151bfff 100644 +--- a/arch/x86/events/intel/core.c ++++ b/arch/x86/events/intel/core.c +@@ -2345,16 +2345,7 @@ done: + static struct event_constraint * + intel_bts_constraints(struct perf_event *event) + { +- struct hw_perf_event *hwc = &event->hw; +- unsigned int hw_event, bts_event; +- +- if (event->attr.freq) +- return NULL; +- +- hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; +- bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); +- +- if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) ++ if (unlikely(intel_pmu_has_bts(event))) + return &bts_constraint; + + return NULL; +@@ -2973,10 +2964,47 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) + return flags; + } + ++static int intel_pmu_bts_config(struct perf_event *event) ++{ ++ struct perf_event_attr *attr = &event->attr; ++ ++ if (unlikely(intel_pmu_has_bts(event))) { ++ /* BTS is not supported by this architecture. */ ++ if (!x86_pmu.bts_active) ++ return -EOPNOTSUPP; ++ ++ /* BTS is currently only allowed for user-mode. 
*/ ++ if (!attr->exclude_kernel) ++ return -EOPNOTSUPP; ++ ++ /* disallow bts if conflicting events are present */ ++ if (x86_add_exclusive(x86_lbr_exclusive_lbr)) ++ return -EBUSY; ++ ++ event->destroy = hw_perf_lbr_event_destroy; ++ } ++ ++ return 0; ++} ++ ++static int core_pmu_hw_config(struct perf_event *event) ++{ ++ int ret = x86_pmu_hw_config(event); ++ ++ if (ret) ++ return ret; ++ ++ return intel_pmu_bts_config(event); ++} ++ + static int intel_pmu_hw_config(struct perf_event *event) + { + int ret = x86_pmu_hw_config(event); + ++ if (ret) ++ return ret; ++ ++ ret = intel_pmu_bts_config(event); + if (ret) + return ret; + +@@ -2999,7 +3027,7 @@ static int intel_pmu_hw_config(struct perf_event *event) + /* + * BTS is set up earlier in this path, so don't account twice + */ +- if (!intel_pmu_has_bts(event)) { ++ if (!unlikely(intel_pmu_has_bts(event))) { + /* disallow lbr if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; +@@ -3462,7 +3490,7 @@ static __initconst const struct x86_pmu core_pmu = { + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, + .disable = x86_pmu_disable_event, +- .hw_config = x86_pmu_hw_config, ++ .hw_config = core_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, +diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h +index c6698c63c047..3c51fcaf1e34 100644 +--- a/arch/x86/events/perf_event.h ++++ b/arch/x86/events/perf_event.h +@@ -850,11 +850,16 @@ static inline int amd_pmu_init(void) + + static inline bool intel_pmu_has_bts(struct perf_event *event) + { +- if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && +- !event->attr.freq && event->hw.sample_period == 1) +- return true; ++ struct hw_perf_event *hwc = &event->hw; ++ unsigned int hw_event, bts_event; ++ ++ if (event->attr.freq) ++ return false; ++ ++ hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; ++ bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + +- return false; ++ return hw_event == bts_event && hwc->sample_period == 1; + } + + int intel_pmu_save_and_restart(struct perf_event *event); +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 673d6e988196..7d910827126b 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -284,7 +284,9 @@ + #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ + #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ + #define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ ++#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ + #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ ++#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. 
*/ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index ef7eec669a1b..62c62d3eb0ff 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -41,9 +41,10 @@ + + #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ + #define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ +-#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ ++#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ ++#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ + #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ +-#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ ++#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ + + #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ + #define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index 1b4132161c1f..a633767419f2 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -3,6 +3,8 @@ + #ifndef _ASM_X86_NOSPEC_BRANCH_H_ + #define _ASM_X86_NOSPEC_BRANCH_H_ + ++#include <linux/static_key.h> ++ + #include <asm/alternative.h> + #include <asm/alternative-asm.h> + #include <asm/cpufeatures.h> +@@ -162,29 +164,35 @@ + _ASM_PTR " 999b\n\t" \ + ".popsection\n\t" + +-#if defined(CONFIG_X86_64) && defined(RETPOLINE) ++#ifdef CONFIG_RETPOLINE ++#ifdef CONFIG_X86_64 + + /* +- * Since the inline asm uses the %V modifier which is only in newer GCC, +- * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. ++ * Inline asm uses the %V modifier which is only in newer GCC ++ * which is ensured when CONFIG_RETPOLINE is defined. + */ + # define CALL_NOSPEC \ + ANNOTATE_NOSPEC_ALTERNATIVE \ +- ALTERNATIVE( \ ++ ALTERNATIVE_2( \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ +- X86_FEATURE_RETPOLINE) ++ X86_FEATURE_RETPOLINE, \ ++ "lfence;\n" \ ++ ANNOTATE_RETPOLINE_SAFE \ ++ "call *%[thunk_target]\n", \ ++ X86_FEATURE_RETPOLINE_AMD) + # define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +-#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) ++#else /* CONFIG_X86_32 */ + /* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. 
+ */ + # define CALL_NOSPEC \ +- ALTERNATIVE( \ ++ ANNOTATE_NOSPEC_ALTERNATIVE \ ++ ALTERNATIVE_2( \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ + " jmp 904f;\n" \ +@@ -199,9 +207,14 @@ + " ret;\n" \ + " .align 16\n" \ + "904: call 901b;\n", \ +- X86_FEATURE_RETPOLINE) ++ X86_FEATURE_RETPOLINE, \ ++ "lfence;\n" \ ++ ANNOTATE_RETPOLINE_SAFE \ ++ "call *%[thunk_target]\n", \ ++ X86_FEATURE_RETPOLINE_AMD) + + # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) ++#endif + #else /* No retpoline for C / inline asm */ + # define CALL_NOSPEC "call *%[thunk_target]\n" + # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +@@ -210,14 +223,19 @@ + /* The Spectre V2 mitigation variants */ + enum spectre_v2_mitigation { + SPECTRE_V2_NONE, +- SPECTRE_V2_RETPOLINE_MINIMAL, +- SPECTRE_V2_RETPOLINE_MINIMAL_AMD, + SPECTRE_V2_RETPOLINE_GENERIC, + SPECTRE_V2_RETPOLINE_AMD, +- SPECTRE_V2_IBRS, + SPECTRE_V2_IBRS_ENHANCED, + }; + ++/* The indirect branch speculation control variants */ ++enum spectre_v2_user_mitigation { ++ SPECTRE_V2_USER_NONE, ++ SPECTRE_V2_USER_STRICT, ++ SPECTRE_V2_USER_PRCTL, ++ SPECTRE_V2_USER_SECCOMP, ++}; ++ + /* The Speculative Store Bypass disable variants */ + enum ssb_mitigation { + SPEC_STORE_BYPASS_NONE, +@@ -295,6 +313,10 @@ do { \ + preempt_enable(); \ + } while (0) + ++DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); ++DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); ++DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); ++ + #endif /* __ASSEMBLY__ */ + + /* +diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h +index ae7c2c5cd7f0..5393babc0598 100644 +--- a/arch/x86/include/asm/spec-ctrl.h ++++ b/arch/x86/include/asm/spec-ctrl.h +@@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) + return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); + } + ++static inline u64 stibp_tif_to_spec_ctrl(u64 tifn) ++{ ++ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); ++ return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); ++} ++ + static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) + { + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); + } + ++static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl) ++{ ++ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); ++ return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); ++} ++ + static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) + { + return (tifn & _TIF_SSBD) ? 
x86_amd_ls_cfg_ssbd_mask : 0ULL; +@@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void); + static inline void speculative_store_bypass_ht_init(void) { } + #endif + +-extern void speculative_store_bypass_update(unsigned long tif); +- +-static inline void speculative_store_bypass_update_current(void) +-{ +- speculative_store_bypass_update(current_thread_info()->flags); +-} ++extern void speculation_ctrl_update(unsigned long tif); ++extern void speculation_ctrl_update_current(void); + + #endif +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index 9b6df68d8fd1..12ef2b49d11b 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -11,9 +11,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, + + __visible struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +-struct tss_struct; +-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, +- struct tss_struct *tss); + + /* This runs runs on the previous thread's stack. */ + static inline void prepare_switch_to(struct task_struct *prev, +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index 95ff2d7f553f..bf9175d87844 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -81,10 +81,12 @@ struct thread_info { + #define TIF_SIGPENDING 2 /* signal pending */ + #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ + #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +-#define TIF_SSBD 5 /* Reduced data speculation */ ++#define TIF_SSBD 5 /* Speculative store bypass disable */ + #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ + #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ + #define TIF_SECCOMP 8 /* secure computing */ ++#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ ++#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ + #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ + #define TIF_UPROBE 12 /* breakpointed or singlestepping */ + #define TIF_PATCH_PENDING 13 /* pending live patching update */ +@@ -112,6 +114,8 @@ struct thread_info { + #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) + #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) ++#define _TIF_SPEC_IB (1 << TIF_SPEC_IB) ++#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) + #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) + #define _TIF_UPROBE (1 << TIF_UPROBE) + #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) +@@ -147,8 +151,18 @@ struct thread_info { + _TIF_FSCHECK) + + /* flags to check in __switch_to() */ +-#define _TIF_WORK_CTXSW \ +- (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD) ++#define _TIF_WORK_CTXSW_BASE \ ++ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \ ++ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) ++ ++/* ++ * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. 
++ */ ++#ifdef CONFIG_SMP ++# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB) ++#else ++# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE) ++#endif + + #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) + #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 2501be609b82..e31040333f0c 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -185,10 +185,14 @@ struct tlb_state { + + #define LOADED_MM_SWITCHING ((struct mm_struct *)1) + ++ /* Last user mm for optimizing IBPB */ ++ union { ++ struct mm_struct *last_user_mm; ++ unsigned long last_user_mm_ibpb; ++ }; ++ + u16 loaded_mm_asid; + u16 next_asid; +- /* last user mm's ctx id */ +- u64 last_ctx_id; + + /* + * We can be in one of several states: +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index dda741bd5789..7e03515662c0 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -554,7 +554,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) + nodes_per_socket = ((value >> 3) & 7) + 1; + } + +- if (c->x86 >= 0x15 && c->x86 <= 0x17) { ++ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && ++ !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && ++ c->x86 >= 0x15 && c->x86 <= 0x17) { + unsigned int bit; + + switch (c->x86) { +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index e92aedd93806..f7a6d6203e13 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -14,6 +14,7 @@ + #include <linux/module.h> + #include <linux/nospec.h> + #include <linux/prctl.h> ++#include <linux/sched/smt.h> + + #include <asm/spec-ctrl.h> + #include <asm/cmdline.h> +@@ -34,12 +35,10 @@ static void __init spectre_v2_select_mitigation(void); + static void __init ssb_select_mitigation(void); + static void __init l1tf_select_mitigation(void); + +-/* +- * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any +- * writes to SPEC_CTRL contain whatever reserved bits have been set. +- */ +-u64 __ro_after_init x86_spec_ctrl_base; ++/* The base value of the SPEC_CTRL MSR that always has to be preserved. 
*/ ++u64 x86_spec_ctrl_base; + EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); ++static DEFINE_MUTEX(spec_ctrl_mutex); + + /* + * The vendor and possibly platform specific bits which can be modified in +@@ -54,6 +53,13 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; + u64 __ro_after_init x86_amd_ls_cfg_base; + u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; + ++/* Control conditional STIPB in switch_to() */ ++DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); ++/* Control conditional IBPB in switch_mm() */ ++DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); ++/* Control unconditional IBPB in switch_mm() */ ++DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); ++ + void __init check_bugs(void) + { + identify_boot_cpu(); +@@ -124,31 +130,6 @@ void __init check_bugs(void) + #endif + } + +-/* The kernel command line selection */ +-enum spectre_v2_mitigation_cmd { +- SPECTRE_V2_CMD_NONE, +- SPECTRE_V2_CMD_AUTO, +- SPECTRE_V2_CMD_FORCE, +- SPECTRE_V2_CMD_RETPOLINE, +- SPECTRE_V2_CMD_RETPOLINE_GENERIC, +- SPECTRE_V2_CMD_RETPOLINE_AMD, +-}; +- +-static const char *spectre_v2_strings[] = { +- [SPECTRE_V2_NONE] = "Vulnerable", +- [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", +- [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", +- [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", +- [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", +- [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", +-}; +- +-#undef pr_fmt +-#define pr_fmt(fmt) "Spectre V2 : " fmt +- +-static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = +- SPECTRE_V2_NONE; +- + void + x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) + { +@@ -166,9 +147,14 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) + guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; + + /* SSBD controlled in MSR_SPEC_CTRL */ +- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) ++ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || ++ static_cpu_has(X86_FEATURE_AMD_SSBD)) + hostval |= ssbd_tif_to_spec_ctrl(ti->flags); + ++ /* Conditional STIBP enabled? */ ++ if (static_branch_unlikely(&switch_to_cond_stibp)) ++ hostval |= stibp_tif_to_spec_ctrl(ti->flags); ++ + if (hostval != guestval) { + msrval = setguest ? guestval : hostval; + wrmsrl(MSR_IA32_SPEC_CTRL, msrval); +@@ -202,7 +188,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) + tif = setguest ? 
ssbd_spec_ctrl_to_tif(guestval) : + ssbd_spec_ctrl_to_tif(hostval); + +- speculative_store_bypass_update(tif); ++ speculation_ctrl_update(tif); + } + } + EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); +@@ -217,6 +203,15 @@ static void x86_amd_ssb_disable(void) + wrmsrl(MSR_AMD64_LS_CFG, msrval); + } + ++#undef pr_fmt ++#define pr_fmt(fmt) "Spectre V2 : " fmt ++ ++static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = ++ SPECTRE_V2_NONE; ++ ++static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init = ++ SPECTRE_V2_USER_NONE; ++ + #ifdef RETPOLINE + static bool spectre_v2_bad_module; + +@@ -238,67 +233,217 @@ static inline const char *spectre_v2_module_string(void) + static inline const char *spectre_v2_module_string(void) { return ""; } + #endif + +-static void __init spec2_print_if_insecure(const char *reason) ++static inline bool match_option(const char *arg, int arglen, const char *opt) + { +- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +- pr_info("%s selected on command line.\n", reason); ++ int len = strlen(opt); ++ ++ return len == arglen && !strncmp(arg, opt, len); + } + +-static void __init spec2_print_if_secure(const char *reason) ++/* The kernel command line selection for spectre v2 */ ++enum spectre_v2_mitigation_cmd { ++ SPECTRE_V2_CMD_NONE, ++ SPECTRE_V2_CMD_AUTO, ++ SPECTRE_V2_CMD_FORCE, ++ SPECTRE_V2_CMD_RETPOLINE, ++ SPECTRE_V2_CMD_RETPOLINE_GENERIC, ++ SPECTRE_V2_CMD_RETPOLINE_AMD, ++}; ++ ++enum spectre_v2_user_cmd { ++ SPECTRE_V2_USER_CMD_NONE, ++ SPECTRE_V2_USER_CMD_AUTO, ++ SPECTRE_V2_USER_CMD_FORCE, ++ SPECTRE_V2_USER_CMD_PRCTL, ++ SPECTRE_V2_USER_CMD_PRCTL_IBPB, ++ SPECTRE_V2_USER_CMD_SECCOMP, ++ SPECTRE_V2_USER_CMD_SECCOMP_IBPB, ++}; ++ ++static const char * const spectre_v2_user_strings[] = { ++ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", ++ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", ++ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", ++ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", ++}; ++ ++static const struct { ++ const char *option; ++ enum spectre_v2_user_cmd cmd; ++ bool secure; ++} v2_user_options[] __initdata = { ++ { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, ++ { "off", SPECTRE_V2_USER_CMD_NONE, false }, ++ { "on", SPECTRE_V2_USER_CMD_FORCE, true }, ++ { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, ++ { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, ++ { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, ++ { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, ++}; ++ ++static void __init spec_v2_user_print_cond(const char *reason, bool secure) + { +- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +- pr_info("%s selected on command line.\n", reason); ++ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) ++ pr_info("spectre_v2_user=%s forced on command line.\n", reason); + } + +-static inline bool retp_compiler(void) ++static enum spectre_v2_user_cmd __init ++spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) + { +- return __is_defined(RETPOLINE); ++ char arg[20]; ++ int ret, i; ++ ++ switch (v2_cmd) { ++ case SPECTRE_V2_CMD_NONE: ++ return SPECTRE_V2_USER_CMD_NONE; ++ case SPECTRE_V2_CMD_FORCE: ++ return SPECTRE_V2_USER_CMD_FORCE; ++ default: ++ break; ++ } ++ ++ ret = cmdline_find_option(boot_command_line, "spectre_v2_user", ++ arg, sizeof(arg)); ++ if (ret < 0) ++ return SPECTRE_V2_USER_CMD_AUTO; ++ ++ for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { ++ if (match_option(arg, ret, v2_user_options[i].option)) { ++ 
spec_v2_user_print_cond(v2_user_options[i].option, ++ v2_user_options[i].secure); ++ return v2_user_options[i].cmd; ++ } ++ } ++ ++ pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); ++ return SPECTRE_V2_USER_CMD_AUTO; + } + +-static inline bool match_option(const char *arg, int arglen, const char *opt) ++static void __init ++spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) + { +- int len = strlen(opt); ++ enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; ++ bool smt_possible = IS_ENABLED(CONFIG_SMP); ++ enum spectre_v2_user_cmd cmd; + +- return len == arglen && !strncmp(arg, opt, len); ++ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) ++ return; ++ ++ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || ++ cpu_smt_control == CPU_SMT_NOT_SUPPORTED) ++ smt_possible = false; ++ ++ cmd = spectre_v2_parse_user_cmdline(v2_cmd); ++ switch (cmd) { ++ case SPECTRE_V2_USER_CMD_NONE: ++ goto set_mode; ++ case SPECTRE_V2_USER_CMD_FORCE: ++ mode = SPECTRE_V2_USER_STRICT; ++ break; ++ case SPECTRE_V2_USER_CMD_PRCTL: ++ case SPECTRE_V2_USER_CMD_PRCTL_IBPB: ++ mode = SPECTRE_V2_USER_PRCTL; ++ break; ++ case SPECTRE_V2_USER_CMD_AUTO: ++ case SPECTRE_V2_USER_CMD_SECCOMP: ++ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: ++ if (IS_ENABLED(CONFIG_SECCOMP)) ++ mode = SPECTRE_V2_USER_SECCOMP; ++ else ++ mode = SPECTRE_V2_USER_PRCTL; ++ break; ++ } ++ ++ /* Initialize Indirect Branch Prediction Barrier */ ++ if (boot_cpu_has(X86_FEATURE_IBPB)) { ++ setup_force_cpu_cap(X86_FEATURE_USE_IBPB); ++ ++ switch (cmd) { ++ case SPECTRE_V2_USER_CMD_FORCE: ++ case SPECTRE_V2_USER_CMD_PRCTL_IBPB: ++ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: ++ static_branch_enable(&switch_mm_always_ibpb); ++ break; ++ case SPECTRE_V2_USER_CMD_PRCTL: ++ case SPECTRE_V2_USER_CMD_AUTO: ++ case SPECTRE_V2_USER_CMD_SECCOMP: ++ static_branch_enable(&switch_mm_cond_ibpb); ++ break; ++ default: ++ break; ++ } ++ ++ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", ++ static_key_enabled(&switch_mm_always_ibpb) ? ++ "always-on" : "conditional"); ++ } ++ ++ /* If enhanced IBRS is enabled no STIPB required */ ++ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) ++ return; ++ ++ /* ++ * If SMT is not possible or STIBP is not available clear the STIPB ++ * mode. 
++ */ ++ if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) ++ mode = SPECTRE_V2_USER_NONE; ++set_mode: ++ spectre_v2_user = mode; ++ /* Only print the STIBP mode when SMT possible */ ++ if (smt_possible) ++ pr_info("%s\n", spectre_v2_user_strings[mode]); + } + ++static const char * const spectre_v2_strings[] = { ++ [SPECTRE_V2_NONE] = "Vulnerable", ++ [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", ++ [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", ++ [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", ++}; ++ + static const struct { + const char *option; + enum spectre_v2_mitigation_cmd cmd; + bool secure; +-} mitigation_options[] = { +- { "off", SPECTRE_V2_CMD_NONE, false }, +- { "on", SPECTRE_V2_CMD_FORCE, true }, +- { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, +- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, +- { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, +- { "auto", SPECTRE_V2_CMD_AUTO, false }, ++} mitigation_options[] __initdata = { ++ { "off", SPECTRE_V2_CMD_NONE, false }, ++ { "on", SPECTRE_V2_CMD_FORCE, true }, ++ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, ++ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, ++ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, ++ { "auto", SPECTRE_V2_CMD_AUTO, false }, + }; + ++static void __init spec_v2_print_cond(const char *reason, bool secure) ++{ ++ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) ++ pr_info("%s selected on command line.\n", reason); ++} ++ + static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) + { ++ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + char arg[20]; + int ret, i; +- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_NONE; +- else { +- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); +- if (ret < 0) +- return SPECTRE_V2_CMD_AUTO; + +- for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { +- if (!match_option(arg, ret, mitigation_options[i].option)) +- continue; +- cmd = mitigation_options[i].cmd; +- break; +- } ++ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); ++ if (ret < 0) ++ return SPECTRE_V2_CMD_AUTO; + +- if (i >= ARRAY_SIZE(mitigation_options)) { +- pr_err("unknown option (%s). Switching to AUTO select\n", arg); +- return SPECTRE_V2_CMD_AUTO; +- } ++ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { ++ if (!match_option(arg, ret, mitigation_options[i].option)) ++ continue; ++ cmd = mitigation_options[i].cmd; ++ break; ++ } ++ ++ if (i >= ARRAY_SIZE(mitigation_options)) { ++ pr_err("unknown option (%s). Switching to AUTO select\n", arg); ++ return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE || +@@ -315,11 +460,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) + return SPECTRE_V2_CMD_AUTO; + } + +- if (mitigation_options[i].secure) +- spec2_print_if_secure(mitigation_options[i].option); +- else +- spec2_print_if_insecure(mitigation_options[i].option); +- ++ spec_v2_print_cond(mitigation_options[i].option, ++ mitigation_options[i].secure); + return cmd; + } + +@@ -375,14 +517,12 @@ retpoline_auto: + pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); + goto retpoline_generic; + } +- mode = retp_compiler() ? 
SPECTRE_V2_RETPOLINE_AMD : +- SPECTRE_V2_RETPOLINE_MINIMAL_AMD; ++ mode = SPECTRE_V2_RETPOLINE_AMD; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } else { + retpoline_generic: +- mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : +- SPECTRE_V2_RETPOLINE_MINIMAL; ++ mode = SPECTRE_V2_RETPOLINE_GENERIC; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } + +@@ -401,12 +541,6 @@ specv2_set_mode: + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); + +- /* Initialize Indirect Branch Prediction Barrier if supported */ +- if (boot_cpu_has(X86_FEATURE_IBPB)) { +- setup_force_cpu_cap(X86_FEATURE_USE_IBPB); +- pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); +- } +- + /* + * Retpoline means the kernel is safe because it has no indirect + * branches. Enhanced IBRS protects firmware too, so, enable restricted +@@ -422,6 +556,66 @@ specv2_set_mode: + setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); + pr_info("Enabling Restricted Speculation for firmware calls\n"); + } ++ ++ /* Set up IBPB and STIBP depending on the general spectre V2 command */ ++ spectre_v2_user_select_mitigation(cmd); ++ ++ /* Enable STIBP if appropriate */ ++ arch_smt_update(); ++} ++ ++static void update_stibp_msr(void * __unused) ++{ ++ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); ++} ++ ++/* Update x86_spec_ctrl_base in case SMT state changed. */ ++static void update_stibp_strict(void) ++{ ++ u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; ++ ++ if (sched_smt_active()) ++ mask |= SPEC_CTRL_STIBP; ++ ++ if (mask == x86_spec_ctrl_base) ++ return; ++ ++ pr_info("Update user space SMT mitigation: STIBP %s\n", ++ mask & SPEC_CTRL_STIBP ? "always-on" : "off"); ++ x86_spec_ctrl_base = mask; ++ on_each_cpu(update_stibp_msr, NULL, 1); ++} ++ ++/* Update the static key controlling the evaluation of TIF_SPEC_IB */ ++static void update_indir_branch_cond(void) ++{ ++ if (sched_smt_active()) ++ static_branch_enable(&switch_to_cond_stibp); ++ else ++ static_branch_disable(&switch_to_cond_stibp); ++} ++ ++void arch_smt_update(void) ++{ ++ /* Enhanced IBRS implies STIBP. No update required. 
*/ ++ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) ++ return; ++ ++ mutex_lock(&spec_ctrl_mutex); ++ ++ switch (spectre_v2_user) { ++ case SPECTRE_V2_USER_NONE: ++ break; ++ case SPECTRE_V2_USER_STRICT: ++ update_stibp_strict(); ++ break; ++ case SPECTRE_V2_USER_PRCTL: ++ case SPECTRE_V2_USER_SECCOMP: ++ update_indir_branch_cond(); ++ break; ++ } ++ ++ mutex_unlock(&spec_ctrl_mutex); + } + + #undef pr_fmt +@@ -438,7 +632,7 @@ enum ssb_mitigation_cmd { + SPEC_STORE_BYPASS_CMD_SECCOMP, + }; + +-static const char *ssb_strings[] = { ++static const char * const ssb_strings[] = { + [SPEC_STORE_BYPASS_NONE] = "Vulnerable", + [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", + [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", +@@ -448,7 +642,7 @@ static const char *ssb_strings[] = { + static const struct { + const char *option; + enum ssb_mitigation_cmd cmd; +-} ssb_mitigation_options[] = { ++} ssb_mitigation_options[] __initdata = { + { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ + { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ + { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ +@@ -532,18 +726,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) + if (mode == SPEC_STORE_BYPASS_DISABLE) { + setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); + /* +- * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses +- * a completely different MSR and bit dependent on family. ++ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may ++ * use a completely different MSR and bit dependent on family. + */ +- switch (boot_cpu_data.x86_vendor) { +- case X86_VENDOR_INTEL: ++ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && ++ !static_cpu_has(X86_FEATURE_AMD_SSBD)) { ++ x86_amd_ssb_disable(); ++ } else { + x86_spec_ctrl_base |= SPEC_CTRL_SSBD; + x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); +- break; +- case X86_VENDOR_AMD: +- x86_amd_ssb_disable(); +- break; + } + } + +@@ -561,10 +753,25 @@ static void ssb_select_mitigation(void) + #undef pr_fmt + #define pr_fmt(fmt) "Speculation prctl: " fmt + +-static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) ++static void task_update_spec_tif(struct task_struct *tsk) + { +- bool update; ++ /* Force the update of the real TIF bits */ ++ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE); + ++ /* ++ * Immediately update the speculation control MSRs for the current ++ * task, but for a non-current task delay setting the CPU ++ * mitigation until it is scheduled next. ++ * ++ * This can only happen for SECCOMP mitigation. For PRCTL it's ++ * always the current task. 
++ */ ++ if (tsk == current) ++ speculation_ctrl_update_current(); ++} ++ ++static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) ++{ + if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && + ssb_mode != SPEC_STORE_BYPASS_SECCOMP) + return -ENXIO; +@@ -575,28 +782,56 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_clear_spec_ssb_disable(task); +- update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); ++ task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE: + task_set_spec_ssb_disable(task); +- update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); ++ task_update_spec_tif(task); + break; + case PR_SPEC_FORCE_DISABLE: + task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); +- update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); ++ task_update_spec_tif(task); + break; + default: + return -ERANGE; + } ++ return 0; ++} + +- /* +- * If being set on non-current task, delay setting the CPU +- * mitigation until it is next scheduled. +- */ +- if (task == current && update) +- speculative_store_bypass_update_current(); +- ++static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) ++{ ++ switch (ctrl) { ++ case PR_SPEC_ENABLE: ++ if (spectre_v2_user == SPECTRE_V2_USER_NONE) ++ return 0; ++ /* ++ * Indirect branch speculation is always disabled in strict ++ * mode. ++ */ ++ if (spectre_v2_user == SPECTRE_V2_USER_STRICT) ++ return -EPERM; ++ task_clear_spec_ib_disable(task); ++ task_update_spec_tif(task); ++ break; ++ case PR_SPEC_DISABLE: ++ case PR_SPEC_FORCE_DISABLE: ++ /* ++ * Indirect branch speculation is always allowed when ++ * mitigation is force disabled. ++ */ ++ if (spectre_v2_user == SPECTRE_V2_USER_NONE) ++ return -EPERM; ++ if (spectre_v2_user == SPECTRE_V2_USER_STRICT) ++ return 0; ++ task_set_spec_ib_disable(task); ++ if (ctrl == PR_SPEC_FORCE_DISABLE) ++ task_set_spec_ib_force_disable(task); ++ task_update_spec_tif(task); ++ break; ++ default: ++ return -ERANGE; ++ } + return 0; + } + +@@ -606,6 +841,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_set(task, ctrl); ++ case PR_SPEC_INDIRECT_BRANCH: ++ return ib_prctl_set(task, ctrl); + default: + return -ENODEV; + } +@@ -616,6 +853,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) + { + if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) + ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); ++ if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) ++ ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); + } + #endif + +@@ -638,11 +877,35 @@ static int ssb_prctl_get(struct task_struct *task) + } + } + ++static int ib_prctl_get(struct task_struct *task) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) ++ return PR_SPEC_NOT_AFFECTED; ++ ++ switch (spectre_v2_user) { ++ case SPECTRE_V2_USER_NONE: ++ return PR_SPEC_ENABLE; ++ case SPECTRE_V2_USER_PRCTL: ++ case SPECTRE_V2_USER_SECCOMP: ++ if (task_spec_ib_force_disable(task)) ++ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; ++ if (task_spec_ib_disable(task)) ++ return PR_SPEC_PRCTL | PR_SPEC_DISABLE; ++ return PR_SPEC_PRCTL | PR_SPEC_ENABLE; ++ case SPECTRE_V2_USER_STRICT: ++ return PR_SPEC_DISABLE; ++ default: ++ return PR_SPEC_NOT_AFFECTED; ++ } ++} ++ + int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) + { + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_get(task); ++ case PR_SPEC_INDIRECT_BRANCH: ++ return ib_prctl_get(task); 
+ default: + return -ENODEV; + } +@@ -780,7 +1043,7 @@ early_param("l1tf", l1tf_cmdline); + #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" + + #if IS_ENABLED(CONFIG_KVM_INTEL) +-static const char *l1tf_vmx_states[] = { ++static const char * const l1tf_vmx_states[] = { + [VMENTER_L1D_FLUSH_AUTO] = "auto", + [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", + [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", +@@ -796,13 +1059,14 @@ static ssize_t l1tf_show_state(char *buf) + + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || + (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && +- cpu_smt_control == CPU_SMT_ENABLED)) ++ sched_smt_active())) { + return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); ++ } + + return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], +- cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled"); ++ sched_smt_active() ? "vulnerable" : "disabled"); + } + #else + static ssize_t l1tf_show_state(char *buf) +@@ -811,6 +1075,36 @@ static ssize_t l1tf_show_state(char *buf) + } + #endif + ++static char *stibp_state(void) ++{ ++ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) ++ return ""; ++ ++ switch (spectre_v2_user) { ++ case SPECTRE_V2_USER_NONE: ++ return ", STIBP: disabled"; ++ case SPECTRE_V2_USER_STRICT: ++ return ", STIBP: forced"; ++ case SPECTRE_V2_USER_PRCTL: ++ case SPECTRE_V2_USER_SECCOMP: ++ if (static_key_enabled(&switch_to_cond_stibp)) ++ return ", STIBP: conditional"; ++ } ++ return ""; ++} ++ ++static char *ibpb_state(void) ++{ ++ if (boot_cpu_has(X86_FEATURE_IBPB)) { ++ if (static_key_enabled(&switch_mm_always_ibpb)) ++ return ", IBPB: always-on"; ++ if (static_key_enabled(&switch_mm_cond_ibpb)) ++ return ", IBPB: conditional"; ++ return ", IBPB: disabled"; ++ } ++ return ""; ++} ++ + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, + char *buf, unsigned int bug) + { +@@ -828,9 +1122,11 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + + case X86_BUG_SPECTRE_V2: +- return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], +- boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", ++ return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ++ ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", ++ stibp_state(), ++ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
", RSB filling" : "", + spectre_v2_module_string()); + + case X86_BUG_SPEC_STORE_BYPASS: +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 96643e2c75b8..51e49f6fe8e1 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -760,6 +760,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c) + set_cpu_cap(c, X86_FEATURE_STIBP); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } ++ ++ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) { ++ set_cpu_cap(c, X86_FEATURE_SSBD); ++ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); ++ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD); ++ } + } + + void get_cpu_cap(struct cpuinfo_x86 *c) +@@ -958,7 +964,8 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + if (!x86_match_cpu(cpu_no_spec_store_bypass) && +- !(ia32_cap & ARCH_CAP_SSB_NO)) ++ !(ia32_cap & ARCH_CAP_SSB_NO) && ++ !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + + if (x86_match_cpu(cpu_no_speculation)) +diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c +index dbcb01006749..beec0daecbc5 100644 +--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c ++++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c +@@ -56,7 +56,7 @@ + /* Threshold LVT offset is at MSR0xC0000410[15:12] */ + #define SMCA_THR_LVT_OFF 0xF000 + +-static bool thresholding_en; ++static bool thresholding_irq_en; + + static const char * const th_names[] = { + "load_store", +@@ -533,9 +533,8 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + + set_offset: + offset = setup_APIC_mce_threshold(offset, new); +- +- if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) +- mce_threshold_vector = amd_threshold_interrupt; ++ if (offset == new) ++ thresholding_irq_en = true; + + done: + mce_threshold_block_init(&b, offset); +@@ -1356,9 +1355,6 @@ int mce_threshold_remove_device(unsigned int cpu) + { + unsigned int bank; + +- if (!thresholding_en) +- return 0; +- + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; +@@ -1376,9 +1372,6 @@ int mce_threshold_create_device(unsigned int cpu) + struct threshold_bank **bp; + int err = 0; + +- if (!thresholding_en) +- return 0; +- + bp = per_cpu(threshold_banks, cpu); + if (bp) + return 0; +@@ -1407,9 +1400,6 @@ static __init int threshold_init_device(void) + { + unsigned lcpu = 0; + +- if (mce_threshold_vector == amd_threshold_interrupt) +- thresholding_en = true; +- + /* to hit CPUs online before the notifier is up */ + for_each_online_cpu(lcpu) { + int err = mce_threshold_create_device(lcpu); +@@ -1418,6 +1408,9 @@ static __init int threshold_init_device(void) + return err; + } + ++ if (thresholding_irq_en) ++ mce_threshold_vector = amd_threshold_interrupt; ++ + return 0; + } + /* +diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c +index 61a949d84dfa..d99a8ee9e185 100644 +--- a/arch/x86/kernel/fpu/signal.c ++++ b/arch/x86/kernel/fpu/signal.c +@@ -344,10 +344,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) + sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + } + ++ local_bh_disable(); + fpu->initialized = 1; +- preempt_disable(); + fpu__restore(fpu); +- preempt_enable(); ++ local_bh_enable(); + + return err; + } else { +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 988a98f34c66..a98d1cdd6299 100644 +--- a/arch/x86/kernel/process.c ++++ 
b/arch/x86/kernel/process.c +@@ -41,6 +41,8 @@ + #include <asm/prctl.h> + #include <asm/spec-ctrl.h> + ++#include "process.h" ++ + /* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned +@@ -255,11 +257,12 @@ void arch_setup_new_exec(void) + enable_cpuid(); + } + +-static inline void switch_to_bitmap(struct tss_struct *tss, +- struct thread_struct *prev, ++static inline void switch_to_bitmap(struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) + { ++ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); ++ + if (tifn & _TIF_IO_BITMAP) { + /* + * Copy the relevant range of the IO bitmap. +@@ -398,32 +401,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); + } + +-static __always_inline void intel_set_ssb_state(unsigned long tifn) ++/* ++ * Update the MSRs managing speculation control, during context switch. ++ * ++ * tifp: Previous task's thread flags ++ * tifn: Next task's thread flags ++ */ ++static __always_inline void __speculation_ctrl_update(unsigned long tifp, ++ unsigned long tifn) + { +- u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); ++ unsigned long tif_diff = tifp ^ tifn; ++ u64 msr = x86_spec_ctrl_base; ++ bool updmsr = false; ++ ++ /* ++ * If TIF_SSBD is different, select the proper mitigation ++ * method. Note that if SSBD mitigation is disabled or permanentely ++ * enabled this branch can't be taken because nothing can set ++ * TIF_SSBD. ++ */ ++ if (tif_diff & _TIF_SSBD) { ++ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { ++ amd_set_ssb_virt_state(tifn); ++ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { ++ amd_set_core_ssb_state(tifn); ++ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || ++ static_cpu_has(X86_FEATURE_AMD_SSBD)) { ++ msr |= ssbd_tif_to_spec_ctrl(tifn); ++ updmsr = true; ++ } ++ } ++ ++ /* ++ * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, ++ * otherwise avoid the MSR write. ++ */ ++ if (IS_ENABLED(CONFIG_SMP) && ++ static_branch_unlikely(&switch_to_cond_stibp)) { ++ updmsr |= !!(tif_diff & _TIF_SPEC_IB); ++ msr |= stibp_tif_to_spec_ctrl(tifn); ++ } + +- wrmsrl(MSR_IA32_SPEC_CTRL, msr); ++ if (updmsr) ++ wrmsrl(MSR_IA32_SPEC_CTRL, msr); + } + +-static __always_inline void __speculative_store_bypass_update(unsigned long tifn) ++static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) + { +- if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) +- amd_set_ssb_virt_state(tifn); +- else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) +- amd_set_core_ssb_state(tifn); +- else +- intel_set_ssb_state(tifn); ++ if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { ++ if (task_spec_ssb_disable(tsk)) ++ set_tsk_thread_flag(tsk, TIF_SSBD); ++ else ++ clear_tsk_thread_flag(tsk, TIF_SSBD); ++ ++ if (task_spec_ib_disable(tsk)) ++ set_tsk_thread_flag(tsk, TIF_SPEC_IB); ++ else ++ clear_tsk_thread_flag(tsk, TIF_SPEC_IB); ++ } ++ /* Return the updated threadinfo flags*/ ++ return task_thread_info(tsk)->flags; + } + +-void speculative_store_bypass_update(unsigned long tif) ++void speculation_ctrl_update(unsigned long tif) + { ++ /* Forced update. 
Make sure all relevant TIF flags are different */ + preempt_disable(); +- __speculative_store_bypass_update(tif); ++ __speculation_ctrl_update(~tif, tif); + preempt_enable(); + } + +-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, +- struct tss_struct *tss) ++/* Called from seccomp/prctl update */ ++void speculation_ctrl_update_current(void) ++{ ++ preempt_disable(); ++ speculation_ctrl_update(speculation_ctrl_update_tif(current)); ++ preempt_enable(); ++} ++ ++void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) + { + struct thread_struct *prev, *next; + unsigned long tifp, tifn; +@@ -433,7 +489,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); +- switch_to_bitmap(tss, prev, next, tifp, tifn); ++ switch_to_bitmap(prev, next, tifp, tifn); + + propagate_user_return_notify(prev_p, next_p); + +@@ -454,8 +510,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); + +- if ((tifp ^ tifn) & _TIF_SSBD) +- __speculative_store_bypass_update(tifn); ++ if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { ++ __speculation_ctrl_update(tifp, tifn); ++ } else { ++ speculation_ctrl_update_tif(prev_p); ++ tifn = speculation_ctrl_update_tif(next_p); ++ ++ /* Enforce MSR update to ensure consistent state */ ++ __speculation_ctrl_update(~tifn, tifn); ++ } + } + + /* +diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h +new file mode 100644 +index 000000000000..898e97cf6629 +--- /dev/null ++++ b/arch/x86/kernel/process.h +@@ -0,0 +1,39 @@ ++// SPDX-License-Identifier: GPL-2.0 ++// ++// Code shared between 32 and 64 bit ++ ++#include <asm/spec-ctrl.h> ++ ++void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); ++ ++/* ++ * This needs to be inline to optimize for the common case where no extra ++ * work needs to be done. ++ */ ++static inline void switch_to_extra(struct task_struct *prev, ++ struct task_struct *next) ++{ ++ unsigned long next_tif = task_thread_info(next)->flags; ++ unsigned long prev_tif = task_thread_info(prev)->flags; ++ ++ if (IS_ENABLED(CONFIG_SMP)) { ++ /* ++ * Avoid __switch_to_xtra() invocation when conditional ++ * STIPB is disabled and the only different bit is ++ * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not ++ * in the TIF_WORK_CTXSW masks. ++ */ ++ if (!static_branch_likely(&switch_to_cond_stibp)) { ++ prev_tif &= ~_TIF_SPEC_IB; ++ next_tif &= ~_TIF_SPEC_IB; ++ } ++ } ++ ++ /* ++ * __switch_to_xtra() handles debug registers, i/o bitmaps, ++ * speculation mitigations etc. 
++ */ ++ if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT || ++ prev_tif & _TIF_WORK_CTXSW_PREV)) ++ __switch_to_xtra(prev, next); ++} +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 5224c6099184..c2df91eab573 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -59,6 +59,8 @@ + #include <asm/intel_rdt_sched.h> + #include <asm/proto.h> + ++#include "process.h" ++ + void __show_regs(struct pt_regs *regs, int all) + { + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; +@@ -234,7 +236,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; + int cpu = smp_processor_id(); +- struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); + + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + +@@ -266,12 +267,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) + set_iopl_mask(next->iopl); + +- /* +- * Now maybe handle debug registers and/or IO bitmaps +- */ +- if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || +- task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) +- __switch_to_xtra(prev_p, next_p, tss); ++ switch_to_extra(prev_p, next_p); + + /* + * Leave lazy mode, flushing any hypercalls made here. +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index cbeecfcc66d6..ec63d6be5e02 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -59,6 +59,8 @@ + #include <asm/unistd_32_ia32.h> + #endif + ++#include "process.h" ++ + __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); + + /* Prints also some state that isn't saved in the pt_regs */ +@@ -400,7 +402,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; + int cpu = smp_processor_id(); +- struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); +@@ -467,12 +468,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + /* Reload sp0. */ + update_sp0(next_p); + +- /* +- * Now maybe reload the debug registers and handle I/O bitmaps +- */ +- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || +- task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) +- __switch_to_xtra(prev_p, next_p, tss); ++ __switch_to_xtra(prev_p, next_p); + + #ifdef CONFIG_XEN_PV + /* +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index d1f5c744142b..bbcd69c76d96 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -367,7 +367,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + + /* cpuid 0x80000008.ebx */ + const u32 kvm_cpuid_8000_0008_ebx_x86_features = +- F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD); ++ F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | ++ F(AMD_SSB_NO); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_cpuid_C000_0001_edx_x86_features = +@@ -649,7 +650,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + entry->ebx |= F(VIRT_SSBD); + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); +- if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) ++ /* ++ * The preference is to use SPEC CTRL MSR instead of the ++ * VIRT_SPEC MSR. 
++ */ ++ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && ++ !boot_cpu_has(X86_FEATURE_AMD_SSBD)) + entry->ebx |= F(VIRT_SSBD); + break; + } +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index d755e0d44ac1..364d9895dd56 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -4734,9 +4734,9 @@ static bool need_remote_flush(u64 old, u64 new) + } + + static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, +- const u8 *new, int *bytes) ++ int *bytes) + { +- u64 gentry; ++ u64 gentry = 0; + int r; + + /* +@@ -4748,22 +4748,12 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + *gpa &= ~(gpa_t)7; + *bytes = 8; +- r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); +- if (r) +- gentry = 0; +- new = (const u8 *)&gentry; + } + +- switch (*bytes) { +- case 4: +- gentry = *(const u32 *)new; +- break; +- case 8: +- gentry = *(const u64 *)new; +- break; +- default: +- gentry = 0; +- break; ++ if (*bytes == 4 || *bytes == 8) { ++ r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); ++ if (r) ++ gentry = 0; + } + + return gentry; +@@ -4876,8 +4866,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + + pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); + +- gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); +- + /* + * No need to care whether allocation memory is successful + * or not since pte prefetch is skiped if it does not have +@@ -4886,6 +4874,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + mmu_topup_memory_caches(vcpu); + + spin_lock(&vcpu->kvm->mmu_lock); ++ ++ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); ++ + ++vcpu->kvm->stat.mmu_pte_write; + kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index f6bebcec60b4..17f08db34547 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -1733,21 +1733,31 @@ out: + return ERR_PTR(err); + } + ++static void svm_clear_current_vmcb(struct vmcb *vmcb) ++{ ++ int i; ++ ++ for_each_online_cpu(i) ++ cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); ++} ++ + static void svm_free_vcpu(struct kvm_vcpu *vcpu) + { + struct vcpu_svm *svm = to_svm(vcpu); + ++ /* ++ * The vmcb page can be recycled, causing a false negative in ++ * svm_vcpu_load(). So, ensure that no logical CPU has this ++ * vmcb page recorded as its current vmcb. ++ */ ++ svm_clear_current_vmcb(svm->vmcb); ++ + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); + __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm->nested.hsave)); + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); +- /* +- * The vmcb page can be recycled, causing a false negative in +- * svm_vcpu_load(). So do a full IBPB now. 
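The svm_clear_current_vmcb() hunk above replaces a blanket IBPB with a cheaper pointer-invalidation scheme: before the vmcb page is freed, every per-CPU "current vmcb" slot that still points at it is cleared with a compare-and-swap, so a recycled page can never be mistaken for the one a CPU last loaded. A generic user-space sketch of that pattern (C11 atomics; the names and CPU count are made up):

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

static _Atomic(void *) current_obj[NR_CPUS];

static void clear_cached(void *obj)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        void *expected = obj;

        /* Only clears the slot if it still holds @obj. */
        atomic_compare_exchange_strong(&current_obj[cpu], &expected, NULL);
    }
}

int main(void)
{
    int a, b;

    atomic_store(&current_obj[0], &a);
    atomic_store(&current_obj[1], &b);
    clear_cached(&a);   /* slot 0 is cleared, slot 1 is left alone */
    printf("%p %p\n", atomic_load(&current_obj[0]), atomic_load(&current_obj[1]));
    return 0;
}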
+- */ +- indirect_branch_prediction_barrier(); + } + + static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +@@ -3644,7 +3654,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && +- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) ++ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && ++ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + return 1; + + msr_info->data = svm->spec_ctrl; +@@ -3749,11 +3760,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) + break; + case MSR_IA32_SPEC_CTRL: + if (!msr->host_initiated && +- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) ++ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && ++ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + return 1; + + /* The STIBP bit doesn't fault even if it's not advertised */ +- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + return 1; + + svm->spec_ctrl = data; +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 8d688b213504..f24329659bea 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -6378,6 +6378,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; ++ memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad)); + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, +@@ -6884,7 +6885,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) + else { + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); +- kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); ++ if (ioapic_in_kernel(vcpu->kvm)) ++ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); + } + bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, + vcpu_to_synic(vcpu)->vec_bitmap, 256); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 83a3f4c935fc..5400a24e1a8c 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -29,6 +29,12 @@ + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi + */ + ++/* ++ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is ++ * stored in cpu_tlb_state.last_user_mm_ibpb. ++ */ ++#define LAST_USER_MM_IBPB 0x1UL ++ + /* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts. We do the +@@ -180,6 +186,89 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) + } + } + ++static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) ++{ ++ unsigned long next_tif = task_thread_info(next)->flags; ++ unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; ++ ++ return (unsigned long)next->mm | ibpb; ++} ++ ++static void cond_ibpb(struct task_struct *next) ++{ ++ if (!next || !next->mm) ++ return; ++ ++ /* ++ * Both, the conditional and the always IBPB mode use the mm ++ * pointer to avoid the IBPB when switching between tasks of the ++ * same process. Using the mm pointer instead of mm->context.ctx_id ++ * opens a hypothetical hole vs. mm_struct reuse, which is more or ++ * less impossible to control by an attacker. Aside of that it ++ * would only affect the first schedule so the theoretically ++ * exposed data is not really interesting. 
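mm_mangle_tif_spec_ib() above folds the task's TIF_SPEC_IB state into bit 0 of the mm pointer, which is free because mm_struct is at least word aligned; the combined value is what gets remembered in cpu_tlbstate.last_user_mm_ibpb. A minimal sketch of the encoding (bit numbers are assumed, and a plain static int stands in for a struct mm_struct):

#include <stdint.h>
#include <stdio.h>

#define TIF_SPEC_IB        9
#define LAST_USER_MM_IBPB  0x1UL

static uintptr_t mangle(void *mm, unsigned long tif_flags)
{
    unsigned long ibpb = (tif_flags >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;

    /* Bit 0 of an aligned pointer is unused, so it can carry the flag. */
    return (uintptr_t)mm | ibpb;
}

int main(void)
{
    static int dummy_mm;   /* stands in for a struct mm_struct */

    printf("%#lx\n", (unsigned long)mangle(&dummy_mm, 0));
    printf("%#lx\n", (unsigned long)mangle(&dummy_mm, 1UL << TIF_SPEC_IB));
    return 0;
}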
++ */ ++ if (static_branch_likely(&switch_mm_cond_ibpb)) { ++ unsigned long prev_mm, next_mm; ++ ++ /* ++ * This is a bit more complex than the always mode because ++ * it has to handle two cases: ++ * ++ * 1) Switch from a user space task (potential attacker) ++ * which has TIF_SPEC_IB set to a user space task ++ * (potential victim) which has TIF_SPEC_IB not set. ++ * ++ * 2) Switch from a user space task (potential attacker) ++ * which has TIF_SPEC_IB not set to a user space task ++ * (potential victim) which has TIF_SPEC_IB set. ++ * ++ * This could be done by unconditionally issuing IBPB when ++ * a task which has TIF_SPEC_IB set is either scheduled in ++ * or out. Though that results in two flushes when: ++ * ++ * - the same user space task is scheduled out and later ++ * scheduled in again and only a kernel thread ran in ++ * between. ++ * ++ * - a user space task belonging to the same process is ++ * scheduled in after a kernel thread ran in between ++ * ++ * - a user space task belonging to the same process is ++ * scheduled in immediately. ++ * ++ * Optimize this with reasonably small overhead for the ++ * above cases. Mangle the TIF_SPEC_IB bit into the mm ++ * pointer of the incoming task which is stored in ++ * cpu_tlbstate.last_user_mm_ibpb for comparison. ++ */ ++ next_mm = mm_mangle_tif_spec_ib(next); ++ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); ++ ++ /* ++ * Issue IBPB only if the mm's are different and one or ++ * both have the IBPB bit set. ++ */ ++ if (next_mm != prev_mm && ++ (next_mm | prev_mm) & LAST_USER_MM_IBPB) ++ indirect_branch_prediction_barrier(); ++ ++ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); ++ } ++ ++ if (static_branch_unlikely(&switch_mm_always_ibpb)) { ++ /* ++ * Only flush when switching to a user space task with a ++ * different context than the user space task which ran ++ * last on this CPU. ++ */ ++ if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { ++ indirect_branch_prediction_barrier(); ++ this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); ++ } ++ } ++} ++ + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) + { +@@ -248,27 +337,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + } else { + u16 new_asid; + bool need_flush; +- u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); + + /* + * Avoid user/user BTB poisoning by flushing the branch + * predictor when switching between processes. This stops + * one process from doing Spectre-v2 attacks on another. +- * +- * As an optimization, flush indirect branches only when +- * switching into processes that disable dumping. This +- * protects high value processes like gpg, without having +- * too high performance overhead. IBPB is *expensive*! +- * +- * This will not flush branches when switching into kernel +- * threads. It will also not flush if we switch to idle +- * thread and back to the same process. It will flush if we +- * switch to a different non-dumpable process. 
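The resulting decision in cond_ibpb() boils down to a single expression: issue the barrier only when the mangled values differ and at least one of them carries the IBPB bit. A tiny truth-table style sketch (the addresses are invented):

#include <stdbool.h>
#include <stdio.h>

#define LAST_USER_MM_IBPB 0x1UL

static bool needs_ibpb(unsigned long prev_mm, unsigned long next_mm)
{
    return next_mm != prev_mm && ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
}

int main(void)
{
    printf("%d\n", needs_ibpb(0x1000, 0x2001)); /* different mm, one opted in: 1 */
    printf("%d\n", needs_ibpb(0x2001, 0x2001)); /* same mm and state: 0 */
    printf("%d\n", needs_ibpb(0x1000, 0x2000)); /* different mm, nobody opted in: 0 */
    return 0;
}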
+ */ +- if (tsk && tsk->mm && +- tsk->mm->context.ctx_id != last_ctx_id && +- get_dumpable(tsk->mm) != SUID_DUMP_USER) +- indirect_branch_prediction_barrier(); ++ cond_ibpb(tsk); + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* +@@ -318,14 +393,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + } + +- /* +- * Record last user mm's context id, so we can avoid +- * flushing branch buffer with IBPB if we switch back +- * to the same user. +- */ +- if (next != &init_mm) +- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); +- + /* Make sure we write CR3 before loaded_mm. */ + barrier(); + +@@ -406,7 +473,7 @@ void initialize_tlbstate_and_flush(void) + write_cr3(build_cr3(mm->pgd, 0)); + + /* Reinitialize tlbstate. */ +- this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id); ++ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); +diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c +index bcb5beb81177..7df02fc934a9 100644 +--- a/arch/xtensa/kernel/asm-offsets.c ++++ b/arch/xtensa/kernel/asm-offsets.c +@@ -91,14 +91,14 @@ int main(void) + DEFINE(THREAD_SP, offsetof (struct task_struct, thread.sp)); + DEFINE(THREAD_CPENABLE, offsetof (struct thread_info, cpenable)); + #if XTENSA_HAVE_COPROCESSORS +- DEFINE(THREAD_XTREGS_CP0, offsetof (struct thread_info, xtregs_cp)); +- DEFINE(THREAD_XTREGS_CP1, offsetof (struct thread_info, xtregs_cp)); +- DEFINE(THREAD_XTREGS_CP2, offsetof (struct thread_info, xtregs_cp)); +- DEFINE(THREAD_XTREGS_CP3, offsetof (struct thread_info, xtregs_cp)); +- DEFINE(THREAD_XTREGS_CP4, offsetof (struct thread_info, xtregs_cp)); +- DEFINE(THREAD_XTREGS_CP5, offsetof (struct thread_info, xtregs_cp)); +- DEFINE(THREAD_XTREGS_CP6, offsetof (struct thread_info, xtregs_cp)); +- DEFINE(THREAD_XTREGS_CP7, offsetof (struct thread_info, xtregs_cp)); ++ DEFINE(THREAD_XTREGS_CP0, offsetof(struct thread_info, xtregs_cp.cp0)); ++ DEFINE(THREAD_XTREGS_CP1, offsetof(struct thread_info, xtregs_cp.cp1)); ++ DEFINE(THREAD_XTREGS_CP2, offsetof(struct thread_info, xtregs_cp.cp2)); ++ DEFINE(THREAD_XTREGS_CP3, offsetof(struct thread_info, xtregs_cp.cp3)); ++ DEFINE(THREAD_XTREGS_CP4, offsetof(struct thread_info, xtregs_cp.cp4)); ++ DEFINE(THREAD_XTREGS_CP5, offsetof(struct thread_info, xtregs_cp.cp5)); ++ DEFINE(THREAD_XTREGS_CP6, offsetof(struct thread_info, xtregs_cp.cp6)); ++ DEFINE(THREAD_XTREGS_CP7, offsetof(struct thread_info, xtregs_cp.cp7)); + #endif + DEFINE(THREAD_XTREGS_USER, offsetof (struct thread_info, xtregs_user)); + DEFINE(XTREGS_USER_SIZE, sizeof(xtregs_user_t)); +diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c +index ff4f0ecb03dd..f1c46bc5d465 100644 +--- a/arch/xtensa/kernel/process.c ++++ b/arch/xtensa/kernel/process.c +@@ -88,18 +88,21 @@ void coprocessor_release_all(struct thread_info *ti) + + void coprocessor_flush_all(struct thread_info *ti) + { +- unsigned long cpenable; ++ unsigned long cpenable, old_cpenable; + int i; + + preempt_disable(); + ++ RSR_CPENABLE(old_cpenable); + cpenable = ti->cpenable; ++ WSR_CPENABLE(cpenable); + + for (i = 0; i < XCHAL_CP_MAX; i++) { + if ((cpenable & 1) != 0 && coprocessor_owner[i] == ti) + coprocessor_flush(ti, i); + cpenable >>= 1; + } ++ WSR_CPENABLE(old_cpenable); + + preempt_enable(); + } +diff --git 
a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c +index e2461968efb2..7c3ed7d78075 100644 +--- a/arch/xtensa/kernel/ptrace.c ++++ b/arch/xtensa/kernel/ptrace.c +@@ -127,12 +127,37 @@ static int ptrace_setregs(struct task_struct *child, void __user *uregs) + } + + ++#if XTENSA_HAVE_COPROCESSORS ++#define CP_OFFSETS(cp) \ ++ { \ ++ .elf_xtregs_offset = offsetof(elf_xtregs_t, cp), \ ++ .ti_offset = offsetof(struct thread_info, xtregs_cp.cp), \ ++ .sz = sizeof(xtregs_ ## cp ## _t), \ ++ } ++ ++static const struct { ++ size_t elf_xtregs_offset; ++ size_t ti_offset; ++ size_t sz; ++} cp_offsets[] = { ++ CP_OFFSETS(cp0), ++ CP_OFFSETS(cp1), ++ CP_OFFSETS(cp2), ++ CP_OFFSETS(cp3), ++ CP_OFFSETS(cp4), ++ CP_OFFSETS(cp5), ++ CP_OFFSETS(cp6), ++ CP_OFFSETS(cp7), ++}; ++#endif ++ + static int ptrace_getxregs(struct task_struct *child, void __user *uregs) + { + struct pt_regs *regs = task_pt_regs(child); + struct thread_info *ti = task_thread_info(child); + elf_xtregs_t __user *xtregs = uregs; + int ret = 0; ++ int i __maybe_unused; + + if (!access_ok(VERIFY_WRITE, uregs, sizeof(elf_xtregs_t))) + return -EIO; +@@ -140,8 +165,13 @@ static int ptrace_getxregs(struct task_struct *child, void __user *uregs) + #if XTENSA_HAVE_COPROCESSORS + /* Flush all coprocessor registers to memory. */ + coprocessor_flush_all(ti); +- ret |= __copy_to_user(&xtregs->cp0, &ti->xtregs_cp, +- sizeof(xtregs_coprocessor_t)); ++ ++ for (i = 0; i < ARRAY_SIZE(cp_offsets); ++i) ++ ret |= __copy_to_user((char __user *)xtregs + ++ cp_offsets[i].elf_xtregs_offset, ++ (const char *)ti + ++ cp_offsets[i].ti_offset, ++ cp_offsets[i].sz); + #endif + ret |= __copy_to_user(&xtregs->opt, ®s->xtregs_opt, + sizeof(xtregs->opt)); +@@ -157,6 +187,7 @@ static int ptrace_setxregs(struct task_struct *child, void __user *uregs) + struct pt_regs *regs = task_pt_regs(child); + elf_xtregs_t *xtregs = uregs; + int ret = 0; ++ int i __maybe_unused; + + if (!access_ok(VERIFY_READ, uregs, sizeof(elf_xtregs_t))) + return -EFAULT; +@@ -166,8 +197,11 @@ static int ptrace_setxregs(struct task_struct *child, void __user *uregs) + coprocessor_flush_all(ti); + coprocessor_release_all(ti); + +- ret |= __copy_from_user(&ti->xtregs_cp, &xtregs->cp0, +- sizeof(xtregs_coprocessor_t)); ++ for (i = 0; i < ARRAY_SIZE(cp_offsets); ++i) ++ ret |= __copy_from_user((char *)ti + cp_offsets[i].ti_offset, ++ (const char __user *)xtregs + ++ cp_offsets[i].elf_xtregs_offset, ++ cp_offsets[i].sz); + #endif + ret |= __copy_from_user(®s->xtregs_opt, &xtregs->opt, + sizeof(xtregs->opt)); +diff --git a/drivers/android/binder.c b/drivers/android/binder.c +index a86c27948fca..96a0f940e54d 100644 +--- a/drivers/android/binder.c ++++ b/drivers/android/binder.c +@@ -2918,7 +2918,6 @@ static void binder_transaction(struct binder_proc *proc, + t->buffer = NULL; + goto err_binder_alloc_buf_failed; + } +- t->buffer->allow_user_free = 0; + t->buffer->debug_id = t->debug_id; + t->buffer->transaction = t; + t->buffer->target_node = target_node; +@@ -3407,14 +3406,18 @@ static int binder_thread_write(struct binder_proc *proc, + + buffer = binder_alloc_prepare_to_free(&proc->alloc, + data_ptr); +- if (buffer == NULL) { +- binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n", +- proc->pid, thread->pid, (u64)data_ptr); +- break; +- } +- if (!buffer->allow_user_free) { +- binder_user_error("%d:%d BC_FREE_BUFFER u%016llx matched unreturned buffer\n", +- proc->pid, thread->pid, (u64)data_ptr); ++ if (IS_ERR_OR_NULL(buffer)) { ++ if (PTR_ERR(buffer) == -EPERM) { ++ binder_user_error( 
++ "%d:%d BC_FREE_BUFFER u%016llx matched unreturned or currently freeing buffer\n", ++ proc->pid, thread->pid, ++ (u64)data_ptr); ++ } else { ++ binder_user_error( ++ "%d:%d BC_FREE_BUFFER u%016llx no match\n", ++ proc->pid, thread->pid, ++ (u64)data_ptr); ++ } + break; + } + binder_debug(BINDER_DEBUG_FREE_BUFFER, +diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c +index 58e4658f9dd6..b9281f2725a6 100644 +--- a/drivers/android/binder_alloc.c ++++ b/drivers/android/binder_alloc.c +@@ -149,14 +149,12 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( + else { + /* + * Guard against user threads attempting to +- * free the buffer twice ++ * free the buffer when in use by kernel or ++ * after it's already been freed. + */ +- if (buffer->free_in_progress) { +- pr_err("%d:%d FREE_BUFFER u%016llx user freed buffer twice\n", +- alloc->pid, current->pid, (u64)user_ptr); +- return NULL; +- } +- buffer->free_in_progress = 1; ++ if (!buffer->allow_user_free) ++ return ERR_PTR(-EPERM); ++ buffer->allow_user_free = 0; + return buffer; + } + } +@@ -486,7 +484,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, + + rb_erase(best_fit, &alloc->free_buffers); + buffer->free = 0; +- buffer->free_in_progress = 0; ++ buffer->allow_user_free = 0; + binder_insert_allocated_buffer_locked(alloc, buffer); + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got %pK\n", +diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h +index 2dd33b6df104..a3ad7683b6f2 100644 +--- a/drivers/android/binder_alloc.h ++++ b/drivers/android/binder_alloc.h +@@ -50,8 +50,7 @@ struct binder_buffer { + unsigned free:1; + unsigned allow_user_free:1; + unsigned async_transaction:1; +- unsigned free_in_progress:1; +- unsigned debug_id:28; ++ unsigned debug_id:29; + + struct binder_transaction *transaction; + +diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c +index a861b5b4d443..21ed0e20c5d9 100644 +--- a/drivers/dma/at_hdmac.c ++++ b/drivers/dma/at_hdmac.c +@@ -1641,6 +1641,12 @@ static void atc_free_chan_resources(struct dma_chan *chan) + atchan->descs_allocated = 0; + atchan->status = 0; + ++ /* ++ * Free atslave allocated in at_dma_xlate() ++ */ ++ kfree(chan->private); ++ chan->private = NULL; ++ + dev_vdbg(chan2dev(chan), "free_chan_resources: done\n"); + } + +@@ -1675,7 +1681,7 @@ static struct dma_chan *at_dma_xlate(struct of_phandle_args *dma_spec, + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); + +- atslave = devm_kzalloc(&dmac_pdev->dev, sizeof(*atslave), GFP_KERNEL); ++ atslave = kzalloc(sizeof(*atslave), GFP_KERNEL); + if (!atslave) + return NULL; + +@@ -2000,6 +2006,8 @@ static int at_dma_remove(struct platform_device *pdev) + struct resource *io; + + at_dma_off(atdma); ++ if (pdev->dev.of_node) ++ of_dma_controller_free(pdev->dev.of_node); + dma_async_device_unregister(&atdma->dma_common); + + dma_pool_destroy(atdma->memset_pool); +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index d96b09fea835..e05de5032f0c 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -454,6 +454,14 @@ int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, + } + wait_for_completion(&msginfo->waitevent); + ++ if (msginfo->response.gpadl_created.creation_status != 0) { ++ pr_err("Failed to establish GPADL: err = 0x%x\n", ++ msginfo->response.gpadl_created.creation_status); ++ ++ ret = -EDQUOT; ++ goto cleanup; ++ } ++ + if (channel->rescind) { + ret = -ENODEV; + goto 
cleanup; +diff --git a/drivers/iio/magnetometer/st_magn_buffer.c b/drivers/iio/magnetometer/st_magn_buffer.c +index 0a9e8fadfa9d..37ab30566464 100644 +--- a/drivers/iio/magnetometer/st_magn_buffer.c ++++ b/drivers/iio/magnetometer/st_magn_buffer.c +@@ -30,11 +30,6 @@ int st_magn_trig_set_state(struct iio_trigger *trig, bool state) + return st_sensors_set_dataready_irq(indio_dev, state); + } + +-static int st_magn_buffer_preenable(struct iio_dev *indio_dev) +-{ +- return st_sensors_set_enable(indio_dev, true); +-} +- + static int st_magn_buffer_postenable(struct iio_dev *indio_dev) + { + int err; +@@ -50,7 +45,7 @@ static int st_magn_buffer_postenable(struct iio_dev *indio_dev) + if (err < 0) + goto st_magn_buffer_postenable_error; + +- return err; ++ return st_sensors_set_enable(indio_dev, true); + + st_magn_buffer_postenable_error: + kfree(mdata->buffer_data); +@@ -63,11 +58,11 @@ static int st_magn_buffer_predisable(struct iio_dev *indio_dev) + int err; + struct st_sensor_data *mdata = iio_priv(indio_dev); + +- err = iio_triggered_buffer_predisable(indio_dev); ++ err = st_sensors_set_enable(indio_dev, false); + if (err < 0) + goto st_magn_buffer_predisable_error; + +- err = st_sensors_set_enable(indio_dev, false); ++ err = iio_triggered_buffer_predisable(indio_dev); + + st_magn_buffer_predisable_error: + kfree(mdata->buffer_data); +@@ -75,7 +70,6 @@ st_magn_buffer_predisable_error: + } + + static const struct iio_buffer_setup_ops st_magn_buffer_setup_ops = { +- .preenable = &st_magn_buffer_preenable, + .postenable = &st_magn_buffer_postenable, + .predisable = &st_magn_buffer_predisable, + }; +diff --git a/drivers/media/usb/em28xx/em28xx-dvb.c b/drivers/media/usb/em28xx/em28xx-dvb.c +index 4a7db623fe29..29cdaaf1ed90 100644 +--- a/drivers/media/usb/em28xx/em28xx-dvb.c ++++ b/drivers/media/usb/em28xx/em28xx-dvb.c +@@ -2105,6 +2105,8 @@ static int em28xx_dvb_fini(struct em28xx *dev) + } + } + ++ em28xx_unregister_dvb(dvb); ++ + /* remove I2C SEC */ + client = dvb->i2c_client_sec; + if (client) { +@@ -2126,7 +2128,6 @@ static int em28xx_dvb_fini(struct em28xx *dev) + i2c_unregister_device(client); + } + +- em28xx_unregister_dvb(dvb); + kfree(dvb); + dev->dvb = NULL; + kref_put(&dev->ref, em28xx_free_device); +diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c +index 329727e00e97..95745dc4e0ec 100644 +--- a/drivers/misc/mic/scif/scif_rma.c ++++ b/drivers/misc/mic/scif/scif_rma.c +@@ -417,7 +417,7 @@ static int scif_create_remote_lookup(struct scif_dev *remote_dev, + if (err) + goto error_window; + err = scif_map_page(&window->num_pages_lookup.lookup[j], +- vmalloc_dma_phys ? ++ vmalloc_num_pages ? + vmalloc_to_page(&window->num_pages[i]) : + virt_to_page(&window->num_pages[i]), + remote_dev); +diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c +index 94d7a865b135..7504f430c011 100644 +--- a/drivers/mtd/ubi/vtbl.c ++++ b/drivers/mtd/ubi/vtbl.c +@@ -578,6 +578,16 @@ static int init_volumes(struct ubi_device *ubi, + vol->ubi = ubi; + reserved_pebs += vol->reserved_pebs; + ++ /* ++ * We use ubi->peb_count and not vol->reserved_pebs because ++ * we want to keep the code simple. Otherwise we'd have to ++ * resize/check the bitmap upon volume resize too. ++ * Allocating a few bytes more does not hurt. ++ */ ++ err = ubi_fastmap_init_checkmap(vol, ubi->peb_count); ++ if (err) ++ return err; ++ + /* + * In case of dynamic volume UBI knows nothing about how many + * data is stored there. So assume the whole volume is used. 
+@@ -620,16 +630,6 @@ static int init_volumes(struct ubi_device *ubi, + (long long)(vol->used_ebs - 1) * vol->usable_leb_size; + vol->used_bytes += av->last_data_size; + vol->last_eb_bytes = av->last_data_size; +- +- /* +- * We use ubi->peb_count and not vol->reserved_pebs because +- * we want to keep the code simple. Otherwise we'd have to +- * resize/check the bitmap upon volume resize too. +- * Allocating a few bytes more does not hurt. +- */ +- err = ubi_fastmap_init_checkmap(vol, ubi->peb_count); +- if (err) +- return err; + } + + /* And add the layout volume */ +diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c +index 2237ef8e4344..f13256af8031 100644 +--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c ++++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c +@@ -1691,6 +1691,7 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) + bool if_up = netif_running(nic->netdev); + struct bpf_prog *old_prog; + bool bpf_attached = false; ++ int ret = 0; + + /* For now just support only the usual MTU sized frames */ + if (prog && (dev->mtu > 1500)) { +@@ -1724,8 +1725,12 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) + if (nic->xdp_prog) { + /* Attach BPF program */ + nic->xdp_prog = bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1); +- if (!IS_ERR(nic->xdp_prog)) ++ if (!IS_ERR(nic->xdp_prog)) { + bpf_attached = true; ++ } else { ++ ret = PTR_ERR(nic->xdp_prog); ++ nic->xdp_prog = NULL; ++ } + } + + /* Calculate Tx queues needed for XDP and network stack */ +@@ -1737,7 +1742,7 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) + netif_trans_update(nic->netdev); + } + +- return 0; ++ return ret; + } + + static int nicvf_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +index a3d12dbde95b..09494e1c77c5 100644 +--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c ++++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +@@ -585,10 +585,12 @@ static void nicvf_free_snd_queue(struct nicvf *nic, struct snd_queue *sq) + if (!sq->dmem.base) + return; + +- if (sq->tso_hdrs) ++ if (sq->tso_hdrs) { + dma_free_coherent(&nic->pdev->dev, + sq->dmem.q_len * TSO_HEADER_SIZE, + sq->tso_hdrs, sq->tso_hdrs_phys); ++ sq->tso_hdrs = NULL; ++ } + + /* Free pending skbs in the queue */ + smp_rmb(); +diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c +index e9f101c9bae2..bfbb39f93554 100644 +--- a/drivers/net/rionet.c ++++ b/drivers/net/rionet.c +@@ -216,9 +216,9 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) + * it just report sending a packet to the target + * (without actual packet transfer). 
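The nicvf_free_snd_queue() hunk above is the classic free-and-NULL idiom: clearing sq->tso_hdrs right after freeing it turns a repeated teardown pass into a no-op instead of a double free. A trivial user-space illustration (plain malloc/free stand in for dma_free_coherent):

#include <stdlib.h>

struct queue {
    void *tso_hdrs;
};

static void free_queue_bufs(struct queue *q)
{
    if (q->tso_hdrs) {
        free(q->tso_hdrs);
        q->tso_hdrs = NULL;   /* makes the function safe to call again */
    }
}

int main(void)
{
    struct queue q = { .tso_hdrs = malloc(64) };

    free_queue_bufs(&q);
    free_queue_bufs(&q);      /* second call is now a harmless no-op */
    return 0;
}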
+ */ +- dev_kfree_skb_any(skb); + ndev->stats.tx_packets++; + ndev->stats.tx_bytes += skb->len; ++ dev_kfree_skb_any(skb); + } + } + +diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c +index d49c7103085e..aabbcfb6e6da 100644 +--- a/drivers/net/usb/ipheth.c ++++ b/drivers/net/usb/ipheth.c +@@ -140,7 +140,6 @@ struct ipheth_device { + struct usb_device *udev; + struct usb_interface *intf; + struct net_device *net; +- struct sk_buff *tx_skb; + struct urb *tx_urb; + struct urb *rx_urb; + unsigned char *tx_buf; +@@ -229,6 +228,7 @@ static void ipheth_rcvbulk_callback(struct urb *urb) + case -ENOENT: + case -ECONNRESET: + case -ESHUTDOWN: ++ case -EPROTO: + return; + case 0: + break; +@@ -280,7 +280,6 @@ static void ipheth_sndbulk_callback(struct urb *urb) + dev_err(&dev->intf->dev, "%s: urb status: %d\n", + __func__, status); + +- dev_kfree_skb_irq(dev->tx_skb); + netif_wake_queue(dev->net); + } + +@@ -410,7 +409,7 @@ static int ipheth_tx(struct sk_buff *skb, struct net_device *net) + if (skb->len > IPHETH_BUF_SIZE) { + WARN(1, "%s: skb too large: %d bytes\n", __func__, skb->len); + dev->net->stats.tx_dropped++; +- dev_kfree_skb_irq(skb); ++ dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + +@@ -430,12 +429,11 @@ static int ipheth_tx(struct sk_buff *skb, struct net_device *net) + dev_err(&dev->intf->dev, "%s: usb_submit_urb: %d\n", + __func__, retval); + dev->net->stats.tx_errors++; +- dev_kfree_skb_irq(skb); ++ dev_kfree_skb_any(skb); + } else { +- dev->tx_skb = skb; +- + dev->net->stats.tx_packets++; + dev->net->stats.tx_bytes += skb->len; ++ dev_consume_skb_any(skb); + netif_stop_queue(net); + } + +diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c +index f528e9ac3413..0e8e3be50332 100644 +--- a/drivers/net/virtio_net.c ++++ b/drivers/net/virtio_net.c +@@ -61,7 +61,8 @@ static const unsigned long guest_offloads[] = { + VIRTIO_NET_F_GUEST_TSO4, + VIRTIO_NET_F_GUEST_TSO6, + VIRTIO_NET_F_GUEST_ECN, +- VIRTIO_NET_F_GUEST_UFO ++ VIRTIO_NET_F_GUEST_UFO, ++ VIRTIO_NET_F_GUEST_CSUM + }; + + struct virtnet_stats { +@@ -1939,9 +1940,6 @@ static int virtnet_clear_guest_offloads(struct virtnet_info *vi) + if (!vi->guest_offloads) + return 0; + +- if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM)) +- offloads = 1ULL << VIRTIO_NET_F_GUEST_CSUM; +- + return virtnet_set_guest_offloads(vi, offloads); + } + +@@ -1951,8 +1949,6 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi) + + if (!vi->guest_offloads) + return 0; +- if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM)) +- offloads |= 1ULL << VIRTIO_NET_F_GUEST_CSUM; + + return virtnet_set_guest_offloads(vi, offloads); + } +@@ -1970,8 +1966,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, + && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || +- virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO))) { +- NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first"); ++ virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) || ++ virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) { ++ NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first"); + return -EOPNOTSUPP; + } + +diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c +index ffdd2fa401b1..d63d7c326801 100644 +--- a/drivers/net/wireless/ath/wil6210/wmi.c ++++ 
b/drivers/net/wireless/ath/wil6210/wmi.c +@@ -1380,8 +1380,14 @@ int wmi_set_ie(struct wil6210_priv *wil, u8 type, u16 ie_len, const void *ie) + }; + int rc; + u16 len = sizeof(struct wmi_set_appie_cmd) + ie_len; +- struct wmi_set_appie_cmd *cmd = kzalloc(len, GFP_KERNEL); ++ struct wmi_set_appie_cmd *cmd; + ++ if (len < ie_len) { ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ cmd = kzalloc(len, GFP_KERNEL); + if (!cmd) { + rc = -ENOMEM; + goto out; +diff --git a/drivers/net/wireless/ti/wlcore/cmd.c b/drivers/net/wireless/ti/wlcore/cmd.c +index f48c3f62966d..761cf8573a80 100644 +--- a/drivers/net/wireless/ti/wlcore/cmd.c ++++ b/drivers/net/wireless/ti/wlcore/cmd.c +@@ -35,7 +35,6 @@ + #include "wl12xx_80211.h" + #include "cmd.h" + #include "event.h" +-#include "ps.h" + #include "tx.h" + #include "hw_ops.h" + +@@ -192,10 +191,6 @@ int wlcore_cmd_wait_for_event_or_timeout(struct wl1271 *wl, + + timeout_time = jiffies + msecs_to_jiffies(WL1271_EVENT_TIMEOUT); + +- ret = wl1271_ps_elp_wakeup(wl); +- if (ret < 0) +- return ret; +- + do { + if (time_after(jiffies, timeout_time)) { + wl1271_debug(DEBUG_CMD, "timeout waiting for event %d", +@@ -227,7 +222,6 @@ int wlcore_cmd_wait_for_event_or_timeout(struct wl1271 *wl, + } while (!event); + + out: +- wl1271_ps_elp_sleep(wl); + kfree(events_vector); + return ret; + } +diff --git a/drivers/pci/dwc/pci-layerscape.c b/drivers/pci/dwc/pci-layerscape.c +index 87fa486bee2c..1ede4b60aac3 100644 +--- a/drivers/pci/dwc/pci-layerscape.c ++++ b/drivers/pci/dwc/pci-layerscape.c +@@ -89,7 +89,7 @@ static void ls_pcie_disable_outbound_atus(struct ls_pcie *pcie) + int i; + + for (i = 0; i < PCIE_IATU_NUM; i++) +- dw_pcie_disable_atu(pcie->pci, DW_PCIE_REGION_OUTBOUND, i); ++ dw_pcie_disable_atu(pcie->pci, i, DW_PCIE_REGION_OUTBOUND); + } + + static int ls1021_pcie_link_up(struct dw_pcie *pci) +diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c +index 169dd7127f9e..69ef5f4060ed 100644 +--- a/drivers/s390/net/qeth_core_main.c ++++ b/drivers/s390/net/qeth_core_main.c +@@ -4545,8 +4545,8 @@ static int qeth_snmp_command_cb(struct qeth_card *card, + { + struct qeth_ipa_cmd *cmd; + struct qeth_arp_query_info *qinfo; +- struct qeth_snmp_cmd *snmp; + unsigned char *data; ++ void *snmp_data; + __u16 data_len; + + QETH_CARD_TEXT(card, 3, "snpcmdcb"); +@@ -4554,7 +4554,6 @@ static int qeth_snmp_command_cb(struct qeth_card *card, + cmd = (struct qeth_ipa_cmd *) sdata; + data = (unsigned char *)((char *)cmd - reply->offset); + qinfo = (struct qeth_arp_query_info *) reply->param; +- snmp = &cmd->data.setadapterparms.data.snmp; + + if (cmd->hdr.return_code) { + QETH_CARD_TEXT_(card, 4, "scer1%x", cmd->hdr.return_code); +@@ -4567,10 +4566,15 @@ static int qeth_snmp_command_cb(struct qeth_card *card, + return 0; + } + data_len = *((__u16 *)QETH_IPA_PDU_LEN_PDU1(data)); +- if (cmd->data.setadapterparms.hdr.seq_no == 1) +- data_len -= (__u16)((char *)&snmp->data - (char *)cmd); +- else +- data_len -= (__u16)((char *)&snmp->request - (char *)cmd); ++ if (cmd->data.setadapterparms.hdr.seq_no == 1) { ++ snmp_data = &cmd->data.setadapterparms.data.snmp; ++ data_len -= offsetof(struct qeth_ipa_cmd, ++ data.setadapterparms.data.snmp); ++ } else { ++ snmp_data = &cmd->data.setadapterparms.data.snmp.request; ++ data_len -= offsetof(struct qeth_ipa_cmd, ++ data.setadapterparms.data.snmp.request); ++ } + + /* check if there is enough room in userspace */ + if ((qinfo->udata_len - qinfo->udata_offset) < data_len) { +@@ -4583,16 +4587,9 @@ static int 
qeth_snmp_command_cb(struct qeth_card *card, + QETH_CARD_TEXT_(card, 4, "sseqn%i", + cmd->data.setadapterparms.hdr.seq_no); + /*copy entries to user buffer*/ +- if (cmd->data.setadapterparms.hdr.seq_no == 1) { +- memcpy(qinfo->udata + qinfo->udata_offset, +- (char *)snmp, +- data_len + offsetof(struct qeth_snmp_cmd, data)); +- qinfo->udata_offset += offsetof(struct qeth_snmp_cmd, data); +- } else { +- memcpy(qinfo->udata + qinfo->udata_offset, +- (char *)&snmp->request, data_len); +- } ++ memcpy(qinfo->udata + qinfo->udata_offset, snmp_data, data_len); + qinfo->udata_offset += data_len; ++ + /* check if all replies received ... */ + QETH_CARD_TEXT_(card, 4, "srtot%i", + cmd->data.setadapterparms.hdr.used_total); +diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +index bd4352fe2de3..83852f323c5e 100644 +--- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c ++++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +@@ -1293,7 +1293,7 @@ static int cfg80211_rtw_get_station(struct wiphy *wiphy, + + sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS); + sinfo->tx_packets = psta->sta_stats.tx_pkts; +- ++ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); + } + + /* for Ad-Hoc/AP mode */ +diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +index 314ffac50bb8..f05e9af4fe81 100644 +--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c ++++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +@@ -1461,6 +1461,7 @@ vchiq_compat_ioctl_await_completion(struct file *file, + struct vchiq_await_completion32 args32; + struct vchiq_completion_data32 completion32; + unsigned int *msgbufcount32; ++ unsigned int msgbufcount_native; + compat_uptr_t msgbuf32; + void *msgbuf; + void **msgbufptr; +@@ -1572,7 +1573,11 @@ vchiq_compat_ioctl_await_completion(struct file *file, + sizeof(completion32))) + return -EFAULT; + +- args32.msgbufcount--; ++ if (get_user(msgbufcount_native, &args->msgbufcount)) ++ return -EFAULT; ++ ++ if (!msgbufcount_native) ++ args32.msgbufcount--; + + msgbufcount32 = + &((struct vchiq_await_completion32 __user *)arg)->msgbufcount; +diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c +index 1e8f68960014..808437c5ec49 100644 +--- a/drivers/usb/core/quirks.c ++++ b/drivers/usb/core/quirks.c +@@ -64,6 +64,9 @@ static const struct usb_device_id usb_quirk_list[] = { + /* Microsoft LifeCam-VX700 v2.0 */ + { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, + ++ /* Cherry Stream G230 2.0 (G85-231) and 3.0 (G85-232) */ ++ { USB_DEVICE(0x046a, 0x0023), .driver_info = USB_QUIRK_RESET_RESUME }, ++ + /* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */ + { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT }, + { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT }, +diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c +index ac8d619ff887..b8704c0678f9 100644 +--- a/drivers/usb/dwc3/gadget.c ++++ b/drivers/usb/dwc3/gadget.c +@@ -1511,9 +1511,6 @@ int __dwc3_gadget_ep_set_halt(struct dwc3_ep *dep, int value, int protocol) + unsigned transfer_in_flight; + unsigned started; + +- if (dep->flags & DWC3_EP_STALL) +- return 0; +- + if (dep->number > 1) + trb = dwc3_ep_prev_trb(dep, dep->trb_enqueue); + else +@@ -1535,8 +1532,6 @@ int __dwc3_gadget_ep_set_halt(struct dwc3_ep *dep, int value, int protocol) + else + dep->flags |= DWC3_EP_STALL; + } else 
{ +- if (!(dep->flags & DWC3_EP_STALL)) +- return 0; + + ret = dwc3_send_clear_stall_ep_cmd(dep); + if (ret) +diff --git a/drivers/usb/storage/unusual_realtek.h b/drivers/usb/storage/unusual_realtek.h +index 8fe624ad302a..7ca779493671 100644 +--- a/drivers/usb/storage/unusual_realtek.h ++++ b/drivers/usb/storage/unusual_realtek.h +@@ -39,4 +39,14 @@ UNUSUAL_DEV(0x0bda, 0x0159, 0x0000, 0x9999, + "USB Card Reader", + USB_SC_DEVICE, USB_PR_DEVICE, init_realtek_cr, 0), + ++UNUSUAL_DEV(0x0bda, 0x0177, 0x0000, 0x9999, ++ "Realtek", ++ "USB Card Reader", ++ USB_SC_DEVICE, USB_PR_DEVICE, init_realtek_cr, 0), ++ ++UNUSUAL_DEV(0x0bda, 0x0184, 0x0000, 0x9999, ++ "Realtek", ++ "USB Card Reader", ++ USB_SC_DEVICE, USB_PR_DEVICE, init_realtek_cr, 0), ++ + #endif /* defined(CONFIG_USB_STORAGE_REALTEK) || ... */ +diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile +index f2cd9dedb037..195229df5ba0 100644 +--- a/fs/btrfs/Makefile ++++ b/fs/btrfs/Makefile +@@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ +- uuid-tree.o props.o hash.o free-space-tree.o ++ uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o + + btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o + btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 0e67cee73c53..e42673477c25 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -50,6 +50,7 @@ + #include "sysfs.h" + #include "qgroup.h" + #include "compression.h" ++#include "tree-checker.h" + + #ifdef CONFIG_X86 + #include <asm/cpufeature.h> +@@ -544,146 +545,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, + return ret; + } + +-#define CORRUPT(reason, eb, root, slot) \ +- btrfs_crit(root->fs_info, \ +- "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ +- btrfs_header_level(eb) == 0 ? "leaf" : "node", \ +- reason, btrfs_header_bytenr(eb), root->objectid, slot) +- +-static noinline int check_leaf(struct btrfs_root *root, +- struct extent_buffer *leaf) +-{ +- struct btrfs_fs_info *fs_info = root->fs_info; +- struct btrfs_key key; +- struct btrfs_key leaf_key; +- u32 nritems = btrfs_header_nritems(leaf); +- int slot; +- +- /* +- * Extent buffers from a relocation tree have a owner field that +- * corresponds to the subvolume tree they are based on. So just from an +- * extent buffer alone we can not find out what is the id of the +- * corresponding subvolume tree, so we can not figure out if the extent +- * buffer corresponds to the root of the relocation tree or not. So skip +- * this check for relocation trees. +- */ +- if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { +- struct btrfs_root *check_root; +- +- key.objectid = btrfs_header_owner(leaf); +- key.type = BTRFS_ROOT_ITEM_KEY; +- key.offset = (u64)-1; +- +- check_root = btrfs_get_fs_root(fs_info, &key, false); +- /* +- * The only reason we also check NULL here is that during +- * open_ctree() some roots has not yet been set up. 
+- */ +- if (!IS_ERR_OR_NULL(check_root)) { +- struct extent_buffer *eb; +- +- eb = btrfs_root_node(check_root); +- /* if leaf is the root, then it's fine */ +- if (leaf != eb) { +- CORRUPT("non-root leaf's nritems is 0", +- leaf, check_root, 0); +- free_extent_buffer(eb); +- return -EIO; +- } +- free_extent_buffer(eb); +- } +- return 0; +- } +- +- if (nritems == 0) +- return 0; +- +- /* Check the 0 item */ +- if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != +- BTRFS_LEAF_DATA_SIZE(fs_info)) { +- CORRUPT("invalid item offset size pair", leaf, root, 0); +- return -EIO; +- } +- +- /* +- * Check to make sure each items keys are in the correct order and their +- * offsets make sense. We only have to loop through nritems-1 because +- * we check the current slot against the next slot, which verifies the +- * next slot's offset+size makes sense and that the current's slot +- * offset is correct. +- */ +- for (slot = 0; slot < nritems - 1; slot++) { +- btrfs_item_key_to_cpu(leaf, &leaf_key, slot); +- btrfs_item_key_to_cpu(leaf, &key, slot + 1); +- +- /* Make sure the keys are in the right order */ +- if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { +- CORRUPT("bad key order", leaf, root, slot); +- return -EIO; +- } +- +- /* +- * Make sure the offset and ends are right, remember that the +- * item data starts at the end of the leaf and grows towards the +- * front. +- */ +- if (btrfs_item_offset_nr(leaf, slot) != +- btrfs_item_end_nr(leaf, slot + 1)) { +- CORRUPT("slot offset bad", leaf, root, slot); +- return -EIO; +- } +- +- /* +- * Check to make sure that we don't point outside of the leaf, +- * just in case all the items are consistent to each other, but +- * all point outside of the leaf. +- */ +- if (btrfs_item_end_nr(leaf, slot) > +- BTRFS_LEAF_DATA_SIZE(fs_info)) { +- CORRUPT("slot end outside of leaf", leaf, root, slot); +- return -EIO; +- } +- } +- +- return 0; +-} +- +-static int check_node(struct btrfs_root *root, struct extent_buffer *node) +-{ +- unsigned long nr = btrfs_header_nritems(node); +- struct btrfs_key key, next_key; +- int slot; +- u64 bytenr; +- int ret = 0; +- +- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { +- btrfs_crit(root->fs_info, +- "corrupt node: block %llu root %llu nritems %lu", +- node->start, root->objectid, nr); +- return -EIO; +- } +- +- for (slot = 0; slot < nr - 1; slot++) { +- bytenr = btrfs_node_blockptr(node, slot); +- btrfs_node_key_to_cpu(node, &key, slot); +- btrfs_node_key_to_cpu(node, &next_key, slot + 1); +- +- if (!bytenr) { +- CORRUPT("invalid item slot", node, root, slot); +- ret = -EIO; +- goto out; +- } +- +- if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { +- CORRUPT("bad key order", node, root, slot); +- ret = -EIO; +- goto out; +- } +- } +-out: +- return ret; +-} +- + static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) +@@ -749,12 +610,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + * that we don't try and read the other copies of this block, just + * return -EIO. 
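The leaf checks being removed here (and reintroduced in tree-checker.c below) enforce a simple layout invariant: item data is packed from the end of the leaf towards the front, so each item's offset must equal the end of the next item, and nothing may reach past the data area. A compact stand-alone model of that check (the structure and sizes are invented for the example):

#include <stdio.h>

#define LEAF_DATA_SIZE 4096u

struct item {
    unsigned int offset;
    unsigned int size;
};

static int check_items(const struct item *items, int nritems)
{
    for (int i = 0; i < nritems; i++) {
        if (items[i].offset + items[i].size > LEAF_DATA_SIZE)
            return -1;                          /* points outside the leaf */
        if (i + 1 < nritems &&
            items[i].offset != items[i + 1].offset + items[i + 1].size)
            return -1;                          /* hole or overlap */
    }
    return 0;
}

int main(void)
{
    struct item good[] = { { 4000, 96 }, { 3900, 100 } };
    struct item bad[]  = { { 4000, 96 }, { 3890, 100 } };

    printf("%d %d\n", check_items(good, 2), check_items(bad, 2));  /* 0 -1 */
    return 0;
}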
+ */ +- if (found_level == 0 && check_leaf(root, eb)) { ++ if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = -EIO; + } + +- if (found_level > 0 && check_node(root, eb)) ++ if (found_level > 0 && btrfs_check_node(root, eb)) + ret = -EIO; + + if (!ret) +@@ -4009,7 +3870,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) + buf->len, + fs_info->dirty_metadata_batch); + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +- if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { ++ /* ++ * Since btrfs_mark_buffer_dirty() can be called with item pointer set ++ * but item data not updated. ++ * So here we should only check item pointers, not item data. ++ */ ++ if (btrfs_header_level(buf) == 0 && ++ btrfs_check_leaf_relaxed(root, buf)) { + btrfs_print_leaf(buf); + ASSERT(0); + } +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 2cb3569ac548..83791d13c204 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -9828,6 +9828,8 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; ++ struct btrfs_block_group_item bg; ++ u64 flags; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); +@@ -9862,8 +9864,32 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; ++ } else if (em->start != found_key.objectid || ++ em->len != found_key.offset) { ++ btrfs_err(fs_info, ++ "block group %llu len %llu mismatch with chunk %llu len %llu", ++ found_key.objectid, found_key.offset, ++ em->start, em->len); ++ ret = -EUCLEAN; + } else { +- ret = 0; ++ read_extent_buffer(leaf, &bg, ++ btrfs_item_ptr_offset(leaf, slot), ++ sizeof(bg)); ++ flags = btrfs_block_group_flags(&bg) & ++ BTRFS_BLOCK_GROUP_TYPE_MASK; ++ ++ if (flags != (em->map_lookup->type & ++ BTRFS_BLOCK_GROUP_TYPE_MASK)) { ++ btrfs_err(fs_info, ++"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", ++ found_key.objectid, ++ found_key.offset, flags, ++ (BTRFS_BLOCK_GROUP_TYPE_MASK & ++ em->map_lookup->type)); ++ ret = -EUCLEAN; ++ } else { ++ ret = 0; ++ } + } + free_extent_map(em); + goto out; +@@ -10092,6 +10118,62 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, + return cache; + } + ++ ++/* ++ * Iterate all chunks and verify that each of them has the corresponding block ++ * group ++ */ ++static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) ++{ ++ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; ++ struct extent_map *em; ++ struct btrfs_block_group_cache *bg; ++ u64 start = 0; ++ int ret = 0; ++ ++ while (1) { ++ read_lock(&map_tree->map_tree.lock); ++ /* ++ * lookup_extent_mapping will return the first extent map ++ * intersecting the range, so setting @len to 1 is enough to ++ * get the first chunk. 
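check_chunk_block_group_mappings() walks every chunk by repeatedly asking for the first extent map at or after @start and then continuing past it; the reverse direction (block group to chunk) is covered by find_first_block_group() above. A stand-alone sketch of just the iteration pattern (a linear search over a sorted array stands in for the extent map tree, and the cross-check body is omitted):

#include <stdio.h>

struct extent { unsigned long long start, len; };

/* Containing-or-next lookup (assumes sorted, non-overlapping extents). */
static const struct extent *lookup(const struct extent *set, int n,
                                   unsigned long long start)
{
    for (int i = 0; i < n; i++)
        if (set[i].start + set[i].len > start)
            return &set[i];
    return NULL;
}

int main(void)
{
    const struct extent chunks[] = { { 0, 16 }, { 16, 32 }, { 64, 8 } };
    const struct extent *em;
    unsigned long long start = 0;

    while ((em = lookup(chunks, 3, start)) != NULL) {
        printf("chunk start=%llu len=%llu\n", em->start, em->len);
        start = em->start + em->len;   /* continue after this chunk */
    }
    return 0;
}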
++ */ ++ em = lookup_extent_mapping(&map_tree->map_tree, start, 1); ++ read_unlock(&map_tree->map_tree.lock); ++ if (!em) ++ break; ++ ++ bg = btrfs_lookup_block_group(fs_info, em->start); ++ if (!bg) { ++ btrfs_err(fs_info, ++ "chunk start=%llu len=%llu doesn't have corresponding block group", ++ em->start, em->len); ++ ret = -EUCLEAN; ++ free_extent_map(em); ++ break; ++ } ++ if (bg->key.objectid != em->start || ++ bg->key.offset != em->len || ++ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != ++ (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { ++ btrfs_err(fs_info, ++"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", ++ em->start, em->len, ++ em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, ++ bg->key.objectid, bg->key.offset, ++ bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ++ ret = -EUCLEAN; ++ free_extent_map(em); ++ btrfs_put_block_group(bg); ++ break; ++ } ++ start = em->start + em->len; ++ free_extent_map(em); ++ btrfs_put_block_group(bg); ++ } ++ return ret; ++} ++ + int btrfs_read_block_groups(struct btrfs_fs_info *info) + { + struct btrfs_path *path; +@@ -10264,7 +10346,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) + } + + init_global_block_rsv(info); +- ret = 0; ++ ret = check_chunk_block_group_mappings(info); + error: + btrfs_free_path(path); + return ret; +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index eeae2c3ab17e..5feb8b03ffe8 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -4048,6 +4048,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) + restart: + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans); ++ trans = NULL; + continue; + } + +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index fe960d5e8913..49a02bf091ae 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -2176,6 +2176,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); ++ vol->name[BTRFS_PATH_NAME_MAX] = '\0'; + + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index f74005ca8f08..73c1fbca0c35 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1955,6 +1955,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) + return ret; + } + ++ btrfs_trans_release_metadata(trans, fs_info); ++ trans->block_rsv = NULL; ++ + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ +@@ -1964,9 +1967,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) + return ret; + } + +- btrfs_trans_release_metadata(trans, fs_info); +- trans->block_rsv = NULL; +- + cur_trans = trans->transaction; + + /* +diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c +new file mode 100644 +index 000000000000..f206aec1525d +--- /dev/null ++++ b/fs/btrfs/tree-checker.c +@@ -0,0 +1,649 @@ ++/* ++ * Copyright (C) Qu Wenruo 2017. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public ++ * License v2 as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this program. ++ */ ++ ++/* ++ * The module is used to catch unexpected/corrupted tree block data. ++ * Such behavior can be caused either by a fuzzed image or bugs. ++ * ++ * The objective is to do leaf/node validation checks when tree block is read ++ * from disk, and check *every* possible member, so other code won't ++ * need to checking them again. ++ * ++ * Due to the potential and unwanted damage, every checker needs to be ++ * carefully reviewed otherwise so it does not prevent mount of valid images. ++ */ ++ ++#include "ctree.h" ++#include "tree-checker.h" ++#include "disk-io.h" ++#include "compression.h" ++#include "hash.h" ++#include "volumes.h" ++ ++#define CORRUPT(reason, eb, root, slot) \ ++ btrfs_crit(root->fs_info, \ ++ "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ ++ btrfs_header_level(eb) == 0 ? "leaf" : "node", \ ++ reason, btrfs_header_bytenr(eb), root->objectid, slot) ++ ++/* ++ * Error message should follow the following format: ++ * corrupt <type>: <identifier>, <reason>[, <bad_value>] ++ * ++ * @type: leaf or node ++ * @identifier: the necessary info to locate the leaf/node. ++ * It's recommened to decode key.objecitd/offset if it's ++ * meaningful. ++ * @reason: describe the error ++ * @bad_value: optional, it's recommened to output bad value and its ++ * expected value (range). ++ * ++ * Since comma is used to separate the components, only space is allowed ++ * inside each component. ++ */ ++ ++/* ++ * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. ++ * Allows callers to customize the output. ++ */ ++__printf(4, 5) ++static void generic_err(const struct btrfs_root *root, ++ const struct extent_buffer *eb, int slot, ++ const char *fmt, ...) ++{ ++ struct va_format vaf; ++ va_list args; ++ ++ va_start(args, fmt); ++ ++ vaf.fmt = fmt; ++ vaf.va = &args; ++ ++ btrfs_crit(root->fs_info, ++ "corrupt %s: root=%llu block=%llu slot=%d, %pV", ++ btrfs_header_level(eb) == 0 ? "leaf" : "node", ++ root->objectid, btrfs_header_bytenr(eb), slot, &vaf); ++ va_end(args); ++} ++ ++static int check_extent_data_item(struct btrfs_root *root, ++ struct extent_buffer *leaf, ++ struct btrfs_key *key, int slot) ++{ ++ struct btrfs_file_extent_item *fi; ++ u32 sectorsize = root->fs_info->sectorsize; ++ u32 item_size = btrfs_item_size_nr(leaf, slot); ++ ++ if (!IS_ALIGNED(key->offset, sectorsize)) { ++ CORRUPT("unaligned key offset for file extent", ++ leaf, root, slot); ++ return -EUCLEAN; ++ } ++ ++ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); ++ ++ if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { ++ CORRUPT("invalid file extent type", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ ++ /* ++ * Support for new compression/encrption must introduce incompat flag, ++ * and must be caught in open_ctree(). 
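generic_err() and the other report helpers in tree-checker.c are variadic wrappers that prepend the location (root, block, slot), so each checker only has to supply the reason text. A rough user-space analogue using vsnprintf instead of the kernel's %pV va_format mechanism (names invented):

#include <stdarg.h>
#include <stdio.h>

static void tree_err(unsigned long long root, unsigned long long block,
                     int slot, const char *fmt, ...)
{
    char msg[256];
    va_list args;

    va_start(args, fmt);
    vsnprintf(msg, sizeof(msg), fmt, args);
    va_end(args);

    fprintf(stderr, "corrupt leaf: root=%llu block=%llu slot=%d, %s\n",
            root, block, slot, msg);
}

int main(void)
{
    tree_err(5, 30507008, 12, "invalid file extent type, have %u", 42);
    return 0;
}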
++ */ ++ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { ++ CORRUPT("invalid file extent compression", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ if (btrfs_file_extent_encryption(leaf, fi)) { ++ CORRUPT("invalid file extent encryption", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { ++ /* Inline extent must have 0 as key offset */ ++ if (key->offset) { ++ CORRUPT("inline extent has non-zero key offset", ++ leaf, root, slot); ++ return -EUCLEAN; ++ } ++ ++ /* Compressed inline extent has no on-disk size, skip it */ ++ if (btrfs_file_extent_compression(leaf, fi) != ++ BTRFS_COMPRESS_NONE) ++ return 0; ++ ++ /* Uncompressed inline extent size must match item size */ ++ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + ++ btrfs_file_extent_ram_bytes(leaf, fi)) { ++ CORRUPT("plaintext inline extent has invalid size", ++ leaf, root, slot); ++ return -EUCLEAN; ++ } ++ return 0; ++ } ++ ++ /* Regular or preallocated extent has fixed item size */ ++ if (item_size != sizeof(*fi)) { ++ CORRUPT( ++ "regluar or preallocated extent data item size is invalid", ++ leaf, root, slot); ++ return -EUCLEAN; ++ } ++ if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) || ++ !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) || ++ !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) || ++ !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) || ++ !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) { ++ CORRUPT( ++ "regular or preallocated extent data item has unaligned value", ++ leaf, root, slot); ++ return -EUCLEAN; ++ } ++ ++ return 0; ++} ++ ++static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, ++ struct btrfs_key *key, int slot) ++{ ++ u32 sectorsize = root->fs_info->sectorsize; ++ u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); ++ ++ if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { ++ CORRUPT("invalid objectid for csum item", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ if (!IS_ALIGNED(key->offset, sectorsize)) { ++ CORRUPT("unaligned key offset for csum item", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { ++ CORRUPT("unaligned csum item size", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ return 0; ++} ++ ++/* ++ * Customized reported for dir_item, only important new info is key->objectid, ++ * which represents inode number ++ */ ++__printf(4, 5) ++static void dir_item_err(const struct btrfs_root *root, ++ const struct extent_buffer *eb, int slot, ++ const char *fmt, ...) ++{ ++ struct btrfs_key key; ++ struct va_format vaf; ++ va_list args; ++ ++ btrfs_item_key_to_cpu(eb, &key, slot); ++ va_start(args, fmt); ++ ++ vaf.fmt = fmt; ++ vaf.va = &args; ++ ++ btrfs_crit(root->fs_info, ++ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", ++ btrfs_header_level(eb) == 0 ? 
"leaf" : "node", root->objectid, ++ btrfs_header_bytenr(eb), slot, key.objectid, &vaf); ++ va_end(args); ++} ++ ++static int check_dir_item(struct btrfs_root *root, ++ struct extent_buffer *leaf, ++ struct btrfs_key *key, int slot) ++{ ++ struct btrfs_dir_item *di; ++ u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 cur = 0; ++ ++ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); ++ while (cur < item_size) { ++ u32 name_len; ++ u32 data_len; ++ u32 max_name_len; ++ u32 total_size; ++ u32 name_hash; ++ u8 dir_type; ++ ++ /* header itself should not cross item boundary */ ++ if (cur + sizeof(*di) > item_size) { ++ dir_item_err(root, leaf, slot, ++ "dir item header crosses item boundary, have %zu boundary %u", ++ cur + sizeof(*di), item_size); ++ return -EUCLEAN; ++ } ++ ++ /* dir type check */ ++ dir_type = btrfs_dir_type(leaf, di); ++ if (dir_type >= BTRFS_FT_MAX) { ++ dir_item_err(root, leaf, slot, ++ "invalid dir item type, have %u expect [0, %u)", ++ dir_type, BTRFS_FT_MAX); ++ return -EUCLEAN; ++ } ++ ++ if (key->type == BTRFS_XATTR_ITEM_KEY && ++ dir_type != BTRFS_FT_XATTR) { ++ dir_item_err(root, leaf, slot, ++ "invalid dir item type for XATTR key, have %u expect %u", ++ dir_type, BTRFS_FT_XATTR); ++ return -EUCLEAN; ++ } ++ if (dir_type == BTRFS_FT_XATTR && ++ key->type != BTRFS_XATTR_ITEM_KEY) { ++ dir_item_err(root, leaf, slot, ++ "xattr dir type found for non-XATTR key"); ++ return -EUCLEAN; ++ } ++ if (dir_type == BTRFS_FT_XATTR) ++ max_name_len = XATTR_NAME_MAX; ++ else ++ max_name_len = BTRFS_NAME_LEN; ++ ++ /* Name/data length check */ ++ name_len = btrfs_dir_name_len(leaf, di); ++ data_len = btrfs_dir_data_len(leaf, di); ++ if (name_len > max_name_len) { ++ dir_item_err(root, leaf, slot, ++ "dir item name len too long, have %u max %u", ++ name_len, max_name_len); ++ return -EUCLEAN; ++ } ++ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) { ++ dir_item_err(root, leaf, slot, ++ "dir item name and data len too long, have %u max %u", ++ name_len + data_len, ++ BTRFS_MAX_XATTR_SIZE(root->fs_info)); ++ return -EUCLEAN; ++ } ++ ++ if (data_len && dir_type != BTRFS_FT_XATTR) { ++ dir_item_err(root, leaf, slot, ++ "dir item with invalid data len, have %u expect 0", ++ data_len); ++ return -EUCLEAN; ++ } ++ ++ total_size = sizeof(*di) + name_len + data_len; ++ ++ /* header and name/data should not cross item boundary */ ++ if (cur + total_size > item_size) { ++ dir_item_err(root, leaf, slot, ++ "dir item data crosses item boundary, have %u boundary %u", ++ cur + total_size, item_size); ++ return -EUCLEAN; ++ } ++ ++ /* ++ * Special check for XATTR/DIR_ITEM, as key->offset is name ++ * hash, should match its name ++ */ ++ if (key->type == BTRFS_DIR_ITEM_KEY || ++ key->type == BTRFS_XATTR_ITEM_KEY) { ++ char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; ++ ++ read_extent_buffer(leaf, namebuf, ++ (unsigned long)(di + 1), name_len); ++ name_hash = btrfs_name_hash(namebuf, name_len); ++ if (key->offset != name_hash) { ++ dir_item_err(root, leaf, slot, ++ "name hash mismatch with key, have 0x%016x expect 0x%016llx", ++ name_hash, key->offset); ++ return -EUCLEAN; ++ } ++ } ++ cur += total_size; ++ di = (struct btrfs_dir_item *)((void *)di + total_size); ++ } ++ return 0; ++} ++ ++__printf(4, 5) ++__cold ++static void block_group_err(const struct btrfs_fs_info *fs_info, ++ const struct extent_buffer *eb, int slot, ++ const char *fmt, ...) 
++{ ++ struct btrfs_key key; ++ struct va_format vaf; ++ va_list args; ++ ++ btrfs_item_key_to_cpu(eb, &key, slot); ++ va_start(args, fmt); ++ ++ vaf.fmt = fmt; ++ vaf.va = &args; ++ ++ btrfs_crit(fs_info, ++ "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", ++ btrfs_header_level(eb) == 0 ? "leaf" : "node", ++ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, ++ key.objectid, key.offset, &vaf); ++ va_end(args); ++} ++ ++static int check_block_group_item(struct btrfs_fs_info *fs_info, ++ struct extent_buffer *leaf, ++ struct btrfs_key *key, int slot) ++{ ++ struct btrfs_block_group_item bgi; ++ u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u64 flags; ++ u64 type; ++ ++ /* ++ * Here we don't really care about alignment since extent allocator can ++ * handle it. We care more about the size, as if one block group is ++ * larger than maximum size, it's must be some obvious corruption. ++ */ ++ if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) { ++ block_group_err(fs_info, leaf, slot, ++ "invalid block group size, have %llu expect (0, %llu]", ++ key->offset, BTRFS_MAX_DATA_CHUNK_SIZE); ++ return -EUCLEAN; ++ } ++ ++ if (item_size != sizeof(bgi)) { ++ block_group_err(fs_info, leaf, slot, ++ "invalid item size, have %u expect %zu", ++ item_size, sizeof(bgi)); ++ return -EUCLEAN; ++ } ++ ++ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), ++ sizeof(bgi)); ++ if (btrfs_block_group_chunk_objectid(&bgi) != ++ BTRFS_FIRST_CHUNK_TREE_OBJECTID) { ++ block_group_err(fs_info, leaf, slot, ++ "invalid block group chunk objectid, have %llu expect %llu", ++ btrfs_block_group_chunk_objectid(&bgi), ++ BTRFS_FIRST_CHUNK_TREE_OBJECTID); ++ return -EUCLEAN; ++ } ++ ++ if (btrfs_block_group_used(&bgi) > key->offset) { ++ block_group_err(fs_info, leaf, slot, ++ "invalid block group used, have %llu expect [0, %llu)", ++ btrfs_block_group_used(&bgi), key->offset); ++ return -EUCLEAN; ++ } ++ ++ flags = btrfs_block_group_flags(&bgi); ++ if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { ++ block_group_err(fs_info, leaf, slot, ++"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", ++ flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, ++ hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)); ++ return -EUCLEAN; ++ } ++ ++ type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; ++ if (type != BTRFS_BLOCK_GROUP_DATA && ++ type != BTRFS_BLOCK_GROUP_METADATA && ++ type != BTRFS_BLOCK_GROUP_SYSTEM && ++ type != (BTRFS_BLOCK_GROUP_METADATA | ++ BTRFS_BLOCK_GROUP_DATA)) { ++ block_group_err(fs_info, leaf, slot, ++"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", ++ type, hweight64(type), ++ BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, ++ BTRFS_BLOCK_GROUP_SYSTEM, ++ BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); ++ return -EUCLEAN; ++ } ++ return 0; ++} ++ ++/* ++ * Common point to switch the item-specific validation. 
++ */ ++static int check_leaf_item(struct btrfs_root *root, ++ struct extent_buffer *leaf, ++ struct btrfs_key *key, int slot) ++{ ++ int ret = 0; ++ ++ switch (key->type) { ++ case BTRFS_EXTENT_DATA_KEY: ++ ret = check_extent_data_item(root, leaf, key, slot); ++ break; ++ case BTRFS_EXTENT_CSUM_KEY: ++ ret = check_csum_item(root, leaf, key, slot); ++ break; ++ case BTRFS_DIR_ITEM_KEY: ++ case BTRFS_DIR_INDEX_KEY: ++ case BTRFS_XATTR_ITEM_KEY: ++ ret = check_dir_item(root, leaf, key, slot); ++ break; ++ case BTRFS_BLOCK_GROUP_ITEM_KEY: ++ ret = check_block_group_item(root->fs_info, leaf, key, slot); ++ break; ++ } ++ return ret; ++} ++ ++static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, ++ bool check_item_data) ++{ ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ /* No valid key type is 0, so all key should be larger than this key */ ++ struct btrfs_key prev_key = {0, 0, 0}; ++ struct btrfs_key key; ++ u32 nritems = btrfs_header_nritems(leaf); ++ int slot; ++ ++ if (btrfs_header_level(leaf) != 0) { ++ generic_err(root, leaf, 0, ++ "invalid level for leaf, have %d expect 0", ++ btrfs_header_level(leaf)); ++ return -EUCLEAN; ++ } ++ ++ /* ++ * Extent buffers from a relocation tree have a owner field that ++ * corresponds to the subvolume tree they are based on. So just from an ++ * extent buffer alone we can not find out what is the id of the ++ * corresponding subvolume tree, so we can not figure out if the extent ++ * buffer corresponds to the root of the relocation tree or not. So ++ * skip this check for relocation trees. ++ */ ++ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { ++ u64 owner = btrfs_header_owner(leaf); ++ struct btrfs_root *check_root; ++ ++ /* These trees must never be empty */ ++ if (owner == BTRFS_ROOT_TREE_OBJECTID || ++ owner == BTRFS_CHUNK_TREE_OBJECTID || ++ owner == BTRFS_EXTENT_TREE_OBJECTID || ++ owner == BTRFS_DEV_TREE_OBJECTID || ++ owner == BTRFS_FS_TREE_OBJECTID || ++ owner == BTRFS_DATA_RELOC_TREE_OBJECTID) { ++ generic_err(root, leaf, 0, ++ "invalid root, root %llu must never be empty", ++ owner); ++ return -EUCLEAN; ++ } ++ key.objectid = owner; ++ key.type = BTRFS_ROOT_ITEM_KEY; ++ key.offset = (u64)-1; ++ ++ check_root = btrfs_get_fs_root(fs_info, &key, false); ++ /* ++ * The only reason we also check NULL here is that during ++ * open_ctree() some roots has not yet been set up. ++ */ ++ if (!IS_ERR_OR_NULL(check_root)) { ++ struct extent_buffer *eb; ++ ++ eb = btrfs_root_node(check_root); ++ /* if leaf is the root, then it's fine */ ++ if (leaf != eb) { ++ CORRUPT("non-root leaf's nritems is 0", ++ leaf, check_root, 0); ++ free_extent_buffer(eb); ++ return -EUCLEAN; ++ } ++ free_extent_buffer(eb); ++ } ++ return 0; ++ } ++ ++ if (nritems == 0) ++ return 0; ++ ++ /* ++ * Check the following things to make sure this is a good leaf, and ++ * leaf users won't need to bother with similar sanity checks: ++ * ++ * 1) key ordering ++ * 2) item offset and size ++ * No overlap, no hole, all inside the leaf. ++ * 3) item content ++ * If possible, do comprehensive sanity check. ++ * NOTE: All checks must only rely on the item data itself. 
++ */ ++ for (slot = 0; slot < nritems; slot++) { ++ u32 item_end_expected; ++ int ret; ++ ++ btrfs_item_key_to_cpu(leaf, &key, slot); ++ ++ /* Make sure the keys are in the right order */ ++ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { ++ CORRUPT("bad key order", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ ++ /* ++ * Make sure the offset and ends are right, remember that the ++ * item data starts at the end of the leaf and grows towards the ++ * front. ++ */ ++ if (slot == 0) ++ item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); ++ else ++ item_end_expected = btrfs_item_offset_nr(leaf, ++ slot - 1); ++ if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { ++ CORRUPT("slot offset bad", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ ++ /* ++ * Check to make sure that we don't point outside of the leaf, ++ * just in case all the items are consistent to each other, but ++ * all point outside of the leaf. ++ */ ++ if (btrfs_item_end_nr(leaf, slot) > ++ BTRFS_LEAF_DATA_SIZE(fs_info)) { ++ CORRUPT("slot end outside of leaf", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ ++ /* Also check if the item pointer overlaps with btrfs item. */ ++ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > ++ btrfs_item_ptr_offset(leaf, slot)) { ++ CORRUPT("slot overlap with its data", leaf, root, slot); ++ return -EUCLEAN; ++ } ++ ++ if (check_item_data) { ++ /* ++ * Check if the item size and content meet other ++ * criteria ++ */ ++ ret = check_leaf_item(root, leaf, &key, slot); ++ if (ret < 0) ++ return ret; ++ } ++ ++ prev_key.objectid = key.objectid; ++ prev_key.type = key.type; ++ prev_key.offset = key.offset; ++ } ++ ++ return 0; ++} ++ ++int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf) ++{ ++ return check_leaf(root, leaf, true); ++} ++ ++int btrfs_check_leaf_relaxed(struct btrfs_root *root, ++ struct extent_buffer *leaf) ++{ ++ return check_leaf(root, leaf, false); ++} ++ ++int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) ++{ ++ unsigned long nr = btrfs_header_nritems(node); ++ struct btrfs_key key, next_key; ++ int slot; ++ int level = btrfs_header_level(node); ++ u64 bytenr; ++ int ret = 0; ++ ++ if (level <= 0 || level >= BTRFS_MAX_LEVEL) { ++ generic_err(root, node, 0, ++ "invalid level for node, have %d expect [1, %d]", ++ level, BTRFS_MAX_LEVEL - 1); ++ return -EUCLEAN; ++ } ++ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { ++ btrfs_crit(root->fs_info, ++"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", ++ root->objectid, node->start, ++ nr == 0 ? 
"small" : "large", nr, ++ BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)); ++ return -EUCLEAN; ++ } ++ ++ for (slot = 0; slot < nr - 1; slot++) { ++ bytenr = btrfs_node_blockptr(node, slot); ++ btrfs_node_key_to_cpu(node, &key, slot); ++ btrfs_node_key_to_cpu(node, &next_key, slot + 1); ++ ++ if (!bytenr) { ++ generic_err(root, node, slot, ++ "invalid NULL node pointer"); ++ ret = -EUCLEAN; ++ goto out; ++ } ++ if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) { ++ generic_err(root, node, slot, ++ "unaligned pointer, have %llu should be aligned to %u", ++ bytenr, root->fs_info->sectorsize); ++ ret = -EUCLEAN; ++ goto out; ++ } ++ ++ if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { ++ generic_err(root, node, slot, ++ "bad key order, current (%llu %u %llu) next (%llu %u %llu)", ++ key.objectid, key.type, key.offset, ++ next_key.objectid, next_key.type, ++ next_key.offset); ++ ret = -EUCLEAN; ++ goto out; ++ } ++ } ++out: ++ return ret; ++} +diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h +new file mode 100644 +index 000000000000..3d53e8d6fda0 +--- /dev/null ++++ b/fs/btrfs/tree-checker.h +@@ -0,0 +1,38 @@ ++/* ++ * Copyright (C) Qu Wenruo 2017. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public ++ * License v2 as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this program. ++ */ ++ ++#ifndef __BTRFS_TREE_CHECKER__ ++#define __BTRFS_TREE_CHECKER__ ++ ++#include "ctree.h" ++#include "extent_io.h" ++ ++/* ++ * Comprehensive leaf checker. ++ * Will check not only the item pointers, but also every possible member ++ * in item data. ++ */ ++int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf); ++ ++/* ++ * Less strict leaf checker. ++ * Will only check item pointers, not reading item data. 
++ */ ++int btrfs_check_leaf_relaxed(struct btrfs_root *root, ++ struct extent_buffer *leaf); ++int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); ++ ++#endif +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index a0947f4a3e87..9663b6aa2a56 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -4647,7 +4647,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_stripe_size = SZ_1G; +- max_chunk_size = 10 * max_stripe_size; ++ max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { +@@ -6353,6 +6353,8 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, + u16 num_stripes; + u16 sub_stripes; + u64 type; ++ u64 features; ++ bool mixed = false; + + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); +@@ -6391,6 +6393,32 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, + btrfs_chunk_type(leaf, chunk)); + return -EIO; + } ++ ++ if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { ++ btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); ++ return -EIO; ++ } ++ ++ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && ++ (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { ++ btrfs_err(fs_info, ++ "system chunk with data or metadata type: 0x%llx", type); ++ return -EIO; ++ } ++ ++ features = btrfs_super_incompat_flags(fs_info->super_copy); ++ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) ++ mixed = true; ++ ++ if (!mixed) { ++ if ((type & BTRFS_BLOCK_GROUP_METADATA) && ++ (type & BTRFS_BLOCK_GROUP_DATA)) { ++ btrfs_err(fs_info, ++ "mixed chunk type in non-mixed mode: 0x%llx", type); ++ return -EIO; ++ } ++ } ++ + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index c5dd48eb7b3d..76fb6e84f201 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -24,6 +24,8 @@ + #include <linux/btrfs.h> + #include "async-thread.h" + ++#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) ++ + extern struct mutex uuid_mutex; + + #define BTRFS_STRIPE_LEN SZ_64K +diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c +index bf378ddca4db..a48984dd6426 100644 +--- a/fs/ceph/mds_client.c ++++ b/fs/ceph/mds_client.c +@@ -4079,6 +4079,16 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + return auth; + } + ++static int add_authorizer_challenge(struct ceph_connection *con, ++ void *challenge_buf, int challenge_buf_len) ++{ ++ struct ceph_mds_session *s = con->private; ++ struct ceph_mds_client *mdsc = s->s_mdsc; ++ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; ++ ++ return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer, ++ challenge_buf, challenge_buf_len); ++} + + static int verify_authorizer_reply(struct ceph_connection *con) + { +@@ -4142,6 +4152,7 @@ static const struct ceph_connection_operations mds_con_ops = { + .put = con_put, + .dispatch = dispatch, + .get_authorizer = get_authorizer, ++ .add_authorizer_challenge = add_authorizer_challenge, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .peer_reset = peer_reset, +diff --git a/fs/direct-io.c b/fs/direct-io.c +index 625a84aa6484..40567501015f 100644 +--- a/fs/direct-io.c ++++ 
b/fs/direct-io.c +@@ -304,8 +304,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) + */ + dio->iocb->ki_pos += transferred; + +- if (dio->op == REQ_OP_WRITE) +- ret = generic_write_sync(dio->iocb, transferred); ++ if (ret > 0 && dio->op == REQ_OP_WRITE) ++ ret = generic_write_sync(dio->iocb, ret); + dio->iocb->ki_complete(dio->iocb, ret, 0); + } + +diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c +index 62d9a659a8ff..dd8f10db82e9 100644 +--- a/fs/ext2/xattr.c ++++ b/fs/ext2/xattr.c +@@ -612,9 +612,9 @@ skip_replace: + } + + cleanup: +- brelse(bh); + if (!(bh && header == HDR(bh))) + kfree(header); ++ brelse(bh); + up_write(&EXT2_I(inode)->xattr_sem); + + return error; +diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c +index 41fce930f44c..624817eeb25e 100644 +--- a/fs/f2fs/checkpoint.c ++++ b/fs/f2fs/checkpoint.c +@@ -69,6 +69,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, + .old_blkaddr = index, + .new_blkaddr = index, + .encrypted_page = NULL, ++ .is_meta = is_meta, + }; + + if (unlikely(!is_meta)) +@@ -85,8 +86,10 @@ repeat: + fio.page = page; + + if (f2fs_submit_page_bio(&fio)) { +- f2fs_put_page(page, 1); +- goto repeat; ++ memset(page_address(page), 0, PAGE_SIZE); ++ f2fs_stop_checkpoint(sbi, false); ++ f2fs_bug_on(sbi, 1); ++ return page; + } + + lock_page(page); +@@ -117,7 +120,8 @@ struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) + return __get_meta_page(sbi, index, false); + } + +-bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) ++bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, ++ block_t blkaddr, int type) + { + switch (type) { + case META_NAT: +@@ -137,8 +141,20 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) + return false; + break; + case META_POR: ++ case DATA_GENERIC: + if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || +- blkaddr < MAIN_BLKADDR(sbi))) ++ blkaddr < MAIN_BLKADDR(sbi))) { ++ if (type == DATA_GENERIC) { ++ f2fs_msg(sbi->sb, KERN_WARNING, ++ "access invalid blkaddr:%u", blkaddr); ++ WARN_ON(1); ++ } ++ return false; ++ } ++ break; ++ case META_GENERIC: ++ if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || ++ blkaddr >= MAIN_BLKADDR(sbi))) + return false; + break; + default: +@@ -163,6 +179,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + .op_flags = sync ? 
(REQ_META | REQ_PRIO) : REQ_RAHEAD, + .encrypted_page = NULL, + .in_list = false, ++ .is_meta = (type != META_POR), + }; + struct blk_plug plug; + +@@ -172,7 +189,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + blk_start_plug(&plug); + for (; nrpages-- > 0; blkno++) { + +- if (!is_valid_blkaddr(sbi, blkno, type)) ++ if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) + goto out; + + switch (type) { +@@ -737,6 +754,14 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, + &cp_page_1, version); + if (err) + return NULL; ++ ++ if (le32_to_cpu(cp_block->cp_pack_total_block_count) > ++ sbi->blocks_per_seg) { ++ f2fs_msg(sbi->sb, KERN_WARNING, ++ "invalid cp_pack_total_block_count:%u", ++ le32_to_cpu(cp_block->cp_pack_total_block_count)); ++ goto invalid_cp; ++ } + pre_version = *version; + + cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; +@@ -800,15 +825,15 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) + cp_block = (struct f2fs_checkpoint *)page_address(cur_page); + memcpy(sbi->ckpt, cp_block, blk_size); + +- /* Sanity checking of checkpoint */ +- if (sanity_check_ckpt(sbi)) +- goto free_fail_no_cp; +- + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + ++ /* Sanity checking of checkpoint */ ++ if (sanity_check_ckpt(sbi)) ++ goto free_fail_no_cp; ++ + if (cp_blks <= 1) + goto done; + +diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c +index 6fbb6d75318a..8f6e7c3a10f8 100644 +--- a/fs/f2fs/data.c ++++ b/fs/f2fs/data.c +@@ -369,6 +369,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) + struct page *page = fio->encrypted_page ? + fio->encrypted_page : fio->page; + ++ if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, ++ __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) ++ return -EFAULT; ++ + trace_f2fs_submit_page_bio(page, fio); + f2fs_trace_ios(fio, 0); + +@@ -412,9 +416,9 @@ next: + spin_unlock(&io->io_lock); + } + +- if (fio->old_blkaddr != NEW_ADDR) +- verify_block_addr(sbi, fio->old_blkaddr); +- verify_block_addr(sbi, fio->new_blkaddr); ++ if (__is_valid_data_blkaddr(fio->old_blkaddr)) ++ verify_block_addr(fio, fio->old_blkaddr); ++ verify_block_addr(fio, fio->new_blkaddr); + + bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; + +@@ -945,7 +949,13 @@ next_dnode: + next_block: + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + +- if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { ++ if (__is_valid_data_blkaddr(blkaddr) && ++ !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { ++ err = -EFAULT; ++ goto sync_out; ++ } ++ ++ if (!is_valid_data_blkaddr(sbi, blkaddr)) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; +@@ -1263,6 +1273,10 @@ got_it: + SetPageUptodate(page); + goto confused; + } ++ ++ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, ++ DATA_GENERIC)) ++ goto set_error_page; + } else { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) +@@ -1387,15 +1401,6 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) + return need_inplace_update_policy(inode, fio); + } + +-static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +-{ +- if (fio->old_blkaddr == NEW_ADDR) +- return false; +- if (fio->old_blkaddr == NULL_ADDR) +- return false; +- return true; +-} +- + int do_write_data_page(struct f2fs_io_info *fio) + { + struct page *page = fio->page; +@@ -1410,11 +1415,13 @@ int do_write_data_page(struct f2fs_io_info *fio) + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + +- if (valid_ipu_blkaddr(fio)) { +- ipu_force = true; +- fio->need_lock = LOCK_DONE; +- goto got_it; +- } ++ if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, ++ DATA_GENERIC)) ++ return -EFAULT; ++ ++ ipu_force = true; ++ fio->need_lock = LOCK_DONE; ++ goto got_it; + } + + /* Deadlock due to between page->lock and f2fs_lock_op */ +@@ -1433,11 +1440,18 @@ int do_write_data_page(struct f2fs_io_info *fio) + goto out_writepage; + } + got_it: ++ if (__is_valid_data_blkaddr(fio->old_blkaddr) && ++ !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, ++ DATA_GENERIC)) { ++ err = -EFAULT; ++ goto out_writepage; ++ } + /* + * If current allocation needs SSR, + * it had better in-place writes for updated data. 
+ */ +- if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { ++ if (ipu_force || (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) && ++ need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; +diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h +index 54f8520ad7a2..3f1a44696036 100644 +--- a/fs/f2fs/f2fs.h ++++ b/fs/f2fs/f2fs.h +@@ -162,7 +162,7 @@ struct cp_control { + }; + + /* +- * For CP/NAT/SIT/SSA readahead ++ * indicate meta/data type + */ + enum { + META_CP, +@@ -170,6 +170,8 @@ enum { + META_SIT, + META_SSA, + META_POR, ++ DATA_GENERIC, ++ META_GENERIC, + }; + + /* for the list of ino */ +@@ -910,6 +912,7 @@ struct f2fs_io_info { + bool submitted; /* indicate IO submission */ + int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ ++ bool is_meta; /* indicate borrow meta inode mapping or not */ + enum iostat_type io_type; /* io type */ + }; + +@@ -2354,6 +2357,39 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + spin_unlock(&sbi->iostat_lock); + } + ++#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ ++ (!is_read_io(fio->op) || fio->is_meta)) ++ ++bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, ++ block_t blkaddr, int type); ++void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); ++static inline void verify_blkaddr(struct f2fs_sb_info *sbi, ++ block_t blkaddr, int type) ++{ ++ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) { ++ f2fs_msg(sbi->sb, KERN_ERR, ++ "invalid blkaddr: %u, type: %d, run fsck to fix.", ++ blkaddr, type); ++ f2fs_bug_on(sbi, 1); ++ } ++} ++ ++static inline bool __is_valid_data_blkaddr(block_t blkaddr) ++{ ++ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) ++ return false; ++ return true; ++} ++ ++static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, ++ block_t blkaddr) ++{ ++ if (!__is_valid_data_blkaddr(blkaddr)) ++ return false; ++ verify_blkaddr(sbi, blkaddr, DATA_GENERIC); ++ return true; ++} ++ + /* + * file.c + */ +@@ -2564,7 +2600,8 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); + struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); + struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); + struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +-bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); ++bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, ++ block_t blkaddr, int type); + int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); + void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c +index 6f589730782d..7d3189f1941c 100644 +--- a/fs/f2fs/file.c ++++ b/fs/f2fs/file.c +@@ -328,13 +328,13 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, + return pgofs; + } + +-static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, +- int whence) ++static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr, ++ pgoff_t dirty, pgoff_t pgofs, int whence) + { + switch (whence) { + case SEEK_DATA: + if ((blkaddr == NEW_ADDR && dirty == pgofs) || +- (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) ++ is_valid_data_blkaddr(sbi, blkaddr)) + return true; + break; + case SEEK_HOLE: +@@ -397,7 +397,15 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); + +- if 
(__found_offset(blkaddr, dirty, pgofs, whence)) { ++ if (__is_valid_data_blkaddr(blkaddr) && ++ !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), ++ blkaddr, DATA_GENERIC)) { ++ f2fs_put_dnode(&dn); ++ goto fail; ++ } ++ ++ if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, ++ pgofs, whence)) { + f2fs_put_dnode(&dn); + goto found; + } +@@ -495,6 +503,11 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) + + dn->data_blkaddr = NULL_ADDR; + set_data_blkaddr(dn); ++ ++ if (__is_valid_data_blkaddr(blkaddr) && ++ !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) ++ continue; ++ + invalidate_blocks(sbi, blkaddr); + if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); +diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c +index 259b0aa283f0..9a40724dbaa6 100644 +--- a/fs/f2fs/inode.c ++++ b/fs/f2fs/inode.c +@@ -62,11 +62,12 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) + } + } + +-static bool __written_first_block(struct f2fs_inode *ri) ++static bool __written_first_block(struct f2fs_sb_info *sbi, ++ struct f2fs_inode *ri) + { + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); + +- if (addr != NEW_ADDR && addr != NULL_ADDR) ++ if (is_valid_data_blkaddr(sbi, addr)) + return true; + return false; + } +@@ -179,6 +180,72 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); + } + ++static bool sanity_check_inode(struct inode *inode, struct page *node_page) ++{ ++ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); ++ struct f2fs_inode_info *fi = F2FS_I(inode); ++ unsigned long long iblocks; ++ ++ iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); ++ if (!iblocks) { ++ set_sbi_flag(sbi, SBI_NEED_FSCK); ++ f2fs_msg(sbi->sb, KERN_WARNING, ++ "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, " ++ "run fsck to fix.", ++ __func__, inode->i_ino, iblocks); ++ return false; ++ } ++ ++ if (ino_of_node(node_page) != nid_of_node(node_page)) { ++ set_sbi_flag(sbi, SBI_NEED_FSCK); ++ f2fs_msg(sbi->sb, KERN_WARNING, ++ "%s: corrupted inode footer i_ino=%lx, ino,nid: " ++ "[%u, %u] run fsck to fix.", ++ __func__, inode->i_ino, ++ ino_of_node(node_page), nid_of_node(node_page)); ++ return false; ++ } ++ ++ if (f2fs_has_extra_attr(inode) && ++ !f2fs_sb_has_extra_attr(sbi->sb)) { ++ set_sbi_flag(sbi, SBI_NEED_FSCK); ++ f2fs_msg(sbi->sb, KERN_WARNING, ++ "%s: inode (ino=%lx) is with extra_attr, " ++ "but extra_attr feature is off", ++ __func__, inode->i_ino); ++ return false; ++ } ++ ++ if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || ++ fi->i_extra_isize % sizeof(__le32)) { ++ set_sbi_flag(sbi, SBI_NEED_FSCK); ++ f2fs_msg(sbi->sb, KERN_WARNING, ++ "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, " ++ "max: %zu", ++ __func__, inode->i_ino, fi->i_extra_isize, ++ F2FS_TOTAL_EXTRA_ATTR_SIZE); ++ return false; ++ } ++ ++ if (F2FS_I(inode)->extent_tree) { ++ struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; ++ ++ if (ei->len && ++ (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC) || ++ !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, ++ DATA_GENERIC))) { ++ set_sbi_flag(sbi, SBI_NEED_FSCK); ++ f2fs_msg(sbi->sb, KERN_WARNING, ++ "%s: inode (ino=%lx) extent info [%u, %u, %u] " ++ "is incorrect, run fsck to fix", ++ __func__, inode->i_ino, ++ ei->blk, ei->fofs, ei->len); ++ return false; ++ } ++ } ++ return true; ++} ++ + static int do_read_inode(struct inode *inode) + { + struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); +@@ -228,6 +295,11 @@ static int do_read_inode(struct inode *inode) + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + ++ if (!sanity_check_inode(inode, node_page)) { ++ f2fs_put_page(node_page, 1); ++ return -EINVAL; ++ } ++ + /* check data exist */ + if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) + __recover_inline_status(inode, node_page); +@@ -235,7 +307,7 @@ static int do_read_inode(struct inode *inode) + /* get rdev by using inline_info */ + __get_inode_rdev(inode, ri); + +- if (__written_first_block(ri)) ++ if (__written_first_block(sbi, ri)) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + if (!need_inode_block_update(sbi, inode->i_ino)) +diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c +index 712505ec5de4..65de72d65562 100644 +--- a/fs/f2fs/node.c ++++ b/fs/f2fs/node.c +@@ -334,8 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, + new_blkaddr == NULL_ADDR); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && + new_blkaddr == NEW_ADDR); +- f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR && +- nat_get_blkaddr(e) != NULL_ADDR && ++ f2fs_bug_on(sbi, is_valid_data_blkaddr(sbi, nat_get_blkaddr(e)) && + new_blkaddr == NEW_ADDR); + + /* increment version no as node is removed */ +@@ -350,7 +349,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, + + /* change address */ + nat_set_blkaddr(e, new_blkaddr); +- if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR) ++ if (!is_valid_data_blkaddr(sbi, new_blkaddr)) + set_nat_flag(e, IS_CHECKPOINTED, false); + __set_nat_cache_dirty(nm_i, e); + +@@ -1399,6 +1398,12 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, + return 0; + } + ++ if (__is_valid_data_blkaddr(ni.blk_addr) && ++ !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) { ++ up_read(&sbi->node_write); ++ goto redirty_out; ++ } ++ + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + +diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c +index 765fadf954af..6ea445377767 100644 +--- a/fs/f2fs/recovery.c ++++ b/fs/f2fs/recovery.c +@@ -236,7 +236,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + while (1) { + struct fsync_inode_entry *entry; + +- if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) ++ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) + return 0; + + page = get_tmp_page(sbi, blkaddr); +@@ -479,7 +479,7 @@ retry_dn: + } + + /* dest is valid block, try to recover from src to dest */ +- if (is_valid_blkaddr(sbi, dest, META_POR)) { ++ if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { + + if (src == NULL_ADDR) { + err = reserve_new_block(&dn); +@@ -540,7 +540,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + while (1) { + struct fsync_inode_entry *entry; + +- if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) ++ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) + break; + + ra_meta_pages_cond(sbi, blkaddr); +diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c +index 3c7bbbae0afa..5c698757e116 100644 +--- a/fs/f2fs/segment.c ++++ b/fs/f2fs/segment.c +@@ -1758,7 +1758,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) + struct seg_entry *se; + bool is_cp = false; + +- if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) ++ if (!is_valid_data_blkaddr(sbi, blkaddr)) + return true; + + mutex_lock(&sit_i->sentry_lock); +@@ -2571,7 +2571,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, 
block_t blkaddr) + { + struct page *cpage; + +- if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) ++ if (!is_valid_data_blkaddr(sbi, blkaddr)) + return; + + cpage = find_lock_page(META_MAPPING(sbi), blkaddr); +@@ -3304,6 +3304,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); ++ if (start >= MAIN_SEGS(sbi)) { ++ f2fs_msg(sbi->sb, KERN_ERR, ++ "Wrong journal entry on segno %u", ++ start); ++ set_sbi_flag(sbi, SBI_NEED_FSCK); ++ err = -EINVAL; ++ break; ++ } ++ + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + +diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h +index 4dfb5080098f..47348d98165b 100644 +--- a/fs/f2fs/segment.h ++++ b/fs/f2fs/segment.h +@@ -53,13 +53,19 @@ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ + +-#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) +-#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) ++#define MAIN_BLKADDR(sbi) \ ++ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ ++ le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) ++#define SEG0_BLKADDR(sbi) \ ++ (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ ++ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) + + #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) + #define MAIN_SECS(sbi) ((sbi)->total_sections) + +-#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) ++#define TOTAL_SEGS(sbi) \ ++ (SM_I(sbi) ? SM_I(sbi)->segment_count : \ ++ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) + #define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) + + #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) +@@ -79,7 +85,7 @@ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) + + #define GET_SEGNO(sbi, blk_addr) \ +- ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ ++ ((!is_valid_data_blkaddr(sbi, blk_addr)) ? 
\ + NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ + GET_SEGNO_FROM_SEG0(sbi, blk_addr))) + #define BLKS_PER_SEC(sbi) \ +@@ -619,10 +625,14 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) + f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); + } + +-static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) ++static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) + { +- BUG_ON(blk_addr < SEG0_BLKADDR(sbi) +- || blk_addr >= MAX_BLKADDR(sbi)); ++ struct f2fs_sb_info *sbi = fio->sbi; ++ ++ if (__is_meta_io(fio)) ++ verify_blkaddr(sbi, blk_addr, META_GENERIC); ++ else ++ verify_blkaddr(sbi, blk_addr, DATA_GENERIC); + } + + /* +diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c +index 7cda685296b2..de4de4ebe64c 100644 +--- a/fs/f2fs/super.c ++++ b/fs/f2fs/super.c +@@ -1807,6 +1807,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) + { ++ block_t segment_count, segs_per_sec, secs_per_zone; ++ block_t total_sections, blocks_per_seg; + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; +@@ -1863,6 +1865,68 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + return 1; + } + ++ segment_count = le32_to_cpu(raw_super->segment_count); ++ segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); ++ secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); ++ total_sections = le32_to_cpu(raw_super->section_count); ++ ++ /* blocks_per_seg should be 512, given the above check */ ++ blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); ++ ++ if (segment_count > F2FS_MAX_SEGMENT || ++ segment_count < F2FS_MIN_SEGMENTS) { ++ f2fs_msg(sb, KERN_INFO, ++ "Invalid segment count (%u)", ++ segment_count); ++ return 1; ++ } ++ ++ if (total_sections > segment_count || ++ total_sections < F2FS_MIN_SEGMENTS || ++ segs_per_sec > segment_count || !segs_per_sec) { ++ f2fs_msg(sb, KERN_INFO, ++ "Invalid segment/section count (%u, %u x %u)", ++ segment_count, total_sections, segs_per_sec); ++ return 1; ++ } ++ ++ if ((segment_count / segs_per_sec) < total_sections) { ++ f2fs_msg(sb, KERN_INFO, ++ "Small segment_count (%u < %u * %u)", ++ segment_count, segs_per_sec, total_sections); ++ return 1; ++ } ++ ++ if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) { ++ f2fs_msg(sb, KERN_INFO, ++ "Wrong segment_count / block_count (%u > %u)", ++ segment_count, le32_to_cpu(raw_super->block_count)); ++ return 1; ++ } ++ ++ if (secs_per_zone > total_sections || !secs_per_zone) { ++ f2fs_msg(sb, KERN_INFO, ++ "Wrong secs_per_zone / total_sections (%u, %u)", ++ secs_per_zone, total_sections); ++ return 1; ++ } ++ if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION) { ++ f2fs_msg(sb, KERN_INFO, ++ "Corrupted extension count (%u > %u)", ++ le32_to_cpu(raw_super->extension_count), ++ F2FS_MAX_EXTENSION); ++ return 1; ++ } ++ ++ if (le32_to_cpu(raw_super->cp_payload) > ++ (blocks_per_seg - F2FS_CP_PACKS)) { ++ f2fs_msg(sb, KERN_INFO, ++ "Insane cp_payload (%u > %u)", ++ le32_to_cpu(raw_super->cp_payload), ++ blocks_per_seg - F2FS_CP_PACKS); ++ return 1; ++ } ++ + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || +@@ -1875,13 +1939,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + return 1; + } + +- if (le32_to_cpu(raw_super->segment_count) > 
F2FS_MAX_SEGMENT) { +- f2fs_msg(sb, KERN_INFO, +- "Invalid segment count (%u)", +- le32_to_cpu(raw_super->segment_count)); +- return 1; +- } +- + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sbi, bh)) + return 1; +@@ -1899,6 +1956,9 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) + unsigned int sit_segs, nat_segs; + unsigned int sit_bitmap_size, nat_bitmap_size; + unsigned int log_blocks_per_seg; ++ unsigned int segment_count_main; ++ unsigned int cp_pack_start_sum, cp_payload; ++ block_t user_block_count; + int i; + + total = le32_to_cpu(raw_super->segment_count); +@@ -1923,6 +1983,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) + return 1; + } + ++ user_block_count = le64_to_cpu(ckpt->user_block_count); ++ segment_count_main = le32_to_cpu(raw_super->segment_count_main); ++ log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); ++ if (!user_block_count || user_block_count >= ++ segment_count_main << log_blocks_per_seg) { ++ f2fs_msg(sbi->sb, KERN_ERR, ++ "Wrong user_block_count: %u", user_block_count); ++ return 1; ++ } ++ + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + +@@ -1939,7 +2009,6 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) + + sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); +- log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || + nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { +@@ -1949,6 +2018,17 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) + return 1; + } + ++ cp_pack_start_sum = __start_sum_addr(sbi); ++ cp_payload = __cp_payload(sbi); ++ if (cp_pack_start_sum < cp_payload + 1 || ++ cp_pack_start_sum > blocks_per_seg - 1 - ++ NR_CURSEG_TYPE) { ++ f2fs_msg(sbi->sb, KERN_ERR, ++ "Wrong cp_pack_start_sum: %u", ++ cp_pack_start_sum); ++ return 1; ++ } ++ + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); + return 1; +diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c +index 6249c92671de..ea66f04f46f7 100644 +--- a/fs/xfs/libxfs/xfs_attr.c ++++ b/fs/xfs/libxfs/xfs_attr.c +@@ -501,7 +501,14 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) + if (args->flags & ATTR_CREATE) + return retval; + retval = xfs_attr_shortform_remove(args); +- ASSERT(retval == 0); ++ if (retval) ++ return retval; ++ /* ++ * Since we have removed the old attr, clear ATTR_REPLACE so ++ * that the leaf format add routine won't trip over the attr ++ * not being around. 
++ */ ++ args->flags &= ~ATTR_REPLACE; + } + + if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h +index a3333004fd2b..8458cc5fbce5 100644 +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -113,6 +113,7 @@ struct bpf_insn_aux_data { + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + }; + int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ ++ int sanitize_stack_off; /* stack slot to be cleared */ + bool seen; /* this insn was processed by the verifier */ + }; + +diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h +index e931da8424a4..6728c2ee0205 100644 +--- a/include/linux/ceph/auth.h ++++ b/include/linux/ceph/auth.h +@@ -64,6 +64,10 @@ struct ceph_auth_client_ops { + /* ensure that an existing authorizer is up to date */ + int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth); ++ int (*add_authorizer_challenge)(struct ceph_auth_client *ac, ++ struct ceph_authorizer *a, ++ void *challenge_buf, ++ int challenge_buf_len); + int (*verify_authorizer_reply)(struct ceph_auth_client *ac, + struct ceph_authorizer *a); + void (*invalidate_authorizer)(struct ceph_auth_client *ac, +@@ -118,6 +122,10 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); + extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *a); ++int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, ++ struct ceph_authorizer *a, ++ void *challenge_buf, ++ int challenge_buf_len); + extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a); + extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, +diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h +index 59042d5ac520..70f42eef813b 100644 +--- a/include/linux/ceph/ceph_features.h ++++ b/include/linux/ceph/ceph_features.h +@@ -165,9 +165,9 @@ DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap + DEFINE_CEPH_FEATURE(59, 1, FS_BTIME) + DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap + DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap +-DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit* ++DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit* ++DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // *do not share this bit* + +-DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down! 
+ DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal + DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing + +@@ -209,7 +209,8 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin + CEPH_FEATURE_SERVER_JEWEL | \ + CEPH_FEATURE_MON_STATEFUL_SUB | \ + CEPH_FEATURE_CRUSH_TUNABLES5 | \ +- CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) ++ CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \ ++ CEPH_FEATURE_CEPHX_V2) + + #define CEPH_FEATURES_REQUIRED_DEFAULT \ + (CEPH_FEATURE_NOSRCADDR | \ +diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h +index ead9d85f1c11..18fbe910ed55 100644 +--- a/include/linux/ceph/messenger.h ++++ b/include/linux/ceph/messenger.h +@@ -31,6 +31,9 @@ struct ceph_connection_operations { + struct ceph_auth_handshake *(*get_authorizer) ( + struct ceph_connection *con, + int *proto, int force_new); ++ int (*add_authorizer_challenge)(struct ceph_connection *con, ++ void *challenge_buf, ++ int challenge_buf_len); + int (*verify_authorizer_reply) (struct ceph_connection *con); + int (*invalidate_authorizer)(struct ceph_connection *con); + +@@ -203,9 +206,8 @@ struct ceph_connection { + attempt for this connection, client */ + u32 peer_global_seq; /* peer's global seq for this connection */ + ++ struct ceph_auth_handshake *auth; + int auth_retry; /* true if we need a newer authorizer */ +- void *auth_reply_buf; /* where to put the authorizer reply */ +- int auth_reply_buf_len; + + struct mutex mutex; + +diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h +index 73ae2a926548..9e50aede46c8 100644 +--- a/include/linux/ceph/msgr.h ++++ b/include/linux/ceph/msgr.h +@@ -91,7 +91,7 @@ struct ceph_entity_inst { + #define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ + #define CEPH_MSGR_TAG_KEEPALIVE2 14 /* keepalive2 byte + ceph_timespec */ + #define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */ +- ++#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* cephx v2 doing server challenge */ + + /* + * connection negotiation +diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h +index 3b7675bcca64..cd0d2270998f 100644 +--- a/include/linux/jump_label.h ++++ b/include/linux/jump_label.h +@@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, + extern int jump_label_text_reserved(void *start, void *end); + extern void static_key_slow_inc(struct static_key *key); + extern void static_key_slow_dec(struct static_key *key); ++extern void static_key_slow_inc_cpuslocked(struct static_key *key); ++extern void static_key_slow_dec_cpuslocked(struct static_key *key); + extern void jump_label_apply_nops(struct module *mod); + extern int static_key_count(struct static_key *key); + extern void static_key_enable(struct static_key *key); +@@ -222,6 +224,9 @@ static inline void static_key_slow_dec(struct static_key *key) + atomic_dec(&key->enabled); + } + ++#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) ++#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) ++ + static inline int jump_label_text_reserved(void *start, void *end) + { + return 0; +@@ -416,6 +421,8 @@ extern bool ____wrong_branch_error(void); + + #define static_branch_inc(x) static_key_slow_inc(&(x)->key) + #define static_branch_dec(x) static_key_slow_dec(&(x)->key) ++#define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) ++#define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) + + 
/* + * Normal usage; boolean enable/disable. +diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h +index 919b2a0b0307..38342e88b3f3 100644 +--- a/include/linux/ptrace.h ++++ b/include/linux/ptrace.h +@@ -62,8 +62,8 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); + #define PTRACE_MODE_READ 0x01 + #define PTRACE_MODE_ATTACH 0x02 + #define PTRACE_MODE_NOAUDIT 0x04 +-#define PTRACE_MODE_FSCREDS 0x08 +-#define PTRACE_MODE_REALCREDS 0x10 ++#define PTRACE_MODE_FSCREDS 0x08 ++#define PTRACE_MODE_REALCREDS 0x10 + + /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ + #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) +diff --git a/include/linux/sched.h b/include/linux/sched.h +index e04919aa8201..866439c361a9 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1405,6 +1405,8 @@ static inline bool is_percpu_thread(void) + #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ + #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ + #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ ++#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ ++#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ + + #define TASK_PFA_TEST(name, func) \ + static inline bool task_##func(struct task_struct *p) \ +@@ -1436,6 +1438,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) + TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) + TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) + ++TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) ++TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) ++TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) ++ ++TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) ++TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) ++ + static inline void + current_restore_flags(unsigned long orig_flags, unsigned long flags) + { +diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h +new file mode 100644 +index 000000000000..59d3736c454c +--- /dev/null ++++ b/include/linux/sched/smt.h +@@ -0,0 +1,20 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_SCHED_SMT_H ++#define _LINUX_SCHED_SMT_H ++ ++#include <linux/static_key.h> ++ ++#ifdef CONFIG_SCHED_SMT ++extern struct static_key_false sched_smt_present; ++ ++static __always_inline bool sched_smt_active(void) ++{ ++ return static_branch_likely(&sched_smt_present); ++} ++#else ++static inline bool sched_smt_active(void) { return false; } ++#endif ++ ++void arch_smt_update(void); ++ ++#endif +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index f64e88444082..f6250555ce7d 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -1288,6 +1288,22 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) + } + } + ++static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) ++{ ++ skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); ++ skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; ++} ++ ++static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) ++{ ++ return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL; ++} ++ ++static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) ++{ ++ return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); ++} ++ + /* Release a reference on a zerocopy structure */ + static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) + { +@@ -1297,7 
+1313,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) + if (uarg->callback == sock_zerocopy_callback) { + uarg->zerocopy = uarg->zerocopy && zerocopy; + sock_zerocopy_put(uarg); +- } else { ++ } else if (!skb_zcopy_is_nouarg(skb)) { + uarg->callback(uarg, zerocopy); + } + +diff --git a/include/net/tls.h b/include/net/tls.h +index 86ed3dd80fe7..604fd982da19 100644 +--- a/include/net/tls.h ++++ b/include/net/tls.h +@@ -89,6 +89,8 @@ struct tls_context { + + void *priv_ctx; + ++ u8 tx_conf:2; ++ + u16 prepend_size; + u16 tag_size; + u16 overhead_size; +@@ -104,7 +106,6 @@ struct tls_context { + + u16 pending_open_record_frags; + int (*push_pending_record)(struct sock *sk, int flags); +- void (*free_resources)(struct sock *sk); + + void (*sk_write_space)(struct sock *sk); + void (*sk_proto_close)(struct sock *sk, long timeout); +@@ -129,6 +130,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); + int tls_sw_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + void tls_sw_close(struct sock *sk, long timeout); ++void tls_sw_free_tx_resources(struct sock *sk); + + void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); + void tls_icsk_clean_acked(struct sock *sk); +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index 7115838fbf2a..38ab0e06259a 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ b/include/uapi/linux/btrfs_tree.h +@@ -734,6 +734,7 @@ struct btrfs_balance_item { + #define BTRFS_FILE_EXTENT_INLINE 0 + #define BTRFS_FILE_EXTENT_REG 1 + #define BTRFS_FILE_EXTENT_PREALLOC 2 ++#define BTRFS_FILE_EXTENT_TYPES 2 + + struct btrfs_file_extent_item { + /* +diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h +index 3027f943f4b3..214102fab940 100644 +--- a/include/uapi/linux/prctl.h ++++ b/include/uapi/linux/prctl.h +@@ -203,6 +203,7 @@ struct prctl_mm_map { + #define PR_SET_SPECULATION_CTRL 53 + /* Speculation control variants */ + # define PR_SPEC_STORE_BYPASS 0 ++# define PR_SPEC_INDIRECT_BRANCH 1 + /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ + # define PR_SPEC_NOT_AFFECTED 0 + # define PR_SPEC_PRCTL (1UL << 0) +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 013b0cd1958e..f6755fd5bae2 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -717,8 +717,9 @@ static bool is_spillable_regtype(enum bpf_reg_type type) + /* check_stack_read/write functions track spill/fill of registers, + * stack boundary and alignment are checked in check_mem_access() + */ +-static int check_stack_write(struct bpf_verifier_state *state, int off, +- int size, int value_regno) ++static int check_stack_write(struct bpf_verifier_env *env, ++ struct bpf_verifier_state *state, int off, ++ int size, int value_regno, int insn_idx) + { + int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, +@@ -738,8 +739,32 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, + state->spilled_regs[spi] = state->regs[value_regno]; + state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; + +- for (i = 0; i < BPF_REG_SIZE; i++) ++ for (i = 0; i < BPF_REG_SIZE; i++) { ++ if (state->stack_slot_type[MAX_BPF_STACK + off + i] == STACK_MISC && ++ !env->allow_ptr_leaks) { ++ int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; ++ int soff = (-spi - 1) * BPF_REG_SIZE; ++ ++ /* detected reuse of integer stack slot with a pointer ++ * which means either llvm is 
reusing stack slot or ++ * an attacker is trying to exploit CVE-2018-3639 ++ * (speculative store bypass) ++ * Have to sanitize that slot with preemptive ++ * store of zero. ++ */ ++ if (*poff && *poff != soff) { ++ /* disallow programs where single insn stores ++ * into two different stack slots, since verifier ++ * cannot sanitize them ++ */ ++ verbose("insn %d cannot access two stack slots fp%d and fp%d", ++ insn_idx, *poff, soff); ++ return -EINVAL; ++ } ++ *poff = soff; ++ } + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; ++ } + } else { + /* regular write of data into stack */ + state->spilled_regs[spi] = (struct bpf_reg_state) {}; +@@ -1216,7 +1241,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn + verbose("attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } +- err = check_stack_write(state, off, size, value_regno); ++ err = check_stack_write(env, state, off, size, ++ value_regno, insn_idx); + } else { + err = check_stack_read(state, off, size, value_regno); + } +@@ -4270,6 +4296,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) + else + continue; + ++ if (type == BPF_WRITE && ++ env->insn_aux_data[i + delta].sanitize_stack_off) { ++ struct bpf_insn patch[] = { ++ /* Sanitize suspicious stack slot with zero. ++ * There are no memory dependencies for this store, ++ * since it's only using frame pointer and immediate ++ * constant of zero ++ */ ++ BPF_ST_MEM(BPF_DW, BPF_REG_FP, ++ env->insn_aux_data[i + delta].sanitize_stack_off, ++ 0), ++ /* the original STX instruction will immediately ++ * overwrite the same stack slot with appropriate value ++ */ ++ *insn, ++ }; ++ ++ cnt = ARRAY_SIZE(patch); ++ new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); ++ if (!new_prog) ++ return -ENOMEM; ++ ++ delta += cnt - 1; ++ env->prog = new_prog; ++ insn = new_prog->insnsi + i + delta; ++ continue; ++ } ++ + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) + continue; + +diff --git a/kernel/cpu.c b/kernel/cpu.c +index f3f389e33343..5c907d96e3dd 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -10,6 +10,7 @@ + #include <linux/sched/signal.h> + #include <linux/sched/hotplug.h> + #include <linux/sched/task.h> ++#include <linux/sched/smt.h> + #include <linux/unistd.h> + #include <linux/cpu.h> + #include <linux/oom.h> +@@ -347,6 +348,12 @@ void cpu_hotplug_enable(void) + EXPORT_SYMBOL_GPL(cpu_hotplug_enable); + #endif /* CONFIG_HOTPLUG_CPU */ + ++/* ++ * Architectures that need SMT-specific errata handling during SMT hotplug ++ * should override this. ++ */ ++void __weak arch_smt_update(void) { } ++ + #ifdef CONFIG_HOTPLUG_SMT + enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; + EXPORT_SYMBOL_GPL(cpu_smt_control); +@@ -998,6 +1005,7 @@ out: + * concurrent CPU hotplug via cpu_add_remove_lock. 
+ */ + lockup_detector_cleanup(); ++ arch_smt_update(); + return ret; + } + +@@ -1126,6 +1134,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) + ret = cpuhp_up_callbacks(cpu, st, target); + out: + cpus_write_unlock(); ++ arch_smt_update(); + return ret; + } + +@@ -2071,8 +2080,10 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) + */ + cpuhp_offline_cpu_device(cpu); + } +- if (!ret) ++ if (!ret) { + cpu_smt_control = ctrlval; ++ arch_smt_update(); ++ } + cpu_maps_update_done(); + return ret; + } +@@ -2083,6 +2094,7 @@ static int cpuhp_smt_enable(void) + + cpu_maps_update_begin(); + cpu_smt_control = CPU_SMT_ENABLED; ++ arch_smt_update(); + for_each_present_cpu(cpu) { + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) +diff --git a/kernel/jump_label.c b/kernel/jump_label.c +index 7c3774ac1d51..70be35a19be2 100644 +--- a/kernel/jump_label.c ++++ b/kernel/jump_label.c +@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key) + } + EXPORT_SYMBOL_GPL(static_key_count); + +-static void static_key_slow_inc_cpuslocked(struct static_key *key) ++void static_key_slow_inc_cpuslocked(struct static_key *key) + { + int v, v1; + +@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key) + } + EXPORT_SYMBOL_GPL(static_key_disable); + +-static void static_key_slow_dec_cpuslocked(struct static_key *key, ++static void __static_key_slow_dec_cpuslocked(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) + { +@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key, + struct delayed_work *work) + { + cpus_read_lock(); +- static_key_slow_dec_cpuslocked(key, rate_limit, work); ++ __static_key_slow_dec_cpuslocked(key, rate_limit, work); + cpus_read_unlock(); + } + +@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key) + } + EXPORT_SYMBOL_GPL(static_key_slow_dec); + ++void static_key_slow_dec_cpuslocked(struct static_key *key) ++{ ++ STATIC_KEY_CHECK_USE(); ++ __static_key_slow_dec_cpuslocked(key, 0, NULL); ++} ++ + void static_key_slow_dec_deferred(struct static_key_deferred *key) + { + STATIC_KEY_CHECK_USE(); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3bc664662081..0552ddbb25e2 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5617,15 +5617,10 @@ int sched_cpu_activate(unsigned int cpu) + + #ifdef CONFIG_SCHED_SMT + /* +- * The sched_smt_present static key needs to be evaluated on every +- * hotplug event because at boot time SMT might be disabled when +- * the number of booted CPUs is limited. +- * +- * If then later a sibling gets hotplugged, then the key would stay +- * off and SMT scheduling would never be functional. ++ * When going up, increment the number of cores with SMT present. + */ +- if (cpumask_weight(cpu_smt_mask(cpu)) > 1) +- static_branch_enable_cpuslocked(&sched_smt_present); ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); + #endif + set_cpu_active(cpu, true); + +@@ -5669,6 +5664,14 @@ int sched_cpu_deactivate(unsigned int cpu) + */ + synchronize_rcu_mult(call_rcu, call_rcu_sched); + ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ + if (!sched_smp_initialized) + return 0; + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2d4d79420e36..7240bb4a4090 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -4040,12 +4040,12 @@ static inline bool cfs_bandwidth_used(void) + + void cfs_bandwidth_usage_inc(void) + { +- static_key_slow_inc(&__cfs_bandwidth_used); ++ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); + } + + void cfs_bandwidth_usage_dec(void) + { +- static_key_slow_dec(&__cfs_bandwidth_used); ++ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); + } + #else /* HAVE_JUMP_LABEL */ + static bool cfs_bandwidth_used(void) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 63d999dfec80..b3ba6e5e99f2 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -20,6 +20,7 @@ + #include <linux/sched/task_stack.h> + #include <linux/sched/cputime.h> + #include <linux/sched/init.h> ++#include <linux/sched/smt.h> + + #include <linux/u64_stats_sync.h> + #include <linux/kernel_stat.h> +@@ -825,9 +826,6 @@ static inline int cpu_of(struct rq *rq) + + + #ifdef CONFIG_SCHED_SMT +- +-extern struct static_key_false sched_smt_present; +- + extern void __update_idle_core(struct rq *rq); + + static inline void update_idle_core(struct rq *rq) +diff --git a/lib/test_kmod.c b/lib/test_kmod.c +index 96c304fd656a..7abb59ce6613 100644 +--- a/lib/test_kmod.c ++++ b/lib/test_kmod.c +@@ -1221,7 +1221,6 @@ void unregister_test_dev_kmod(struct kmod_test_device *test_dev) + + dev_info(test_dev->dev, "removing interface\n"); + misc_deregister(&test_dev->misc_dev); +- kfree(&test_dev->misc_dev.name); + + mutex_unlock(&test_dev->config_mutex); + mutex_unlock(&test_dev->trigger_mutex); +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index adacfe66cf3d..930f2aa3bb4d 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2280,7 +2280,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, + } + } + +-static void freeze_page(struct page *page) ++static void unmap_page(struct page *page) + { + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; +@@ -2295,7 +2295,7 @@ static void freeze_page(struct page *page) + VM_BUG_ON_PAGE(!unmap_success, page); + } + +-static void unfreeze_page(struct page *page) ++static void remap_page(struct page *page) + { + int i; + if (PageTransHuge(page)) { +@@ -2312,26 +2312,13 @@ static void __split_huge_page_tail(struct page *head, int tail, + struct page *page_tail = head + tail; + + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); +- VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); + + /* +- * tail_page->_refcount is zero and not changing from under us. But +- * get_page_unless_zero() may be running from under us on the +- * tail_page. If we used atomic_set() below instead of atomic_inc() or +- * atomic_add(), we would then run atomic_set() concurrently with +- * get_page_unless_zero(), and atomic_set() is implemented in C not +- * using locked ops. spin_unlock on x86 sometime uses locked ops +- * because of PPro errata 66, 92, so unless somebody can guarantee +- * atomic_set() here would be safe on all archs (and not only on x86), +- * it's safer to use atomic_inc()/atomic_add(). ++ * Clone page flags before unfreezing refcount. ++ * ++ * After successful get_page_unless_zero() might follow flags change, ++ * for exmaple lock_page() which set PG_waiters. 
+ */ +- if (PageAnon(head) && !PageSwapCache(head)) { +- page_ref_inc(page_tail); +- } else { +- /* Additional pin to radix tree */ +- page_ref_add(page_tail, 2); +- } +- + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (head->flags & + ((1L << PG_referenced) | +@@ -2344,36 +2331,42 @@ static void __split_huge_page_tail(struct page *head, int tail, + (1L << PG_unevictable) | + (1L << PG_dirty))); + +- /* +- * After clearing PageTail the gup refcount can be released. +- * Page flags also must be visible before we make the page non-compound. +- */ ++ /* ->mapping in first tail page is compound_mapcount */ ++ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, ++ page_tail); ++ page_tail->mapping = head->mapping; ++ page_tail->index = head->index + tail; ++ ++ /* Page flags must be visible before we make the page non-compound. */ + smp_wmb(); + ++ /* ++ * Clear PageTail before unfreezing page refcount. ++ * ++ * After successful get_page_unless_zero() might follow put_page() ++ * which needs correct compound_head(). ++ */ + clear_compound_head(page_tail); + ++ /* Finally unfreeze refcount. Additional reference from page cache. */ ++ page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || ++ PageSwapCache(head))); ++ + if (page_is_young(head)) + set_page_young(page_tail); + if (page_is_idle(head)) + set_page_idle(page_tail); + +- /* ->mapping in first tail page is compound_mapcount */ +- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +- page_tail); +- page_tail->mapping = head->mapping; +- +- page_tail->index = head->index + tail; + page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + lru_add_page_tail(head, page_tail, lruvec, list); + } + + static void __split_huge_page(struct page *page, struct list_head *list, +- unsigned long flags) ++ pgoff_t end, unsigned long flags) + { + struct page *head = compound_head(page); + struct zone *zone = page_zone(head); + struct lruvec *lruvec; +- pgoff_t end = -1; + int i; + + lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); +@@ -2381,9 +2374,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(head); + +- if (!PageAnon(page)) +- end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); +- + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { + __split_huge_page_tail(head, i, lruvec, list); + /* Some pages can be beyond i_size: drop them from page cache */ +@@ -2412,7 +2402,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, + + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + +- unfreeze_page(head); ++ remap_page(head); + + for (i = 0; i < HPAGE_PMD_NR; i++) { + struct page *subpage = head + i; +@@ -2555,6 +2545,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + int count, mapcount, extra_pins, ret; + bool mlocked; + unsigned long flags; ++ pgoff_t end; + + VM_BUG_ON_PAGE(is_huge_zero_page(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); +@@ -2577,6 +2568,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + ret = -EBUSY; + goto out; + } ++ end = -1; + mapping = NULL; + anon_vma_lock_write(anon_vma); + } else { +@@ -2590,10 +2582,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + + anon_vma = NULL; + i_mmap_lock_read(mapping); ++ ++ /* ++ *__split_huge_page() may need to trim off pages beyond EOF: ++ * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, ++ * 
which cannot be nested inside the page tree lock. So note ++ * end now: i_size itself may be changed at any moment, but ++ * head page lock is good enough to serialize the trimming. ++ */ ++ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + } + + /* +- * Racy check if we can split the page, before freeze_page() will ++ * Racy check if we can split the page, before unmap_page() will + * split PMDs + */ + if (!can_split_huge_page(head, &extra_pins)) { +@@ -2602,7 +2603,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + } + + mlocked = PageMlocked(page); +- freeze_page(head); ++ unmap_page(head); + VM_BUG_ON_PAGE(compound_mapcount(head), head); + + /* Make sure the page is not on per-CPU pagevec as it takes pin */ +@@ -2639,7 +2640,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + if (mapping) + __dec_node_page_state(page, NR_SHMEM_THPS); + spin_unlock(&pgdata->split_queue_lock); +- __split_huge_page(page, list, flags); ++ __split_huge_page(page, list, end, flags); + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + +@@ -2659,7 +2660,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + fail: if (mapping) + spin_unlock(&mapping->tree_lock); + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); +- unfreeze_page(head); ++ remap_page(head); + ret = -EBUSY; + } + +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 0a5bb3e8a8a3..d27a73737f1a 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1288,7 +1288,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) + * collapse_shmem - collapse small tmpfs/shmem pages into huge one. + * + * Basic scheme is simple, details are more complex: +- * - allocate and freeze a new huge page; ++ * - allocate and lock a new huge page; + * - scan over radix tree replacing old pages the new one + * + swap in pages if necessary; + * + fill in gaps; +@@ -1296,11 +1296,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) + * - if replacing succeed: + * + copy data over; + * + free old pages; +- * + unfreeze huge page; ++ * + unlock huge page; + * - if replacing failed; + * + put all pages back and unfreeze them; + * + restore gaps in the radix-tree; +- * + free huge page; ++ * + unlock and free huge page; + */ + static void collapse_shmem(struct mm_struct *mm, + struct address_space *mapping, pgoff_t start, +@@ -1333,18 +1333,15 @@ static void collapse_shmem(struct mm_struct *mm, + goto out; + } + ++ __SetPageLocked(new_page); ++ __SetPageSwapBacked(new_page); + new_page->index = start; + new_page->mapping = mapping; +- __SetPageSwapBacked(new_page); +- __SetPageLocked(new_page); +- BUG_ON(!page_ref_freeze(new_page, 1)); +- + + /* +- * At this point the new_page is 'frozen' (page_count() is zero), locked +- * and not up-to-date. It's safe to insert it into radix tree, because +- * nobody would be able to map it or use it in other way until we +- * unfreeze it. ++ * At this point the new_page is locked and not up-to-date. ++ * It's safe to insert it into the page cache, because nobody would ++ * be able to map it or use it in another way until we unlock it. 
+ */ + + index = start; +@@ -1352,19 +1349,29 @@ static void collapse_shmem(struct mm_struct *mm, + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + int n = min(iter.index, end) - index; + ++ /* ++ * Stop if extent has been hole-punched, and is now completely ++ * empty (the more obvious i_size_read() check would take an ++ * irq-unsafe seqlock on 32-bit). ++ */ ++ if (n >= HPAGE_PMD_NR) { ++ result = SCAN_TRUNCATED; ++ goto tree_locked; ++ } ++ + /* + * Handle holes in the radix tree: charge it from shmem and + * insert relevant subpage of new_page into the radix-tree. + */ + if (n && !shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; +- break; ++ goto tree_locked; + } +- nr_none += n; + for (; index < min(iter.index, end); index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); + } ++ nr_none += n; + + /* We are done. */ + if (index >= end) +@@ -1380,12 +1387,12 @@ static void collapse_shmem(struct mm_struct *mm, + result = SCAN_FAIL; + goto tree_unlocked; + } +- spin_lock_irq(&mapping->tree_lock); + } else if (trylock_page(page)) { + get_page(page); ++ spin_unlock_irq(&mapping->tree_lock); + } else { + result = SCAN_PAGE_LOCK; +- break; ++ goto tree_locked; + } + + /* +@@ -1394,17 +1401,24 @@ static void collapse_shmem(struct mm_struct *mm, + */ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageUptodate(page), page); +- VM_BUG_ON_PAGE(PageTransCompound(page), page); ++ ++ /* ++ * If file was truncated then extended, or hole-punched, before ++ * we locked the first page, then a THP might be there already. ++ */ ++ if (PageTransCompound(page)) { ++ result = SCAN_PAGE_COMPOUND; ++ goto out_unlock; ++ } + + if (page_mapping(page) != mapping) { + result = SCAN_TRUNCATED; + goto out_unlock; + } +- spin_unlock_irq(&mapping->tree_lock); + + if (isolate_lru_page(page)) { + result = SCAN_DEL_PAGE_LRU; +- goto out_isolate_failed; ++ goto out_unlock; + } + + if (page_mapped(page)) +@@ -1426,7 +1440,9 @@ static void collapse_shmem(struct mm_struct *mm, + */ + if (!page_ref_freeze(page, 3)) { + result = SCAN_PAGE_COUNT; +- goto out_lru; ++ spin_unlock_irq(&mapping->tree_lock); ++ putback_lru_page(page); ++ goto out_unlock; + } + + /* +@@ -1442,17 +1458,10 @@ static void collapse_shmem(struct mm_struct *mm, + slot = radix_tree_iter_resume(slot, &iter); + index++; + continue; +-out_lru: +- spin_unlock_irq(&mapping->tree_lock); +- putback_lru_page(page); +-out_isolate_failed: +- unlock_page(page); +- put_page(page); +- goto tree_unlocked; + out_unlock: + unlock_page(page); + put_page(page); +- break; ++ goto tree_unlocked; + } + + /* +@@ -1460,14 +1469,18 @@ out_unlock: + * This code only triggers if there's nothing in radix tree + * beyond 'end'. 
+ */ +- if (result == SCAN_SUCCEED && index < end) { ++ if (index < end) { + int n = end - index; + ++ /* Stop if extent has been truncated, and is now empty */ ++ if (n >= HPAGE_PMD_NR) { ++ result = SCAN_TRUNCATED; ++ goto tree_locked; ++ } + if (!shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; + goto tree_locked; + } +- + for (; index < end; index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); +@@ -1475,57 +1488,62 @@ out_unlock: + nr_none += n; + } + ++ __inc_node_page_state(new_page, NR_SHMEM_THPS); ++ if (nr_none) { ++ struct zone *zone = page_zone(new_page); ++ ++ __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); ++ __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); ++ } ++ + tree_locked: + spin_unlock_irq(&mapping->tree_lock); + tree_unlocked: + + if (result == SCAN_SUCCEED) { +- unsigned long flags; +- struct zone *zone = page_zone(new_page); +- + /* + * Replacing old pages with new one has succeed, now we need to + * copy the content and free old pages. + */ ++ index = start; + list_for_each_entry_safe(page, tmp, &pagelist, lru) { ++ while (index < page->index) { ++ clear_highpage(new_page + (index % HPAGE_PMD_NR)); ++ index++; ++ } + copy_highpage(new_page + (page->index % HPAGE_PMD_NR), + page); + list_del(&page->lru); +- unlock_page(page); +- page_ref_unfreeze(page, 1); + page->mapping = NULL; ++ page_ref_unfreeze(page, 1); + ClearPageActive(page); + ClearPageUnevictable(page); ++ unlock_page(page); + put_page(page); ++ index++; + } +- +- local_irq_save(flags); +- __inc_node_page_state(new_page, NR_SHMEM_THPS); +- if (nr_none) { +- __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); +- __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); ++ while (index < end) { ++ clear_highpage(new_page + (index % HPAGE_PMD_NR)); ++ index++; + } +- local_irq_restore(flags); + +- /* +- * Remove pte page tables, so we can re-faulti +- * the page as huge. +- */ +- retract_page_tables(mapping, start); +- +- /* Everything is ready, let's unfreeze the new_page */ +- set_page_dirty(new_page); + SetPageUptodate(new_page); +- page_ref_unfreeze(new_page, HPAGE_PMD_NR); ++ page_ref_add(new_page, HPAGE_PMD_NR - 1); ++ set_page_dirty(new_page); + mem_cgroup_commit_charge(new_page, memcg, false, true); + lru_cache_add_anon(new_page); +- unlock_page(new_page); + ++ /* ++ * Remove pte page tables, so we can re-fault the page as huge. 
++ */ ++ retract_page_tables(mapping, start); + *hpage = NULL; + } else { + /* Something went wrong: rollback changes to the radix-tree */ +- shmem_uncharge(mapping->host, nr_none); + spin_lock_irq(&mapping->tree_lock); ++ mapping->nrpages -= nr_none; ++ shmem_uncharge(mapping->host, nr_none); ++ + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, + start) { + if (iter.index >= end) +@@ -1551,19 +1569,18 @@ tree_unlocked: + slot, page); + slot = radix_tree_iter_resume(slot, &iter); + spin_unlock_irq(&mapping->tree_lock); +- putback_lru_page(page); + unlock_page(page); ++ putback_lru_page(page); + spin_lock_irq(&mapping->tree_lock); + } + VM_BUG_ON(nr_none); + spin_unlock_irq(&mapping->tree_lock); + +- /* Unfreeze new_page, caller would take care about freeing it */ +- page_ref_unfreeze(new_page, 1); + mem_cgroup_cancel_charge(new_page, memcg, true); +- unlock_page(new_page); + new_page->mapping = NULL; + } ++ ++ unlock_page(new_page); + out: + VM_BUG_ON(!list_empty(&pagelist)); + /* TODO: tracepoints */ +diff --git a/mm/shmem.c b/mm/shmem.c +index fa08f56fd5e5..ab7ff0aeae2d 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -296,12 +296,14 @@ bool shmem_charge(struct inode *inode, long pages) + if (!shmem_inode_acct_block(inode, pages)) + return false; + ++ /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ ++ inode->i_mapping->nrpages += pages; ++ + spin_lock_irqsave(&info->lock, flags); + info->alloced += pages; + inode->i_blocks += pages * BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); +- inode->i_mapping->nrpages += pages; + + return true; + } +@@ -311,6 +313,8 @@ void shmem_uncharge(struct inode *inode, long pages) + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long flags; + ++ /* nrpages adjustment done by __delete_from_page_cache() or caller */ ++ + spin_lock_irqsave(&info->lock, flags); + info->alloced -= pages; + inode->i_blocks -= pages * BLOCKS_PER_PAGE; +@@ -1528,11 +1532,13 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, + { + struct page *oldpage, *newpage; + struct address_space *swap_mapping; ++ swp_entry_t entry; + pgoff_t swap_index; + int error; + + oldpage = *pagep; +- swap_index = page_private(oldpage); ++ entry.val = page_private(oldpage); ++ swap_index = swp_offset(entry); + swap_mapping = page_mapping(oldpage); + + /* +@@ -1551,7 +1557,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, + __SetPageLocked(newpage); + __SetPageSwapBacked(newpage); + SetPageUptodate(newpage); +- set_page_private(newpage, swap_index); ++ set_page_private(newpage, entry.val); + SetPageSwapCache(newpage); + + /* +diff --git a/net/ceph/auth.c b/net/ceph/auth.c +index dbde2b3c3c15..fbeee068ea14 100644 +--- a/net/ceph/auth.c ++++ b/net/ceph/auth.c +@@ -315,6 +315,22 @@ int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + } + EXPORT_SYMBOL(ceph_auth_update_authorizer); + ++int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, ++ struct ceph_authorizer *a, ++ void *challenge_buf, ++ int challenge_buf_len) ++{ ++ int ret = 0; ++ ++ mutex_lock(&ac->mutex); ++ if (ac->ops && ac->ops->add_authorizer_challenge) ++ ret = ac->ops->add_authorizer_challenge(ac, a, challenge_buf, ++ challenge_buf_len); ++ mutex_unlock(&ac->mutex); ++ return ret; ++} ++EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge); ++ + int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a) + { +diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c +index 
2f4a1baf5f52..2bf9d9f7ddf3 100644 +--- a/net/ceph/auth_x.c ++++ b/net/ceph/auth_x.c +@@ -9,6 +9,7 @@ + + #include <linux/ceph/decode.h> + #include <linux/ceph/auth.h> ++#include <linux/ceph/ceph_features.h> + #include <linux/ceph/libceph.h> + #include <linux/ceph/messenger.h> + +@@ -70,25 +71,40 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf, + return sizeof(u32) + ciphertext_len; + } + ++static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p, ++ int ciphertext_len) ++{ ++ struct ceph_x_encrypt_header *hdr = p; ++ int plaintext_len; ++ int ret; ++ ++ ret = ceph_crypt(secret, false, p, ciphertext_len, ciphertext_len, ++ &plaintext_len); ++ if (ret) ++ return ret; ++ ++ if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { ++ pr_err("%s bad magic\n", __func__); ++ return -EINVAL; ++ } ++ ++ return plaintext_len - sizeof(*hdr); ++} ++ + static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end) + { +- struct ceph_x_encrypt_header *hdr = *p + sizeof(u32); +- int ciphertext_len, plaintext_len; ++ int ciphertext_len; + int ret; + + ceph_decode_32_safe(p, end, ciphertext_len, e_inval); + ceph_decode_need(p, end, ciphertext_len, e_inval); + +- ret = ceph_crypt(secret, false, *p, end - *p, ciphertext_len, +- &plaintext_len); +- if (ret) ++ ret = __ceph_x_decrypt(secret, *p, ciphertext_len); ++ if (ret < 0) + return ret; + +- if (hdr->struct_v != 1 || le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) +- return -EPERM; +- + *p += ciphertext_len; +- return plaintext_len - sizeof(struct ceph_x_encrypt_header); ++ return ret; + + e_inval: + return -EINVAL; +@@ -275,6 +291,51 @@ bad: + return -EINVAL; + } + ++/* ++ * Encode and encrypt the second part (ceph_x_authorize_b) of the ++ * authorizer. The first part (ceph_x_authorize_a) should already be ++ * encoded. ++ */ ++static int encrypt_authorizer(struct ceph_x_authorizer *au, ++ u64 *server_challenge) ++{ ++ struct ceph_x_authorize_a *msg_a; ++ struct ceph_x_authorize_b *msg_b; ++ void *p, *end; ++ int ret; ++ ++ msg_a = au->buf->vec.iov_base; ++ WARN_ON(msg_a->ticket_blob.secret_id != cpu_to_le64(au->secret_id)); ++ p = (void *)(msg_a + 1) + le32_to_cpu(msg_a->ticket_blob.blob_len); ++ end = au->buf->vec.iov_base + au->buf->vec.iov_len; ++ ++ msg_b = p + ceph_x_encrypt_offset(); ++ msg_b->struct_v = 2; ++ msg_b->nonce = cpu_to_le64(au->nonce); ++ if (server_challenge) { ++ msg_b->have_challenge = 1; ++ msg_b->server_challenge_plus_one = ++ cpu_to_le64(*server_challenge + 1); ++ } else { ++ msg_b->have_challenge = 0; ++ msg_b->server_challenge_plus_one = 0; ++ } ++ ++ ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b)); ++ if (ret < 0) ++ return ret; ++ ++ p += ret; ++ if (server_challenge) { ++ WARN_ON(p != end); ++ } else { ++ WARN_ON(p > end); ++ au->buf->vec.iov_len = p - au->buf->vec.iov_base; ++ } ++ ++ return 0; ++} ++ + static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au) + { + ceph_crypto_key_destroy(&au->session_key); +@@ -291,7 +352,6 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, + int maxlen; + struct ceph_x_authorize_a *msg_a; + struct ceph_x_authorize_b *msg_b; +- void *p, *end; + int ret; + int ticket_blob_len = + (th->ticket_blob ? 
th->ticket_blob->vec.iov_len : 0); +@@ -335,21 +395,13 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, + dout(" th %p secret_id %lld %lld\n", th, th->secret_id, + le64_to_cpu(msg_a->ticket_blob.secret_id)); + +- p = msg_a + 1; +- p += ticket_blob_len; +- end = au->buf->vec.iov_base + au->buf->vec.iov_len; +- +- msg_b = p + ceph_x_encrypt_offset(); +- msg_b->struct_v = 1; + get_random_bytes(&au->nonce, sizeof(au->nonce)); +- msg_b->nonce = cpu_to_le64(au->nonce); +- ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b)); +- if (ret < 0) ++ ret = encrypt_authorizer(au, NULL); ++ if (ret) { ++ pr_err("failed to encrypt authorizer: %d", ret); + goto out_au; ++ } + +- p += ret; +- WARN_ON(p > end); +- au->buf->vec.iov_len = p - au->buf->vec.iov_base; + dout(" built authorizer nonce %llx len %d\n", au->nonce, + (int)au->buf->vec.iov_len); + return 0; +@@ -626,6 +678,54 @@ static int ceph_x_update_authorizer( + return 0; + } + ++static int decrypt_authorize_challenge(struct ceph_x_authorizer *au, ++ void *challenge_buf, ++ int challenge_buf_len, ++ u64 *server_challenge) ++{ ++ struct ceph_x_authorize_challenge *ch = ++ challenge_buf + sizeof(struct ceph_x_encrypt_header); ++ int ret; ++ ++ /* no leading len */ ++ ret = __ceph_x_decrypt(&au->session_key, challenge_buf, ++ challenge_buf_len); ++ if (ret < 0) ++ return ret; ++ if (ret < sizeof(*ch)) { ++ pr_err("bad size %d for ceph_x_authorize_challenge\n", ret); ++ return -EINVAL; ++ } ++ ++ *server_challenge = le64_to_cpu(ch->server_challenge); ++ return 0; ++} ++ ++static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, ++ struct ceph_authorizer *a, ++ void *challenge_buf, ++ int challenge_buf_len) ++{ ++ struct ceph_x_authorizer *au = (void *)a; ++ u64 server_challenge; ++ int ret; ++ ++ ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len, ++ &server_challenge); ++ if (ret) { ++ pr_err("failed to decrypt authorize challenge: %d", ret); ++ return ret; ++ } ++ ++ ret = encrypt_authorizer(au, &server_challenge); ++ if (ret) { ++ pr_err("failed to encrypt authorizer w/ challenge: %d", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a) + { +@@ -637,8 +737,10 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, + ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); + if (ret < 0) + return ret; +- if (ret != sizeof(*reply)) +- return -EPERM; ++ if (ret < sizeof(*reply)) { ++ pr_err("bad size %d for ceph_x_authorize_reply\n", ret); ++ return -EINVAL; ++ } + + if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) + ret = -EPERM; +@@ -704,26 +806,64 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, + __le64 *psig) + { + void *enc_buf = au->enc_buf; +- struct { +- __le32 len; +- __le32 header_crc; +- __le32 front_crc; +- __le32 middle_crc; +- __le32 data_crc; +- } __packed *sigblock = enc_buf + ceph_x_encrypt_offset(); + int ret; + +- sigblock->len = cpu_to_le32(4*sizeof(u32)); +- sigblock->header_crc = msg->hdr.crc; +- sigblock->front_crc = msg->footer.front_crc; +- sigblock->middle_crc = msg->footer.middle_crc; +- sigblock->data_crc = msg->footer.data_crc; +- ret = ceph_x_encrypt(&au->session_key, enc_buf, CEPHX_AU_ENC_BUF_LEN, +- sizeof(*sigblock)); +- if (ret < 0) +- return ret; ++ if (!CEPH_HAVE_FEATURE(msg->con->peer_features, CEPHX_V2)) { ++ struct { ++ __le32 len; ++ __le32 header_crc; ++ __le32 front_crc; ++ 
__le32 middle_crc; ++ __le32 data_crc; ++ } __packed *sigblock = enc_buf + ceph_x_encrypt_offset(); ++ ++ sigblock->len = cpu_to_le32(4*sizeof(u32)); ++ sigblock->header_crc = msg->hdr.crc; ++ sigblock->front_crc = msg->footer.front_crc; ++ sigblock->middle_crc = msg->footer.middle_crc; ++ sigblock->data_crc = msg->footer.data_crc; ++ ++ ret = ceph_x_encrypt(&au->session_key, enc_buf, ++ CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock)); ++ if (ret < 0) ++ return ret; ++ ++ *psig = *(__le64 *)(enc_buf + sizeof(u32)); ++ } else { ++ struct { ++ __le32 header_crc; ++ __le32 front_crc; ++ __le32 front_len; ++ __le32 middle_crc; ++ __le32 middle_len; ++ __le32 data_crc; ++ __le32 data_len; ++ __le32 seq_lower_word; ++ } __packed *sigblock = enc_buf; ++ struct { ++ __le64 a, b, c, d; ++ } __packed *penc = enc_buf; ++ int ciphertext_len; ++ ++ sigblock->header_crc = msg->hdr.crc; ++ sigblock->front_crc = msg->footer.front_crc; ++ sigblock->front_len = msg->hdr.front_len; ++ sigblock->middle_crc = msg->footer.middle_crc; ++ sigblock->middle_len = msg->hdr.middle_len; ++ sigblock->data_crc = msg->footer.data_crc; ++ sigblock->data_len = msg->hdr.data_len; ++ sigblock->seq_lower_word = *(__le32 *)&msg->hdr.seq; ++ ++ /* no leading len, no ceph_x_encrypt_header */ ++ ret = ceph_crypt(&au->session_key, true, enc_buf, ++ CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock), ++ &ciphertext_len); ++ if (ret) ++ return ret; ++ ++ *psig = penc->a ^ penc->b ^ penc->c ^ penc->d; ++ } + +- *psig = *(__le64 *)(enc_buf + sizeof(u32)); + return 0; + } + +@@ -778,6 +918,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = { + .handle_reply = ceph_x_handle_reply, + .create_authorizer = ceph_x_create_authorizer, + .update_authorizer = ceph_x_update_authorizer, ++ .add_authorizer_challenge = ceph_x_add_authorizer_challenge, + .verify_authorizer_reply = ceph_x_verify_authorizer_reply, + .invalidate_authorizer = ceph_x_invalidate_authorizer, + .reset = ceph_x_reset, +diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h +index 32c13d763b9a..24b0b74564d0 100644 +--- a/net/ceph/auth_x_protocol.h ++++ b/net/ceph/auth_x_protocol.h +@@ -70,6 +70,13 @@ struct ceph_x_authorize_a { + struct ceph_x_authorize_b { + __u8 struct_v; + __le64 nonce; ++ __u8 have_challenge; ++ __le64 server_challenge_plus_one; ++} __attribute__ ((packed)); ++ ++struct ceph_x_authorize_challenge { ++ __u8 struct_v; ++ __le64 server_challenge; + } __attribute__ ((packed)); + + struct ceph_x_authorize_reply { +diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c +index 5281da82371a..f864807284d4 100644 +--- a/net/ceph/messenger.c ++++ b/net/ceph/messenger.c +@@ -1411,24 +1411,26 @@ static void prepare_write_keepalive(struct ceph_connection *con) + * Connection negotiation. 
+ */ + +-static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, +- int *auth_proto) ++static int get_connect_authorizer(struct ceph_connection *con) + { + struct ceph_auth_handshake *auth; ++ int auth_proto; + + if (!con->ops->get_authorizer) { ++ con->auth = NULL; + con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->out_connect.authorizer_len = 0; +- return NULL; ++ return 0; + } + +- auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); ++ auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); + if (IS_ERR(auth)) +- return auth; ++ return PTR_ERR(auth); + +- con->auth_reply_buf = auth->authorizer_reply_buf; +- con->auth_reply_buf_len = auth->authorizer_reply_buf_len; +- return auth; ++ con->auth = auth; ++ con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); ++ con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); ++ return 0; + } + + /* +@@ -1444,12 +1446,22 @@ static void prepare_write_banner(struct ceph_connection *con) + con_flag_set(con, CON_FLAG_WRITE_PENDING); + } + ++static void __prepare_write_connect(struct ceph_connection *con) ++{ ++ con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); ++ if (con->auth) ++ con_out_kvec_add(con, con->auth->authorizer_buf_len, ++ con->auth->authorizer_buf); ++ ++ con->out_more = 0; ++ con_flag_set(con, CON_FLAG_WRITE_PENDING); ++} ++ + static int prepare_write_connect(struct ceph_connection *con) + { + unsigned int global_seq = get_global_seq(con->msgr, 0); + int proto; +- int auth_proto; +- struct ceph_auth_handshake *auth; ++ int ret; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: +@@ -1476,24 +1488,11 @@ static int prepare_write_connect(struct ceph_connection *con) + con->out_connect.protocol_version = cpu_to_le32(proto); + con->out_connect.flags = 0; + +- auth_proto = CEPH_AUTH_UNKNOWN; +- auth = get_connect_authorizer(con, &auth_proto); +- if (IS_ERR(auth)) +- return PTR_ERR(auth); +- +- con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); +- con->out_connect.authorizer_len = auth ? 
+- cpu_to_le32(auth->authorizer_buf_len) : 0; +- +- con_out_kvec_add(con, sizeof (con->out_connect), +- &con->out_connect); +- if (auth && auth->authorizer_buf_len) +- con_out_kvec_add(con, auth->authorizer_buf_len, +- auth->authorizer_buf); +- +- con->out_more = 0; +- con_flag_set(con, CON_FLAG_WRITE_PENDING); ++ ret = get_connect_authorizer(con); ++ if (ret) ++ return ret; + ++ __prepare_write_connect(con); + return 0; + } + +@@ -1753,11 +1752,21 @@ static int read_partial_connect(struct ceph_connection *con) + if (ret <= 0) + goto out; + +- size = le32_to_cpu(con->in_reply.authorizer_len); +- end += size; +- ret = read_partial(con, end, size, con->auth_reply_buf); +- if (ret <= 0) +- goto out; ++ if (con->auth) { ++ size = le32_to_cpu(con->in_reply.authorizer_len); ++ if (size > con->auth->authorizer_reply_buf_len) { ++ pr_err("authorizer reply too big: %d > %zu\n", size, ++ con->auth->authorizer_reply_buf_len); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ end += size; ++ ret = read_partial(con, end, size, ++ con->auth->authorizer_reply_buf); ++ if (ret <= 0) ++ goto out; ++ } + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, (int)con->in_reply.tag, +@@ -1765,7 +1774,6 @@ static int read_partial_connect(struct ceph_connection *con) + le32_to_cpu(con->in_reply.global_seq)); + out: + return ret; +- + } + + /* +@@ -2048,12 +2056,27 @@ static int process_connect(struct ceph_connection *con) + + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + +- if (con->auth_reply_buf) { ++ if (con->auth) { + /* + * Any connection that defines ->get_authorizer() +- * should also define ->verify_authorizer_reply(). ++ * should also define ->add_authorizer_challenge() and ++ * ->verify_authorizer_reply(). ++ * + * See get_connect_authorizer(). 
+ */ ++ if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { ++ ret = con->ops->add_authorizer_challenge( ++ con, con->auth->authorizer_reply_buf, ++ le32_to_cpu(con->in_reply.authorizer_len)); ++ if (ret < 0) ++ return ret; ++ ++ con_out_kvec_reset(con); ++ __prepare_write_connect(con); ++ prepare_read_connect(con); ++ return 0; ++ } ++ + ret = con->ops->verify_authorizer_reply(con); + if (ret < 0) { + con->error_msg = "bad authorize reply"; +diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c +index 2814dba5902d..53ea2d48896c 100644 +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -5292,6 +5292,16 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + return auth; + } + ++static int add_authorizer_challenge(struct ceph_connection *con, ++ void *challenge_buf, int challenge_buf_len) ++{ ++ struct ceph_osd *o = con->private; ++ struct ceph_osd_client *osdc = o->o_osdc; ++ struct ceph_auth_client *ac = osdc->client->monc.auth; ++ ++ return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer, ++ challenge_buf, challenge_buf_len); ++} + + static int verify_authorizer_reply(struct ceph_connection *con) + { +@@ -5341,6 +5351,7 @@ static const struct ceph_connection_operations osd_con_ops = { + .put = put_osd_con, + .dispatch = dispatch, + .get_authorizer = get_authorizer, ++ .add_authorizer_challenge = add_authorizer_challenge, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .alloc_msg = alloc_msg, +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index c19a118f9f82..4067fa3fcbb2 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -4882,6 +4882,10 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) + nf_reset(skb); + nf_reset_trace(skb); + ++#ifdef CONFIG_NET_SWITCHDEV ++ skb->offload_fwd_mark = 0; ++#endif ++ + if (!xnet) + return; + +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index 8d1a7c900393..88d5b2645bb0 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -2433,7 +2433,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) + void *ph; + __u32 ts; + +- ph = skb_shinfo(skb)->destructor_arg; ++ ph = skb_zcopy_get_nouarg(skb); + packet_dec_pending(&po->tx_ring); + + ts = __packet_set_timestamp(po, ph, skb); +@@ -2499,7 +2499,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, + skb->priority = po->sk.sk_priority; + skb->mark = po->sk.sk_mark; + sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); +- skb_shinfo(skb)->destructor_arg = ph.raw; ++ skb_zcopy_set_nouarg(skb, ph.raw); + + skb_reserve(skb, hlen); + skb_reset_network_header(skb); +diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c +index 4f2971f528db..e903bdd39b9f 100644 +--- a/net/tls/tls_main.c ++++ b/net/tls/tls_main.c +@@ -46,8 +46,28 @@ MODULE_DESCRIPTION("Transport Layer Security Support"); + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_ALIAS_TCP_ULP("tls"); + +-static struct proto tls_base_prot; +-static struct proto tls_sw_prot; ++enum { ++ TLSV4, ++ TLSV6, ++ TLS_NUM_PROTS, ++}; ++ ++enum { ++ TLS_BASE_TX, ++ TLS_SW_TX, ++ TLS_NUM_CONFIG, ++}; ++ ++static struct proto *saved_tcpv6_prot; ++static DEFINE_MUTEX(tcpv6_prot_mutex); ++static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; ++ ++static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) ++{ ++ int ip_ver = sk->sk_family == AF_INET6 ? 
TLSV6 : TLSV4; ++ ++ sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf]; ++} + + int wait_on_pending_writer(struct sock *sk, long *timeo) + { +@@ -239,6 +259,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) + void (*sk_proto_close)(struct sock *sk, long timeout); + + lock_sock(sk); ++ sk_proto_close = ctx->sk_proto_close; ++ ++ if (ctx->tx_conf == TLS_BASE_TX) { ++ tls_ctx_free(ctx); ++ goto skip_tx_cleanup; ++ } + + if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) + tls_handle_open_record(sk, 0); +@@ -255,13 +281,16 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) + sg++; + } + } +- ctx->free_resources(sk); ++ + kfree(ctx->rec_seq); + kfree(ctx->iv); + +- sk_proto_close = ctx->sk_proto_close; +- tls_ctx_free(ctx); ++ if (ctx->tx_conf == TLS_SW_TX) { ++ tls_sw_free_tx_resources(sk); ++ tls_ctx_free(ctx); ++ } + ++skip_tx_cleanup: + release_sock(sk); + sk_proto_close(sk, timeout); + } +@@ -362,48 +391,43 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, + static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, + unsigned int optlen) + { +- struct tls_crypto_info *crypto_info, tmp_crypto_info; ++ struct tls_crypto_info *crypto_info; + struct tls_context *ctx = tls_get_ctx(sk); +- struct proto *prot = NULL; + int rc = 0; ++ int tx_conf; + + if (!optval || (optlen < sizeof(*crypto_info))) { + rc = -EINVAL; + goto out; + } + +- rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info)); ++ crypto_info = &ctx->crypto_send.info; ++ /* Currently we don't support set crypto info more than one time */ ++ if (TLS_CRYPTO_INFO_READY(crypto_info)) { ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); + if (rc) { + rc = -EFAULT; + goto out; + } + + /* check version */ +- if (tmp_crypto_info.version != TLS_1_2_VERSION) { ++ if (crypto_info->version != TLS_1_2_VERSION) { + rc = -ENOTSUPP; +- goto out; +- } +- +- /* get user crypto info */ +- crypto_info = &ctx->crypto_send.info; +- +- /* Currently we don't support set crypto info more than one time */ +- if (TLS_CRYPTO_INFO_READY(crypto_info)) { +- rc = -EBUSY; +- goto out; ++ goto err_crypto_info; + } + +- switch (tmp_crypto_info.cipher_type) { ++ switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: { + if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { + rc = -EINVAL; + goto err_crypto_info; + } +- rc = copy_from_user( +- crypto_info, +- optval, +- sizeof(struct tls12_crypto_info_aes_gcm_128)); +- ++ rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), ++ optlen - sizeof(*crypto_info)); + if (rc) { + rc = -EFAULT; + goto err_crypto_info; +@@ -415,18 +439,16 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, + goto err_crypto_info; + } + +- ctx->sk_write_space = sk->sk_write_space; +- sk->sk_write_space = tls_write_space; +- +- ctx->sk_proto_close = sk->sk_prot->close; +- + /* currently SW is default, we will have ethtool in future */ + rc = tls_set_sw_offload(sk, ctx); +- prot = &tls_sw_prot; ++ tx_conf = TLS_SW_TX; + if (rc) + goto err_crypto_info; + +- sk->sk_prot = prot; ++ ctx->tx_conf = tx_conf; ++ update_sk_prot(sk, ctx); ++ ctx->sk_write_space = sk->sk_write_space; ++ sk->sk_write_space = tls_write_space; + goto out; + + err_crypto_info: +@@ -464,8 +486,21 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, + return do_tls_setsockopt(sk, optname, optval, optlen); + } + ++static void build_protos(struct proto *prot, struct proto *base) ++{ ++ 
prot[TLS_BASE_TX] = *base; ++ prot[TLS_BASE_TX].setsockopt = tls_setsockopt; ++ prot[TLS_BASE_TX].getsockopt = tls_getsockopt; ++ prot[TLS_BASE_TX].close = tls_sk_proto_close; ++ ++ prot[TLS_SW_TX] = prot[TLS_BASE_TX]; ++ prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; ++ prot[TLS_SW_TX].sendpage = tls_sw_sendpage; ++} ++ + static int tls_init(struct sock *sk) + { ++ int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; + struct inet_connection_sock *icsk = inet_csk(sk); + struct tls_context *ctx; + int rc = 0; +@@ -488,7 +523,21 @@ static int tls_init(struct sock *sk) + icsk->icsk_ulp_data = ctx; + ctx->setsockopt = sk->sk_prot->setsockopt; + ctx->getsockopt = sk->sk_prot->getsockopt; +- sk->sk_prot = &tls_base_prot; ++ ctx->sk_proto_close = sk->sk_prot->close; ++ ++ /* Build IPv6 TLS whenever the address of tcpv6_prot changes */ ++ if (ip_ver == TLSV6 && ++ unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { ++ mutex_lock(&tcpv6_prot_mutex); ++ if (likely(sk->sk_prot != saved_tcpv6_prot)) { ++ build_protos(tls_prots[TLSV6], sk->sk_prot); ++ smp_store_release(&saved_tcpv6_prot, sk->sk_prot); ++ } ++ mutex_unlock(&tcpv6_prot_mutex); ++ } ++ ++ ctx->tx_conf = TLS_BASE_TX; ++ update_sk_prot(sk, ctx); + out: + return rc; + } +@@ -501,14 +550,7 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { + + static int __init tls_register(void) + { +- tls_base_prot = tcp_prot; +- tls_base_prot.setsockopt = tls_setsockopt; +- tls_base_prot.getsockopt = tls_getsockopt; +- +- tls_sw_prot = tls_base_prot; +- tls_sw_prot.sendmsg = tls_sw_sendmsg; +- tls_sw_prot.sendpage = tls_sw_sendpage; +- tls_sw_prot.close = tls_sk_proto_close; ++ build_protos(tls_prots[TLSV4], &tcp_prot); + + tcp_register_ulp(&tcp_tls_ulp_ops); + +diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c +index 6ae9ca567d6c..d18d4a478e4f 100644 +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -388,7 +388,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) + { + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); +- int ret = 0; ++ int ret; + int required_size; + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + bool eor = !(msg->msg_flags & MSG_MORE); +@@ -403,7 +403,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) + + lock_sock(sk); + +- if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo)) ++ ret = tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo); ++ if (ret) + goto send_end; + + if (unlikely(msg->msg_controllen)) { +@@ -539,7 +540,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, + { + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); +- int ret = 0; ++ int ret; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + bool eor; + size_t orig_size = size; +@@ -559,7 +560,8 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, + + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + +- if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo)) ++ ret = tls_complete_pending_work(sk, tls_ctx, flags, &timeo); ++ if (ret) + goto sendpage_end; + + /* Call the sk_stream functions to manage the sndbuf mem. 
*/ +@@ -646,7 +648,7 @@ sendpage_end: + return ret; + } + +-static void tls_sw_free_resources(struct sock *sk) ++void tls_sw_free_tx_resources(struct sock *sk) + { + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); +@@ -685,7 +687,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) + } + + ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; +- ctx->free_resources = tls_sw_free_resources; + + crypto_info = &ctx->crypto_send.info; + switch (crypto_info->cipher_type) { +diff --git a/scripts/Makefile.build b/scripts/Makefile.build +index 7143da06d702..be9e5deb58ba 100644 +--- a/scripts/Makefile.build ++++ b/scripts/Makefile.build +@@ -272,10 +272,8 @@ else + objtool_args += $(call cc-ifversion, -lt, 0405, --no-unreachable) + endif + ifdef CONFIG_RETPOLINE +-ifneq ($(RETPOLINE_CFLAGS),) + objtool_args += --retpoline + endif +-endif + + + ifdef CONFIG_MODVERSIONS +diff --git a/sound/core/control.c b/sound/core/control.c +index af7e6165e21e..36571cd49be3 100644 +--- a/sound/core/control.c ++++ b/sound/core/control.c +@@ -347,6 +347,40 @@ static int snd_ctl_find_hole(struct snd_card *card, unsigned int count) + return 0; + } + ++/* add a new kcontrol object; call with card->controls_rwsem locked */ ++static int __snd_ctl_add(struct snd_card *card, struct snd_kcontrol *kcontrol) ++{ ++ struct snd_ctl_elem_id id; ++ unsigned int idx; ++ unsigned int count; ++ ++ id = kcontrol->id; ++ if (id.index > UINT_MAX - kcontrol->count) ++ return -EINVAL; ++ ++ if (snd_ctl_find_id(card, &id)) { ++ dev_err(card->dev, ++ "control %i:%i:%i:%s:%i is already present\n", ++ id.iface, id.device, id.subdevice, id.name, id.index); ++ return -EBUSY; ++ } ++ ++ if (snd_ctl_find_hole(card, kcontrol->count) < 0) ++ return -ENOMEM; ++ ++ list_add_tail(&kcontrol->list, &card->controls); ++ card->controls_count += kcontrol->count; ++ kcontrol->id.numid = card->last_numid + 1; ++ card->last_numid += kcontrol->count; ++ ++ id = kcontrol->id; ++ count = kcontrol->count; ++ for (idx = 0; idx < count; idx++, id.index++, id.numid++) ++ snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_ADD, &id); ++ ++ return 0; ++} ++ + /** + * snd_ctl_add - add the control instance to the card + * @card: the card instance +@@ -363,45 +397,18 @@ static int snd_ctl_find_hole(struct snd_card *card, unsigned int count) + */ + int snd_ctl_add(struct snd_card *card, struct snd_kcontrol *kcontrol) + { +- struct snd_ctl_elem_id id; +- unsigned int idx; +- unsigned int count; + int err = -EINVAL; + + if (! 
kcontrol) + return err; + if (snd_BUG_ON(!card || !kcontrol->info)) + goto error; +- id = kcontrol->id; +- if (id.index > UINT_MAX - kcontrol->count) +- goto error; + + down_write(&card->controls_rwsem); +- if (snd_ctl_find_id(card, &id)) { +- up_write(&card->controls_rwsem); +- dev_err(card->dev, "control %i:%i:%i:%s:%i is already present\n", +- id.iface, +- id.device, +- id.subdevice, +- id.name, +- id.index); +- err = -EBUSY; +- goto error; +- } +- if (snd_ctl_find_hole(card, kcontrol->count) < 0) { +- up_write(&card->controls_rwsem); +- err = -ENOMEM; +- goto error; +- } +- list_add_tail(&kcontrol->list, &card->controls); +- card->controls_count += kcontrol->count; +- kcontrol->id.numid = card->last_numid + 1; +- card->last_numid += kcontrol->count; +- id = kcontrol->id; +- count = kcontrol->count; ++ err = __snd_ctl_add(card, kcontrol); + up_write(&card->controls_rwsem); +- for (idx = 0; idx < count; idx++, id.index++, id.numid++) +- snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_ADD, &id); ++ if (err < 0) ++ goto error; + return 0; + + error: +@@ -1360,9 +1367,12 @@ static int snd_ctl_elem_add(struct snd_ctl_file *file, + kctl->tlv.c = snd_ctl_elem_user_tlv; + + /* This function manage to free the instance on failure. */ +- err = snd_ctl_add(card, kctl); +- if (err < 0) +- return err; ++ down_write(&card->controls_rwsem); ++ err = __snd_ctl_add(card, kctl); ++ if (err < 0) { ++ snd_ctl_free_one(kctl); ++ goto unlock; ++ } + offset = snd_ctl_get_ioff(kctl, &info->id); + snd_ctl_build_ioff(&info->id, kctl, offset); + /* +@@ -1373,10 +1383,10 @@ static int snd_ctl_elem_add(struct snd_ctl_file *file, + * which locks the element. + */ + +- down_write(&card->controls_rwsem); + card->user_ctl_count++; +- up_write(&card->controls_rwsem); + ++ unlock: ++ up_write(&card->controls_rwsem); + return 0; + } + +diff --git a/sound/isa/wss/wss_lib.c b/sound/isa/wss/wss_lib.c +index 8a852042a066..91cd305cabd7 100644 +--- a/sound/isa/wss/wss_lib.c ++++ b/sound/isa/wss/wss_lib.c +@@ -1531,7 +1531,6 @@ static int snd_wss_playback_open(struct snd_pcm_substream *substream) + if (err < 0) { + if (chip->release_dma) + chip->release_dma(chip, chip->dma_private_data, chip->dma1); +- snd_free_pages(runtime->dma_area, runtime->dma_bytes); + return err; + } + chip->playback_substream = substream; +@@ -1572,7 +1571,6 @@ static int snd_wss_capture_open(struct snd_pcm_substream *substream) + if (err < 0) { + if (chip->release_dma) + chip->release_dma(chip, chip->dma_private_data, chip->dma2); +- snd_free_pages(runtime->dma_area, runtime->dma_bytes); + return err; + } + chip->capture_substream = substream; +diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c +index 1ef7cdf1d3e8..38f355ae1863 100644 +--- a/sound/pci/ac97/ac97_codec.c ++++ b/sound/pci/ac97/ac97_codec.c +@@ -824,7 +824,7 @@ static int snd_ac97_put_spsa(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_ + { + struct snd_ac97 *ac97 = snd_kcontrol_chip(kcontrol); + int reg = kcontrol->private_value & 0xff; +- int shift = (kcontrol->private_value >> 8) & 0xff; ++ int shift = (kcontrol->private_value >> 8) & 0x0f; + int mask = (kcontrol->private_value >> 16) & 0xff; + // int invert = (kcontrol->private_value >> 24) & 0xff; + unsigned short value, old, new; +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index eb8807de3ebc..66b0a124beae 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -343,6 +343,7 @@ static void alc_fill_eapd_coef(struct hda_codec *codec) + case 0x10ec0285: + 
case 0x10ec0298: + case 0x10ec0289: ++ case 0x10ec0300: + alc_update_coef_idx(codec, 0x10, 1<<9, 0); + break; + case 0x10ec0275: +@@ -2758,6 +2759,7 @@ enum { + ALC269_TYPE_ALC215, + ALC269_TYPE_ALC225, + ALC269_TYPE_ALC294, ++ ALC269_TYPE_ALC300, + ALC269_TYPE_ALC700, + }; + +@@ -2792,6 +2794,7 @@ static int alc269_parse_auto_config(struct hda_codec *codec) + case ALC269_TYPE_ALC215: + case ALC269_TYPE_ALC225: + case ALC269_TYPE_ALC294: ++ case ALC269_TYPE_ALC300: + case ALC269_TYPE_ALC700: + ssids = alc269_ssids; + break; +@@ -6408,6 +6411,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x144d, 0xc740, "Samsung Ativ book 8 (NP870Z5G)", ALC269_FIXUP_ATIV_BOOK_8), + SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), ++ SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x17aa, 0x1036, "Lenovo P520", ALC233_FIXUP_LENOVO_MULTI_CODECS), + SND_PCI_QUIRK(0x17aa, 0x20f2, "Thinkpad SL410/510", ALC269_FIXUP_SKU_IGNORE), + SND_PCI_QUIRK(0x17aa, 0x215e, "Thinkpad L512", ALC269_FIXUP_SKU_IGNORE), +@@ -7089,6 +7093,10 @@ static int patch_alc269(struct hda_codec *codec) + spec->gen.mixer_nid = 0; /* ALC2x4 does not have any loopback mixer path */ + alc_update_coef_idx(codec, 0x6b, 0x0018, (1<<4) | (1<<3)); /* UAJ MIC Vref control by verb */ + break; ++ case 0x10ec0300: ++ spec->codec_variant = ALC269_TYPE_ALC300; ++ spec->gen.mixer_nid = 0; /* no loopback on ALC300 */ ++ break; + case 0x10ec0700: + case 0x10ec0701: + case 0x10ec0703: +@@ -8160,6 +8168,7 @@ static const struct hda_device_id snd_hda_id_realtek[] = { + HDA_CODEC_ENTRY(0x10ec0295, "ALC295", patch_alc269), + HDA_CODEC_ENTRY(0x10ec0298, "ALC298", patch_alc269), + HDA_CODEC_ENTRY(0x10ec0299, "ALC299", patch_alc269), ++ HDA_CODEC_ENTRY(0x10ec0300, "ALC300", patch_alc269), + HDA_CODEC_REV_ENTRY(0x10ec0861, 0x100340, "ALC660", patch_alc861), + HDA_CODEC_ENTRY(0x10ec0660, "ALC660-VD", patch_alc861vd), + HDA_CODEC_ENTRY(0x10ec0861, "ALC861", patch_alc861), +diff --git a/sound/sparc/cs4231.c b/sound/sparc/cs4231.c +index e73c962590eb..079063d8038d 100644 +--- a/sound/sparc/cs4231.c ++++ b/sound/sparc/cs4231.c +@@ -1146,10 +1146,8 @@ static int snd_cs4231_playback_open(struct snd_pcm_substream *substream) + runtime->hw = snd_cs4231_playback; + + err = snd_cs4231_open(chip, CS4231_MODE_PLAY); +- if (err < 0) { +- snd_free_pages(runtime->dma_area, runtime->dma_bytes); ++ if (err < 0) + return err; +- } + chip->playback_substream = substream; + chip->p_periods_sent = 0; + snd_pcm_set_sync(substream); +@@ -1167,10 +1165,8 @@ static int snd_cs4231_capture_open(struct snd_pcm_substream *substream) + runtime->hw = snd_cs4231_capture; + + err = snd_cs4231_open(chip, CS4231_MODE_RECORD); +- if (err < 0) { +- snd_free_pages(runtime->dma_area, runtime->dma_bytes); ++ if (err < 0) + return err; +- } + chip->capture_substream = substream; + chip->c_periods_sent = 0; + snd_pcm_set_sync(substream); |
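
The kernel/bpf/verifier.c hunks earlier in this patch rewrite the instruction stream so that any stack store flagged with sanitize_stack_off is preceded by a store of zero to the same slot (the speculative-store-bypass / CVE-2018-3639 case). Below is a toy, userspace-only sketch of that prepend-a-zeroing-store pattern; the struct insn layout, field names and helper names are invented for illustration and are not BPF structures.

#include <stdio.h>
#include <string.h>

struct insn {
        const char *op;
        int off;        /* stack offset */
        int imm;
        int sanitize;   /* analogue of insn_aux_data[].sanitize_stack_off */
};

#define MAX_INSNS 16

static int patch_prog(struct insn *prog, int cnt)
{
        for (int i = 0; i < cnt; i++) {
                if (!prog[i].sanitize)
                        continue;
                if (cnt == MAX_INSNS)
                        return -1;
                /* make room, then prepend a "store 0" to the same slot */
                memmove(&prog[i + 1], &prog[i], (cnt - i) * sizeof(*prog));
                prog[i] = (struct insn){ "st_zero", prog[i + 1].off, 0, 0 };
                cnt++;
                i++;            /* skip over the original store we just kept */
        }
        return cnt;
}

int main(void)
{
        struct insn prog[MAX_INSNS] = {
                { "mov",  0, 1, 0 },
                { "stx", -8, 0, 1 },    /* flagged: slot fp-8 was reused */
                { "exit", 0, 0, 0 },
        };
        int cnt = patch_prog(prog, 3);

        for (int i = 0; i < cnt; i++)
                printf("%d: %s off=%d\n", i, prog[i].op, prog[i].off);
        return 0;
}

In the real verifier the replacement is done through bpf_patch_insn_data(), which also adjusts jump offsets; the sketch only shows the prepend-then-skip shape of the rewrite.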
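
The sched_smt_present change in kernel/sched/core.c replaces the one-shot static_branch_enable_cpuslocked() with reference counting: a core bumps the key when its second sibling comes online (sibling-mask weight hits 2) and drops it again on the 2-to-1 transition. A minimal userspace analogue of that counting follows; the names here (online[][], smt_cores, cpu_up/cpu_down) are editorial stand-ins for the cpumask and the static key, not kernel symbols.

#include <assert.h>
#include <stdio.h>

#define NR_CORES 2
#define THREADS_PER_CORE 2

static int online[NR_CORES][THREADS_PER_CORE]; /* 1 = sibling online */
static int smt_cores;                          /* analogue of the static key count */

static int core_weight(int core)
{
        int w = 0;
        for (int t = 0; t < THREADS_PER_CORE; t++)
                w += online[core][t];
        return w;
}

static void cpu_up(int core, int thread)
{
        online[core][thread] = 1;
        if (core_weight(core) == 2)     /* second sibling just arrived */
                smt_cores++;
}

static void cpu_down(int core, int thread)
{
        if (core_weight(core) == 2)     /* about to drop back to one sibling */
                smt_cores--;
        online[core][thread] = 0;
}

int main(void)
{
        cpu_up(0, 0); cpu_up(0, 1);     /* core 0 becomes SMT */
        cpu_up(1, 0);                   /* core 1 single-threaded */
        assert(smt_cores == 1);

        cpu_down(0, 1);                 /* core 0 loses its sibling */
        assert(smt_cores == 0);         /* "sched_smt_present" would go false */

        printf("smt_cores tracking ok\n");
        return 0;
}

The "== 2" test is what keeps the inc/dec pairs balanced: each core contributes exactly one reference, no matter how many times individual siblings are hotplugged.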
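
For peers that advertise CEPHX_V2, the new branch in calc_signature() (net/ceph/auth_x.c) no longer wraps the CRC block in a ceph_x_encrypt() envelope: it encrypts the CRC/length block in place, with no leading length and no cephx header, and folds the first four 64-bit words of the ciphertext into the signature by XOR. A rough sketch of just that folding step is below; fake_crypt() is a placeholder for ceph_crypt() so the example runs stand-alone, and the byte values are arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void fake_crypt(uint8_t *buf, size_t len)
{
        /* stand-in for ceph_crypt(); the real code encrypts with the session key */
        for (size_t i = 0; i < len; i++)
                buf[i] ^= 0xA5;
}

static uint64_t fold_signature(const uint8_t *enc)
{
        uint64_t a, b, c, d;

        /* the patch reads the ciphertext as four 64-bit words: a ^ b ^ c ^ d */
        memcpy(&a, enc +  0, 8);
        memcpy(&b, enc +  8, 8);
        memcpy(&c, enc + 16, 8);
        memcpy(&d, enc + 24, 8);
        return a ^ b ^ c ^ d;
}

int main(void)
{
        /* header/front/middle/data CRCs, lengths and seq word: 32 bytes total */
        uint8_t sigblock[32] = { 1, 2, 3 };

        fake_crypt(sigblock, sizeof(sigblock));
        printf("psig = %#llx\n",
               (unsigned long long)fold_signature(sigblock));
        return 0;
}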
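
The net/tls rework stops editing a single global tls_sw_prot and instead builds a small table of proto variants, indexed by address family (TLSV4/TLSV6) and TX configuration (TLS_BASE_TX/TLS_SW_TX), so update_sk_prot() just points each socket at the right precomputed entry. A compact userspace sketch of that table-building shape follows; struct ops and the method names are stand-ins for struct proto, not kernel definitions.

#include <stdio.h>

enum { V4, V6, NUM_FAMS };
enum { BASE_TX, SW_TX, NUM_CONFS };

struct ops {
        const char *(*sendmsg)(void);
        const char *(*close)(void);
};

static const char *tcp_sendmsg(void)    { return "tcp_sendmsg"; }
static const char *tcp_close(void)      { return "tcp_close"; }
static const char *tls_close(void)      { return "tls_close"; }
static const char *tls_sw_sendmsg(void) { return "tls_sw_sendmsg"; }

static struct ops table[NUM_FAMS][NUM_CONFS];

static void build_ops(struct ops *row, const struct ops *base)
{
        row[BASE_TX] = *base;           /* start from the transport's own ops */
        row[BASE_TX].close = tls_close; /* hook close for TLS teardown */

        row[SW_TX] = row[BASE_TX];      /* derive the SW TX variant from it */
        row[SW_TX].sendmsg = tls_sw_sendmsg;
}

int main(void)
{
        struct ops tcp = { tcp_sendmsg, tcp_close };

        build_ops(table[V4], &tcp);
        build_ops(table[V6], &tcp);     /* the patch builds the IPv6 row lazily */

        printf("v4/base: %s, v4/sw: %s\n",
               table[V4][BASE_TX].sendmsg(), table[V4][SW_TX].sendmsg());
        return 0;
}

Deriving the SW_TX row from the BASE_TX row, as build_protos() does, keeps the overrides additive: anything not explicitly hooked falls through to the underlying TCP callbacks.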
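
The sound/core/control.c change factors the duplicate-ID check, list insertion and numid accounting into __snd_ctl_add(), which runs entirely under controls_rwsem, so snd_ctl_elem_add() can take the write lock once instead of dropping it between snd_ctl_add() and the user_ctl_count update. Below is a small pthread sketch of that check-and-insert-under-one-write-lock shape; the integer "registry" and all names are illustrative only, not ALSA code. Build with cc file.c -lpthread.

#include <pthread.h>
#include <stdio.h>

#define MAX_IDS 16

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int ids[MAX_IDS];
static int nr_ids;

/* analogue of __snd_ctl_add(): caller must hold the write lock */
static int __add_id_locked(int id)
{
        for (int i = 0; i < nr_ids; i++)
                if (ids[i] == id)
                        return -1;      /* already present (-EBUSY in the patch) */
        if (nr_ids == MAX_IDS)
                return -1;
        ids[nr_ids++] = id;
        return 0;
}

/* analogue of snd_ctl_add(): public entry point that takes the lock itself */
static int add_id(int id)
{
        pthread_rwlock_wrlock(&lock);
        int err = __add_id_locked(id);
        pthread_rwlock_unlock(&lock);
        return err;
}

int main(void)
{
        add_id(42);
        printf("second add of 42 -> %d (expected -1)\n", add_id(42));
        return 0;
}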