Strengthen existing control register pinning when running paravirtualized under KVM. Check which bits KVM supports pinning for each control register and only pin supported bits which are already pinned via the existing native protection. Write to KVM CR0/4 pinned MSRs to enable pinning. Initiate KVM assisted pinning directly following the setup of native pinning on boot CPU. For non-boot CPUs initiate paravirtualized pinning on CPU identification. Identification of non-boot CPUs takes place after the boot CPU has setup native CR pinning. Therefore, non-boot CPUs access pinned bits setup by the boot CPU and request that those be pinned. All CPUs request paravirtualized pinning of the same bits which are already pinned natively. Guests using the kexec system call currently do not support paravirtualized control register pinning. This is due to early boot code writing known good values to control registers, these values do not contain the protected bits. This is due to CPU feature identification being done at a later time, when the kernel properly checks if it can enable protections. As such, the pv_cr_pin command line option has been added which instructs the kernel to disable kexec in favor of enabling paravirtualized control register pinning. crashkernel is also disabled when the pv_cr_pin parameter is specified due to its reliance on kexec. When we fix kexec, we will still need a way for a kernel with support to know if the kernel it is attempting to load has support. If a kernel with this enabled attempts to kexec a kernel where this is not supported, it would trigger a fault almost immediately. Liran suggested adding a section to the built image acting as a flag to signify support for being kexec'd by a kernel with pinning enabled. Should that approach be implemented, it is likely that the command line flag (pv_cr_pin) would still be desired for some deprecation period. We wouldn't want the default behavior to change from being able to kexec older kernels to not being able to, as this might break some users workflows. Signed-off-by: John Andersen <john.s.andersen@xxxxxxxxx> --- .../admin-guide/kernel-parameters.txt | 11 ++++++ arch/x86/Kconfig | 10 +++++ arch/x86/include/asm/kvm_para.h | 17 ++++++++ arch/x86/kernel/cpu/common.c | 5 +++ arch/x86/kernel/kvm.c | 39 +++++++++++++++++++ arch/x86/kernel/setup.c | 8 ++++ 6 files changed, 90 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dbc22d684627..8552501a1579 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3836,6 +3836,17 @@ [KNL] Number of legacy pty's. Overwrites compiled-in default number. + pv_cr_pin [SECURITY,X86] + Enable paravirtualized control register pinning. When + running paravirutalized under KVM, request that KVM not + allow the guest to disable kernel protection features + set in CPU control registers. Specifying this option + will disable kexec (and crashkernel). If kexec support + has not been compiled into the kernel and host KVM + supports paravirtualized control register pinning, it + will be active by default without the need to specify + this parameter. + quiet [KNL] Disable most log messages r128= [HW,DRM] diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index beea77046f9b..d47b09ae23bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -800,6 +800,7 @@ config KVM_GUEST bool "KVM Guest support (including kvmclock)" depends on PARAVIRT select PARAVIRT_CLOCK + select PARAVIRT_CR_PIN select ARCH_CPUIDLE_HALTPOLL default y ---help--- @@ -843,6 +844,15 @@ config PARAVIRT_TIME_ACCOUNTING config PARAVIRT_CLOCK bool +config PARAVIRT_CR_PIN + bool "Paravirtual bit pinning for CR0 and CR4" + depends on KVM_GUEST + help + Select this option to have the virtualised guest request that the + hypervisor disallow it from disabling protections set in control + registers. The hypervisor will prevent exploits from disabling + features such as SMEP, SMAP, UMIP, and WP. + config JAILHOUSE_GUEST bool "Jailhouse non-root cell support" depends on X86_64 && PCI diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 9b4df6eaa11a..342b475e7adf 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -102,6 +102,23 @@ static inline void kvm_spinlock_init(void) } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ +#ifdef CONFIG_PARAVIRT_CR_PIN +void __init kvm_paravirt_cr_pinning_init(void); +void kvm_setup_paravirt_cr_pinning(unsigned long cr0_pinned_bits, + unsigned long cr4_pinned_bits); +#else +static inline void kvm_paravirt_cr_pinning_init(void) +{ + return; +} + +static inline void kvm_setup_paravirt_cr_pinning(unsigned long cr0_pinned_bits, + unsigned long cr4_pinned_bits) +{ + return; +} +#endif /* CONFIG_PARAVIRT_CR_PIN */ + #else /* CONFIG_KVM_GUEST */ #define kvm_async_pf_task_wait(T, I) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 08682757e547..bbf1de0cb253 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,6 +21,7 @@ #include <linux/smp.h> #include <linux/io.h> #include <linux/syscore_ops.h> +#include <linux/kvm_para.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> @@ -416,6 +417,8 @@ static void __init setup_cr_pinning(void) mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP); cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask; static_key_enable(&cr_pinning.key); + + kvm_setup_paravirt_cr_pinning(X86_CR0_WP, cr4_pinned_bits); } /* @@ -1589,6 +1592,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); + + kvm_setup_paravirt_cr_pinning(X86_CR0_WP, cr4_pinned_bits); } static __init int setup_noclflush(char *arg) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d817f255aed8..f6c159229e35 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -24,6 +24,8 @@ #include <linux/debugfs.h> #include <linux/nmi.h> #include <linux/swait.h> +#include <linux/init.h> +#include <linux/kexec.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -34,6 +36,7 @@ #include <asm/hypervisor.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> +#include <asm/cmdline.h> static int kvmapf = 1; @@ -708,6 +711,7 @@ static void __init kvm_apic_init(void) static void __init kvm_init_platform(void) { kvmclock_init(); + kvm_paravirt_cr_pinning_init(); x86_platform.apic_post_init = kvm_apic_init; } @@ -857,6 +861,41 @@ void __init kvm_spinlock_init(void) #endif /* CONFIG_PARAVIRT_SPINLOCKS */ +#ifdef CONFIG_PARAVIRT_CR_PIN +static int kvm_paravirt_cr_pinning_enabled __ro_after_init = 0; + +void __init kvm_paravirt_cr_pinning_init() +{ +#ifdef CONFIG_KEXEC_CORE + if (!cmdline_find_option_bool(boot_command_line, "pv_cr_pin")) + return; + + /* Paravirtualized CR pinning is currently incompatible with kexec */ + kexec_load_disabled = 1; +#endif + + kvm_paravirt_cr_pinning_enabled = 1; +} + +void kvm_setup_paravirt_cr_pinning(unsigned long cr0_pinned_bits, + unsigned long cr4_pinned_bits) +{ + u64 mask; + + if (!kvm_paravirt_cr_pinning_enabled) + return; + + if (!kvm_para_has_feature(KVM_FEATURE_CR_PIN)) + return; + + rdmsrl(MSR_KVM_CR0_PIN_ALLOWED, mask); + wrmsrl(MSR_KVM_CR0_PINNED, cr0_pinned_bits & mask); + + rdmsrl(MSR_KVM_CR4_PIN_ALLOWED, mask); + wrmsrl(MSR_KVM_CR4_PINNED, cr4_pinned_bits & mask); +} +#endif + #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL static void kvm_disable_host_haltpoll(void *i) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9e71d6f6e564..a75f9e730cc3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -26,6 +26,9 @@ #include <asm/apic.h> #include <asm/bios_ebda.h> #include <asm/bugs.h> +#include <asm/kasan.h> +#include <asm/cmdline.h> + #include <asm/cpu.h> #include <asm/efi.h> #include <asm/gart.h> @@ -496,6 +499,11 @@ static void __init reserve_crashkernel(void) return; } + if (cmdline_find_option_bool(boot_command_line, "pv_cr_pin")) { + pr_info("Ignoring crashkernel since pv_cr_pin present in cmdline\n"); + return; + } + /* 0 means: find the address automatically */ if (!crash_base) { /* -- 2.21.0