From: Lai Jiangshan <laijs@xxxxxxxxxxxxxxxxx> Both VMX and SVM made mistakes of calling kvm_read_and_reset_apf_flags() in instrumentable code: vmx_vcpu_enter_exit() ASYNC PF induced VMEXIT save cr2 leave noinstr section -- kernel #PF can happen here due to instrumentation handle_exception_nmi_irqoff kvm_read_and_reset_apf_flags() If kernel #PF happens before handle_exception_nmi_irqoff() after leaving noinstr section due to instrumentation, kernel #PF handler will consume the incorrect apf flags and panic. The problem might be fixed by moving kvm_read_and_reset_apf_flags() into vmx_vcpu_enter_exit() to consume apf flags earlier before leaving noinstr section like the way it handles CR2. But linux kernel only resigters ASYNC PF for userspace and guest, so ASYNC PF can't hit when it is in kernel, so apf flags can be changed to be consumed only when the #PF is from guest or userspace. It is natually that kvm_read_and_reset_apf_flags() in vmx/svm is for page fault from guest. So this patch changes kvm_handle_async_pf() to be called only for page fault from userspace and renames it. Signed-off-by: Lai Jiangshan <laijs@xxxxxxxxxxxxxxxxx> --- arch/x86/include/asm/kvm_para.h | 8 +++---- arch/x86/kernel/kvm.c | 10 +-------- arch/x86/mm/fault.c | 39 ++++++++++++--------------------- 3 files changed, 19 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 56935ebb1dfe..3452e705dfda 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -104,14 +104,14 @@ unsigned int kvm_arch_para_hints(void); void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_apf_flags(void); -bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); +bool __kvm_handle_async_user_pf(struct pt_regs *regs, u32 token); DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); -static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +static inline bool kvm_handle_async_user_pf(struct pt_regs *regs, u32 token) { if (static_branch_unlikely(&kvm_async_pf_enabled)) - return __kvm_handle_async_pf(regs, token); + return __kvm_handle_async_user_pf(regs, token); else return false; } @@ -148,7 +148,7 @@ static inline u32 kvm_read_and_reset_apf_flags(void) return 0; } -static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +static inline bool kvm_handle_async_user_pf(struct pt_regs *regs, u32 token) { return false; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 59abbdad7729..285eeddf98f2 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -240,17 +240,13 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) } EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); -noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) +bool __kvm_handle_async_user_pf(struct pt_regs *regs, u32 token) { u32 flags = kvm_read_and_reset_apf_flags(); - irqentry_state_t state; if (!flags) return false; - state = irqentry_enter(regs); - instrumentation_begin(); - /* * If the host managed to inject an async #PF into an interrupt * disabled region, then die hard as this is not going to end well @@ -260,16 +256,12 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) panic("Host injected async #PF in interrupt disabled region\n"); if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { - if (unlikely(!(user_mode(regs)))) - panic("Host injected async #PF in kernel mode\n"); /* Page is swapped out by the host. */ kvm_async_pf_task_wait_schedule(token); } else { WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); } - instrumentation_end(); - irqentry_exit(regs, state); return true; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4bfed53e210e..dadef2afa185 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1501,30 +1501,6 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) prefetchw(¤t->mm->mmap_lock); - /* - * KVM uses #PF vector to deliver 'page not present' events to guests - * (asynchronous page fault mechanism). The event happens when a - * userspace task is trying to access some valid (from guest's point of - * view) memory which is not currently mapped by the host (e.g. the - * memory is swapped out). Note, the corresponding "page ready" event - * which is injected when the memory becomes available, is delivered via - * an interrupt mechanism and not a #PF exception - * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). - * - * We are relying on the interrupted context being sane (valid RSP, - * relevant locks not held, etc.), which is fine as long as the - * interrupted context had IF=1. We are also relying on the KVM - * async pf type field and CR2 being read consistently instead of - * getting values from real and async page faults mixed up. - * - * Fingers crossed. - * - * The async #PF handling code takes care of idtentry handling - * itself. - */ - if (kvm_handle_async_pf(regs, (u32)address)) - return; - /* * Entry handling for valid #PF from kernel mode is slightly * different: RCU is already watching and rcu_irq_enter() must not @@ -1538,7 +1514,20 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) state = irqentry_enter(regs); instrumentation_begin(); - handle_page_fault(regs, error_code, address); + + /* + * KVM uses #PF vector to deliver 'page not present' events to guests + * (asynchronous page fault mechanism). The event happens when a + * userspace task is trying to access some valid (from guest's point of + * view) memory which is not currently mapped by the host (e.g. the + * memory is swapped out). Note, the corresponding "page ready" event + * which is injected when the memory becomes available, is delivered via + * an interrupt mechanism and not a #PF exception + * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). + */ + if (!user_mode(regs) || !kvm_handle_async_user_pf(regs, (u32)address)) + handle_page_fault(regs, error_code, address); + instrumentation_end(); irqentry_exit(regs, state); -- 2.19.1.6.gb485710b