On 20.10.2010 13:48, Jan Kiszka wrote:
> On 14.10.2010 11:22, Gleb Natapov wrote:
>> When async PF capability is detected hook up special page fault handler
>> that will handle async page fault events and bypass other page faults to
>> regular page fault handler. Also add async PF handling to nested SVM
>> emulation. Async PF always generates exit to L1 where vcpu thread will
>> be scheduled out until page is available.
>>
>> Acked-by: Rik van Riel <riel@xxxxxxxxxx>
>> Signed-off-by: Gleb Natapov <gleb@xxxxxxxxxx>
>> ---
>>  arch/x86/include/asm/kvm_para.h |   12 +++
>>  arch/x86/include/asm/traps.h    |    1 +
>>  arch/x86/kernel/entry_32.S      |   10 ++
>>  arch/x86/kernel/entry_64.S      |    3 +
>>  arch/x86/kernel/kvm.c           |  181 +++++++++++++++++++++++++++++++++++++++
>>  arch/x86/kvm/svm.c              |   45 ++++++++--
>>  6 files changed, 243 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
>> index 2315398..fbfd367 100644
>> --- a/arch/x86/include/asm/kvm_para.h
>> +++ b/arch/x86/include/asm/kvm_para.h
>> @@ -65,6 +65,9 @@ struct kvm_mmu_op_release_pt {
>>  	__u64 pt_phys;
>>  };
>>
>> +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
>> +#define KVM_PV_REASON_PAGE_READY 2
>> +
>>  struct kvm_vcpu_pv_apf_data {
>>  	__u32 reason;
>>  	__u8 pad[60];
>> @@ -171,8 +174,17 @@ static inline unsigned int kvm_arch_para_features(void)
>>
>>  #ifdef CONFIG_KVM_GUEST
>>  void __init kvm_guest_init(void);
>> +void kvm_async_pf_task_wait(u32 token);
>> +void kvm_async_pf_task_wake(u32 token);
>> +u32 kvm_read_and_reset_pf_reason(void);
>>  #else
>>  #define kvm_guest_init() do { } while (0)
>> +#define kvm_async_pf_task_wait(T) do {} while(0)
>> +#define kvm_async_pf_task_wake(T) do {} while(0)
>> +static inline u32 kvm_read_and_reset_pf_reason(void)
>> +{
>> +	return 0;
>> +}
>>  #endif
>>
>>  #endif /* __KERNEL__ */
>> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
>> index f66cda5..0310da6 100644
>> --- a/arch/x86/include/asm/traps.h
>> +++ b/arch/x86/include/asm/traps.h
>> @@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
>>  asmlinkage void stack_segment(void);
>>  asmlinkage void general_protection(void);
>>  asmlinkage void page_fault(void);
>> +asmlinkage void async_page_fault(void);
>>  asmlinkage void spurious_interrupt_bug(void);
>>  asmlinkage void coprocessor_error(void);
>>  asmlinkage void alignment_check(void);
>> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
>> index 227d009..e6e7273 100644
>> --- a/arch/x86/kernel/entry_32.S
>> +++ b/arch/x86/kernel/entry_32.S
>> @@ -1496,6 +1496,16 @@ ENTRY(general_protection)
>>  	CFI_ENDPROC
>>  END(general_protection)
>>
>> +#ifdef CONFIG_KVM_GUEST
>> +ENTRY(async_page_fault)
>> +	RING0_EC_FRAME
>> +	pushl $do_async_page_fault
>> +	CFI_ADJUST_CFA_OFFSET 4
>> +	jmp error_code
>> +	CFI_ENDPROC
>> +END(async_page_fault)
>> +#endif
>> +
>>  /*
>>   * End of kprobes section
>>   */
>> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> index 17be5ec..def98c3 100644
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -1349,6 +1349,9 @@ errorentry xen_stack_segment do_stack_segment
>>  #endif
>>  errorentry general_protection do_general_protection
>>  errorentry page_fault do_page_fault
>> +#ifdef CONFIG_KVM_GUEST
>> +errorentry async_page_fault do_async_page_fault
>> +#endif
>> #ifdef CONFIG_X86_MCE
>>  paranoidzeroentry machine_check *machine_check_vector(%rip)
>>  #endif
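The guest/host contract added above is deliberately small: before injecting the rerouted #PF, the host stores one of the two KVM_PV_REASON_* codes into the per-VCPU kvm_vcpu_pv_apf_data block, and the guest classifies the fault by reading and clearing that word. A minimal stand-alone C model of that read-and-reset step, illustrative only — the names mirror the patch, but the per-CPU placement and the MSR registration handshake from the rest of the series are left out:

  #include <stdint.h>
  #include <stdio.h>

  #define KVM_PV_REASON_PAGE_NOT_PRESENT 1
  #define KVM_PV_REASON_PAGE_READY       2

  /* 4 + 60 = 64 bytes: one cache line, matching the hunk above */
  struct kvm_vcpu_pv_apf_data {
  	uint32_t reason;
  	uint8_t pad[60];
  };

  static struct kvm_vcpu_pv_apf_data apf_reason; /* per-VCPU in the kernel */

  /* Guest side: classify the fault and consume the reason, so a
   * later, ordinary #PF (reason 0) is not misread as an async event. */
  static uint32_t read_and_reset_pf_reason(void)
  {
  	uint32_t reason = apf_reason.reason;
  	apf_reason.reason = 0;
  	return reason;
  }

  int main(void)
  {
  	/* "host" announces an async fault, then injects #PF */
  	apf_reason.reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
  	printf("reason = %u (async: task sleeps on the CR2 token)\n",
  	       read_and_reset_pf_reason());
  	printf("reason = %u (0: falls through to do_page_fault)\n",
  	       read_and_reset_pf_reason());
  	return 0;
  }

The reset is what implements the "bypass" from the changelog: once the reason word is consumed, any fault that arrives with reason 0 takes the regular page fault path.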
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index 032d03b..d564063 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -29,8 +29,14 @@
>>  #include <linux/hardirq.h>
>>  #include <linux/notifier.h>
>>  #include <linux/reboot.h>
>> +#include <linux/hash.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/kprobes.h>
>>  #include <asm/timer.h>
>>  #include <asm/cpu.h>
>> +#include <asm/traps.h>
>> +#include <asm/desc.h>
>>
>>  #define MMU_QUEUE_SIZE 1024
>>
>> @@ -64,6 +70,168 @@ static void kvm_io_delay(void)
>>  {
>>  }
>>
>> +#define KVM_TASK_SLEEP_HASHBITS 8
>> +#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
>> +
>> +struct kvm_task_sleep_node {
>> +	struct hlist_node link;
>> +	wait_queue_head_t wq;
>> +	u32 token;
>> +	int cpu;
>> +};
>> +
>> +static struct kvm_task_sleep_head {
>> +	spinlock_t lock;
>> +	struct hlist_head list;
>> +} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
>> +
>> +static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
>> +						  u32 token)
>> +{
>> +	struct hlist_node *p;
>> +
>> +	hlist_for_each(p, &b->list) {
>> +		struct kvm_task_sleep_node *n =
>> +			hlist_entry(p, typeof(*n), link);
>> +		if (n->token == token)
>> +			return n;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +void kvm_async_pf_task_wait(u32 token)
>> +{
>> +	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
>> +	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
>> +	struct kvm_task_sleep_node n, *e;
>> +	DEFINE_WAIT(wait);
>> +
>> +	spin_lock(&b->lock);
>> +	e = _find_apf_task(b, token);
>> +	if (e) {
>> +		/* dummy entry exist -> wake up was delivered ahead of PF */
>> +		hlist_del(&e->link);
>> +		kfree(e);
>> +		spin_unlock(&b->lock);
>> +		return;
>> +	}
>> +
>> +	n.token = token;
>> +	n.cpu = smp_processor_id();
>> +	init_waitqueue_head(&n.wq);
>> +	hlist_add_head(&n.link, &b->list);
>> +	spin_unlock(&b->lock);
>> +
>> +	for (;;) {
>> +		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
>> +		if (hlist_unhashed(&n.link))
>> +			break;
>> +		local_irq_enable();
>> +		schedule();
>> +		local_irq_disable();
>> +	}
>> +	finish_wait(&n.wq, &wait);
>> +
>> +	return;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
>> +
>> +static void apf_task_wake_one(struct kvm_task_sleep_node *n)
>> +{
>> +	hlist_del_init(&n->link);
>> +	if (waitqueue_active(&n->wq))
>> +		wake_up(&n->wq);
>> +}
>> +
>> +static void apf_task_wake_all(void)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
>> +		struct hlist_node *p, *next;
>> +		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
>> +		spin_lock(&b->lock);
>> +		hlist_for_each_safe(p, next, &b->list) {
>> +			struct kvm_task_sleep_node *n =
>> +				hlist_entry(p, typeof(*n), link);
>> +			if (n->cpu == smp_processor_id())
>> +				apf_task_wake_one(n);
>> +		}
>> +		spin_unlock(&b->lock);
>> +	}
>> +}
>> +
>> +void kvm_async_pf_task_wake(u32 token)
>> +{
>> +	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
>> +	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
>> +	struct kvm_task_sleep_node *n;
>> +
>> +	if (token == ~0) {
>> +		apf_task_wake_all();
>> +		return;
>> +	}
>> +
>> +again:
>> +	spin_lock(&b->lock);
>> +	n = _find_apf_task(b, token);
>> +	if (!n) {
>> +		/*
>> +		 * async PF was not yet handled.
>> +		 * Add dummy entry for the token.
>> +		 */
>> +		n = kmalloc(sizeof(*n), GFP_ATOMIC);
>> +		if (!n) {
>> +			/*
>> +			 * Allocation failed! Busy wait while other cpu
>> +			 * handles async PF.
>> +			 */
>> +			spin_unlock(&b->lock);
>> +			cpu_relax();
>> +			goto again;
>> +		}
>> +		n->token = token;
>> +		n->cpu = smp_processor_id();
>> +		init_waitqueue_head(&n->wq);
>> +		hlist_add_head(&n->link, &b->list);
>> +	} else
>> +		apf_task_wake_one(n);
>> +	spin_unlock(&b->lock);
>> +	return;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
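The subtle case in the pair of functions above is ordering: the PAGE_READY wakeup for a token can arrive before the task that took the corresponding PAGE_NOT_PRESENT fault has parked itself, so kvm_async_pf_task_wake() plants a dummy node that a later waiter consumes and returns from immediately. A single-threaded sketch of just that rendezvous invariant — illustrative only, with the hashed, spinlocked sleeper buckets and wait queues of the real code stripped away, and tokens assumed to be below NTOKENS:

  #include <stdbool.h>
  #include <stdio.h>

  #define NTOKENS 16
  static bool pending_wake[NTOKENS];	/* dummy entry planted by wake */
  static bool parked_waiter[NTOKENS];	/* node parked by wait */

  static void task_wait(unsigned token)
  {
  	if (pending_wake[token]) {	/* wake beat us: consume the dummy */
  		pending_wake[token] = false;
  		return;
  	}
  	parked_waiter[token] = true;	/* park until wake arrives */
  }

  static void task_wake(unsigned token)
  {
  	if (parked_waiter[token])	/* normal order: wake the waiter */
  		parked_waiter[token] = false;
  	else				/* wake came first: leave a dummy */
  		pending_wake[token] = true;
  }

  int main(void)
  {
  	task_wait(3); task_wake(3);	/* normal order */
  	task_wake(5); task_wait(5);	/* PAGE_READY raced ahead */
  	printf("parked=%d pending=%d\n", parked_waiter[5], pending_wake[5]);
  	return 0;
  }

Whichever side arrives first leaves state behind for the other, so no wakeup is ever lost; the kmalloc(GFP_ATOMIC) busy-wait fallback above covers the one case where the waker cannot leave that state.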
>> +
>> +u32 kvm_read_and_reset_pf_reason(void)
>> +{
>> +	u32 reason = 0;
>> +
>> +	if (__get_cpu_var(apf_reason).enabled) {
>> +		reason = __get_cpu_var(apf_reason).reason;
>> +		__get_cpu_var(apf_reason).reason = 0;
>> +	}
>> +
>> +	return reason;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
>> +
>> +dotraplinkage void __kprobes
>> +do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
>> +{
>> +	switch (kvm_read_and_reset_pf_reason()) {
>> +	default:
>> +		do_page_fault(regs, error_code);
>> +		break;
>> +	case KVM_PV_REASON_PAGE_NOT_PRESENT:
>> +		/* page is swapped out by the host. */
>> +		kvm_async_pf_task_wait((u32)read_cr2());
>> +		break;
>> +	case KVM_PV_REASON_PAGE_READY:
>> +		kvm_async_pf_task_wake((u32)read_cr2());
>> +		break;
>> +	}
>> +}
>> +
>>  static void kvm_mmu_op(void *buffer, unsigned len)
>>  {
>>  	int r;
>> @@ -300,6 +468,7 @@ static void kvm_guest_cpu_online(void *dummy)
>>  static void kvm_guest_cpu_offline(void *dummy)
>>  {
>>  	kvm_pv_disable_apf(NULL);
>> +	apf_task_wake_all();
>>  }
>>
>>  static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
>> @@ -327,13 +496,25 @@ static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
>>  };
>>  #endif
>>
>> +static void __init kvm_apf_trap_init(void)
>> +{
>> +	set_intr_gate(14, &async_page_fault);
>> +}
>> +
>>  void __init kvm_guest_init(void)
>>  {
>> +	int i;
>> +
>>  	if (!kvm_para_available())
>>  		return;
>>
>>  	paravirt_ops_setup();
>>  	register_reboot_notifier(&kvm_pv_reboot_nb);
>> +	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
>> +		spin_lock_init(&async_pf_sleepers[i].lock);
>> +	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
>> +		x86_init.irqs.trap_init = kvm_apf_trap_init;
>> +
>>  #ifdef CONFIG_SMP
>>  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
>>  	register_cpu_notifier(&kvm_cpu_notifier);
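The gate in kvm_guest_init() above boils down to a CPUID check: KVM advertises its paravirtual features in a vendor-specific leaf, and only when the async-PF bit is present does the guest override x86_init.irqs.trap_init and, with it, the IDT entry for vector 14. A user-space probe of that bit might look like the sketch below — the leaf number (0x40000001) and bit position (4) match this era's kvm_para.h but should be read as assumptions, not as part of the patch:

  #include <stdint.h>
  #include <stdio.h>

  #define KVM_CPUID_FEATURES   0x40000001	/* assumed KVM feature leaf */
  #define KVM_FEATURE_ASYNC_PF 4		/* assumed bit from the series */

  static uint32_t cpuid_eax(uint32_t leaf)
  {
  	uint32_t eax, ebx, ecx, edx;

  	__asm__ volatile("cpuid"
  			 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
  			 : "a"(leaf));
  	(void)ebx; (void)ecx; (void)edx;
  	return eax;
  }

  int main(void)
  {
  	if (cpuid_eax(KVM_CPUID_FEATURES) & (1 << KVM_FEATURE_ASYNC_PF))
  		puts("async PF advertised: guest would reroute vector 14");
  	else
  		puts("no async PF: stock #PF gate stays installed");
  	return 0;
  }

On bare metal or under another hypervisor the leaf returns nothing meaningful, which is exactly why the patch keys the override off kvm_para_has_feature() after kvm_para_available() has confirmed a KVM host.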
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index 9a92224..9fa27a5 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -31,6 +31,7 @@
>>
>>  #include <asm/tlbflush.h>
>>  #include <asm/desc.h>
>> +#include <asm/kvm_para.h>
>>
>>  #include <asm/virtext.h>
>>  #include "trace.h"
>> @@ -133,6 +134,7 @@ struct vcpu_svm {
>>
>>  	unsigned int3_injected;
>>  	unsigned long int3_rip;
>> +	u32 apf_reason;
>>  };
>>
>>  #define MSR_INVALID 0xffffffffU
>> @@ -1383,16 +1385,33 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
>>
>>  static int pf_interception(struct vcpu_svm *svm)
>>  {
>> -	u64 fault_address;
>> +	u64 fault_address = svm->vmcb->control.exit_info_2;
>>  	u32 error_code;
>> +	int r = 1;
>>
>> -	fault_address = svm->vmcb->control.exit_info_2;
>> -	error_code = svm->vmcb->control.exit_info_1;
>> +	switch (svm->apf_reason) {
>> +	default:
>> +		error_code = svm->vmcb->control.exit_info_1;
>>
>> -	trace_kvm_page_fault(fault_address, error_code);
>> -	if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
>> -		kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
>> -	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
>> +		trace_kvm_page_fault(fault_address, error_code);
>> +		if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
>> +			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
>> +		r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
>> +		break;
>> +	case KVM_PV_REASON_PAGE_NOT_PRESENT:
>> +		svm->apf_reason = 0;
>> +		local_irq_disable();
>> +		kvm_async_pf_task_wait(fault_address);
>> +		local_irq_enable();
>> +		break;
>> +	case KVM_PV_REASON_PAGE_READY:
>> +		svm->apf_reason = 0;
>> +		local_irq_disable();
>> +		kvm_async_pf_task_wake(fault_address);
>> +		local_irq_enable();
>> +		break;
>
> That's only available if CONFIG_KVM_GUEST is set, no? Is there anything
> I missed that resolves this dependency automatically? Otherwise, some
> more #ifdef CONFIG_KVM_GUEST might be needed.

Err, found it. Sorry for the noise.

Jan

--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux