Am 14.10.2010 11:22, Gleb Natapov wrote:
> When the async PF capability is detected, hook up a special page fault
> handler that will handle async page fault events and pass other page
> faults through to the regular page fault handler. Also add async PF
> handling to nested SVM emulation. Async PF always generates an exit to
> L1, where the vcpu thread will be scheduled out until the page is
> available.
>
> Acked-by: Rik van Riel <riel@xxxxxxxxxx>
> Signed-off-by: Gleb Natapov <gleb@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_para.h |   12 +++
>  arch/x86/include/asm/traps.h    |    1 +
>  arch/x86/kernel/entry_32.S      |   10 ++
>  arch/x86/kernel/entry_64.S      |    3 +
>  arch/x86/kernel/kvm.c           |  181 +++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/svm.c              |   45 ++++++++--
>  6 files changed, 243 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> index 2315398..fbfd367 100644
> --- a/arch/x86/include/asm/kvm_para.h
> +++ b/arch/x86/include/asm/kvm_para.h
> @@ -65,6 +65,9 @@ struct kvm_mmu_op_release_pt {
>  	__u64 pt_phys;
>  };
>  
> +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
> +#define KVM_PV_REASON_PAGE_READY 2
> +
>  struct kvm_vcpu_pv_apf_data {
>  	__u32 reason;
>  	__u8 pad[60];
> @@ -171,8 +174,17 @@ static inline unsigned int kvm_arch_para_features(void)
>  
>  #ifdef CONFIG_KVM_GUEST
>  void __init kvm_guest_init(void);
> +void kvm_async_pf_task_wait(u32 token);
> +void kvm_async_pf_task_wake(u32 token);
> +u32 kvm_read_and_reset_pf_reason(void);
>  #else
>  #define kvm_guest_init() do { } while (0)
> +#define kvm_async_pf_task_wait(T) do { } while (0)
> +#define kvm_async_pf_task_wake(T) do { } while (0)
> +static inline u32 kvm_read_and_reset_pf_reason(void)
> +{
> +	return 0;
> +}
>  #endif
>  
>  #endif /* __KERNEL__ */
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index f66cda5..0310da6 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
>  asmlinkage void stack_segment(void);
>  asmlinkage void general_protection(void);
>  asmlinkage void page_fault(void);
> +asmlinkage void async_page_fault(void);
>  asmlinkage void spurious_interrupt_bug(void);
>  asmlinkage void coprocessor_error(void);
>  asmlinkage void alignment_check(void);
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index 227d009..e6e7273 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -1496,6 +1496,16 @@ ENTRY(general_protection)
>  	CFI_ENDPROC
>  END(general_protection)
>  
> +#ifdef CONFIG_KVM_GUEST
> +ENTRY(async_page_fault)
> +	RING0_EC_FRAME
> +	pushl $do_async_page_fault
> +	CFI_ADJUST_CFA_OFFSET 4
> +	jmp error_code
> +	CFI_ENDPROC
> +END(async_page_fault)
> +#endif
> +
>  /*
>   * End of kprobes section
>   */
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 17be5ec..def98c3 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1349,6 +1349,9 @@ errorentry xen_stack_segment do_stack_segment
>  #endif
>  errorentry general_protection do_general_protection
>  errorentry page_fault do_page_fault
> +#ifdef CONFIG_KVM_GUEST
> +errorentry async_page_fault do_async_page_fault
> +#endif
>  #ifdef CONFIG_X86_MCE
>  paranoidzeroentry machine_check *machine_check_vector(%rip)
>  #endif
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 032d03b..d564063 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -29,8 +29,14 @@
>  #include <linux/hardirq.h>
>  #include <linux/notifier.h>
>  #include <linux/reboot.h>
> +#include <linux/hash.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/kprobes.h>
>  #include <asm/timer.h>
>  #include <asm/cpu.h>
> +#include <asm/traps.h>
> +#include <asm/desc.h>
>  
>  #define MMU_QUEUE_SIZE 1024
>  
> @@ -64,6 +70,168 @@ static void kvm_io_delay(void)
>  {
>  }
>  
> +#define KVM_TASK_SLEEP_HASHBITS 8
> +#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
> +
> +struct kvm_task_sleep_node {
> +	struct hlist_node link;
> +	wait_queue_head_t wq;
> +	u32 token;
> +	int cpu;
> +};
> +
> +static struct kvm_task_sleep_head {
> +	spinlock_t lock;
> +	struct hlist_head list;
> +} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
> +
> +static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
> +						  u32 token)
> +{
> +	struct hlist_node *p;
> +
> +	hlist_for_each(p, &b->list) {
> +		struct kvm_task_sleep_node *n =
> +			hlist_entry(p, typeof(*n), link);
> +		if (n->token == token)
> +			return n;
> +	}
> +
> +	return NULL;
> +}
> +
> +void kvm_async_pf_task_wait(u32 token)
> +{
> +	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
> +	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
> +	struct kvm_task_sleep_node n, *e;
> +	DEFINE_WAIT(wait);
> +
> +	spin_lock(&b->lock);
> +	e = _find_apf_task(b, token);
> +	if (e) {
> +		/* dummy entry exists -> wake up was delivered ahead of PF */
> +		hlist_del(&e->link);
> +		kfree(e);
> +		spin_unlock(&b->lock);
> +		return;
> +	}
> +
> +	n.token = token;
> +	n.cpu = smp_processor_id();
> +	init_waitqueue_head(&n.wq);
> +	hlist_add_head(&n.link, &b->list);
> +	spin_unlock(&b->lock);
> +
> +	for (;;) {
> +		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
> +		if (hlist_unhashed(&n.link))
> +			break;
> +		local_irq_enable();
> +		schedule();
> +		local_irq_disable();
> +	}
> +	finish_wait(&n.wq, &wait);
> +
> +	return;
> +}
> +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
> +
> +static void apf_task_wake_one(struct kvm_task_sleep_node *n)
> +{
> +	hlist_del_init(&n->link);
> +	if (waitqueue_active(&n->wq))
> +		wake_up(&n->wq);
> +}
> +
> +static void apf_task_wake_all(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
> +		struct hlist_node *p, *next;
> +		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
> +		spin_lock(&b->lock);
> +		hlist_for_each_safe(p, next, &b->list) {
> +			struct kvm_task_sleep_node *n =
> +				hlist_entry(p, typeof(*n), link);
> +			if (n->cpu == smp_processor_id())
> +				apf_task_wake_one(n);
> +		}
> +		spin_unlock(&b->lock);
> +	}
> +}
> +
> +void kvm_async_pf_task_wake(u32 token)
> +{
> +	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
> +	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
> +	struct kvm_task_sleep_node *n;
> +
> +	if (token == ~0) {
> +		apf_task_wake_all();
> +		return;
> +	}
> +
> +again:
> +	spin_lock(&b->lock);
> +	n = _find_apf_task(b, token);
> +	if (!n) {
> +		/*
> +		 * async PF was not yet handled.
> +		 * Add dummy entry for the token.
> +		 */
> +		n = kmalloc(sizeof(*n), GFP_ATOMIC);
> +		if (!n) {
> +			/*
> +			 * Allocation failed! Busy wait while other CPU
> +			 * handles async PF.
> +			 */
> +			spin_unlock(&b->lock);
> +			cpu_relax();
> +			goto again;
> +		}
> +		n->token = token;
> +		n->cpu = smp_processor_id();
> +		init_waitqueue_head(&n->wq);
> +		hlist_add_head(&n->link, &b->list);
> +	} else
> +		apf_task_wake_one(n);
> +	spin_unlock(&b->lock);
> +	return;
> +}
> +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
> +
> +u32 kvm_read_and_reset_pf_reason(void)
> +{
> +	u32 reason = 0;
> +
> +	if (__get_cpu_var(apf_reason).enabled) {
> +		reason = __get_cpu_var(apf_reason).reason;
> +		__get_cpu_var(apf_reason).reason = 0;
> +	}
> +
> +	return reason;
> +}
> +EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
> +
> +dotraplinkage void __kprobes
> +do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
> +{
> +	switch (kvm_read_and_reset_pf_reason()) {
> +	default:
> +		do_page_fault(regs, error_code);
> +		break;
> +	case KVM_PV_REASON_PAGE_NOT_PRESENT:
> +		/* page is swapped out by the host. */
> +		kvm_async_pf_task_wait((u32)read_cr2());
> +		break;
> +	case KVM_PV_REASON_PAGE_READY:
> +		kvm_async_pf_task_wake((u32)read_cr2());
> +		break;
> +	}
> +}
> +
>  static void kvm_mmu_op(void *buffer, unsigned len)
>  {
>  	int r;
> @@ -300,6 +468,7 @@ static void kvm_guest_cpu_online(void *dummy)
>  static void kvm_guest_cpu_offline(void *dummy)
>  {
>  	kvm_pv_disable_apf(NULL);
> +	apf_task_wake_all();
>  }
>  
>  static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
> @@ -327,13 +496,25 @@ static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
>  };
>  #endif
>  
> +static void __init kvm_apf_trap_init(void)
> +{
> +	set_intr_gate(14, &async_page_fault);
> +}
> +
>  void __init kvm_guest_init(void)
>  {
> +	int i;
> +
>  	if (!kvm_para_available())
>  		return;
>  
>  	paravirt_ops_setup();
>  	register_reboot_notifier(&kvm_pv_reboot_nb);
> +	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
> +		spin_lock_init(&async_pf_sleepers[i].lock);
> +	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
> +		x86_init.irqs.trap_init = kvm_apf_trap_init;
> +
>  #ifdef CONFIG_SMP
>  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
>  	register_cpu_notifier(&kvm_cpu_notifier);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 9a92224..9fa27a5 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -31,6 +31,7 @@
>  
>  #include <asm/tlbflush.h>
>  #include <asm/desc.h>
> +#include <asm/kvm_para.h>
>  
>  #include <asm/virtext.h>
>  #include "trace.h"
> @@ -133,6 +134,7 @@ struct vcpu_svm {
>  
>  	unsigned int3_injected;
>  	unsigned long int3_rip;
> +	u32 apf_reason;
>  };
>  
>  #define MSR_INVALID 0xffffffffU
> @@ -1383,16 +1385,33 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
>  
>  static int pf_interception(struct vcpu_svm *svm)
>  {
> -	u64 fault_address;
> +	u64 fault_address = svm->vmcb->control.exit_info_2;
>  	u32 error_code;
> +	int r = 1;
>  
> -	fault_address = svm->vmcb->control.exit_info_2;
> -	error_code = svm->vmcb->control.exit_info_1;
> +	switch (svm->apf_reason) {
> +	default:
> +		error_code = svm->vmcb->control.exit_info_1;
>  
> -	trace_kvm_page_fault(fault_address, error_code);
> -	if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
> -		kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
> -	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
> +		trace_kvm_page_fault(fault_address, error_code);
> +		if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
> +			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
> +		r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
> +		break;
> +	case KVM_PV_REASON_PAGE_NOT_PRESENT:
> +		svm->apf_reason = 0;
> +		local_irq_disable();
> +		kvm_async_pf_task_wait(fault_address);
> +		local_irq_enable();
> +		break;
> +	case KVM_PV_REASON_PAGE_READY:
> +		svm->apf_reason = 0;
> +		local_irq_disable();
> +		kvm_async_pf_task_wake(fault_address);
> +		local_irq_enable();
> +		break;

That's only available if CONFIG_KVM_GUEST is set, no? Is there anything
I'm missing that resolves this dependency automatically? Otherwise, some
more #ifdef CONFIG_KVM_GUEST might be needed.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
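For reference, the kvm_para.h hunk above does provide no-op stubs for
kvm_async_pf_task_wait()/kvm_async_pf_task_wake() when CONFIG_KVM_GUEST
is unset, so the calls in pf_interception() would still compile; whether
relying on those stubs is acceptable is the open question here. A minimal
sketch of the explicit guard suggested above (illustrative only, not part
of the posted patch) could look like this:

	/*
	 * Sketch: compile the guest-side async PF cases in
	 * pf_interception() only when the guest support that
	 * implements the helpers is built.
	 */
#ifdef CONFIG_KVM_GUEST
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		svm->apf_reason = 0;
		local_irq_disable();
		kvm_async_pf_task_wait(fault_address);	/* sleep until page ready */
		local_irq_enable();
		break;
	case KVM_PV_REASON_PAGE_READY:
		svm->apf_reason = 0;
		local_irq_disable();
		kvm_async_pf_task_wake(fault_address);	/* wake the waiting task */
		local_irq_enable();
		break;
#endif /* CONFIG_KVM_GUEST */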