Asynchronous page fault notifies the vcpu that the page it is trying to
access has been swapped out by the host. In response, the guest puts the
task that caused the fault to sleep until the page is swapped in again.
When the missing page is brought back into memory, the guest is notified
and the task resumes execution.

Signed-off-by: Gleb Natapov <gleb@xxxxxxxxxx>
---
 arch/x86/include/asm/kvm_para.h |    3 +
 arch/x86/kernel/kvm.c           |  132 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index d7d7079..79bb7f2 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -49,6 +49,9 @@ struct kvm_mmu_op_release_pt {
 	__u64 pt_phys;
 };
 
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
 struct kvm_vcpu_pv_apf_data {
 	__u32 reason;
 	__u32 enabled;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index fdd0b95..09444c9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,6 +29,8 @@
 #include <linux/hardirq.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 
@@ -54,6 +56,130 @@ static void kvm_io_delay(void)
 {
 }
 
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+	struct hlist_node link;
+	wait_queue_head_t wq;
+	u32 token;
+};
+
+static struct kvm_task_sleep_head {
+	spinlock_t lock;
+	struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+						  u64 token)
+{
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		struct kvm_task_sleep_node *n =
+			hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+
+static void apf_task_wait(struct task_struct *tsk, u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node n, *e;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&b->lock);
+	e = _find_apf_task(b, token);
+	if (e) {
+		/* dummy entry exists -> wake up was delivered ahead of PF */
+		hlist_del(&e->link);
+		kfree(e);
+		spin_unlock(&b->lock);
+		return;
+	}
+
+	n.token = token;
+	init_waitqueue_head(&n.wq);
+	hlist_add_head(&n.link, &b->list);
+	spin_unlock(&b->lock);
+
+	for (;;) {
+		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (hlist_unhashed(&n.link))
+			break;
+		schedule();
+	}
+	finish_wait(&n.wq, &wait);
+
+	return;
+}
+
+static void apf_task_wake(u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node *n;
+
+again:
+	spin_lock(&b->lock);
+	n = _find_apf_task(b, token);
+	if (!n) {
+		/*
+		 * async PF was not yet handled.
+		 * Add dummy entry for the token.
+		 */
+		n = kmalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			/*
+			 * Allocation failed! Busy wait while other vcpu
+			 * handles async PF.
+			 */
+			spin_unlock(&b->lock);
+			cpu_relax();
+			goto again;
+		}
+		n->token = token;
+		hlist_add_head(&n->link, &b->list);
+	} else {
+		hlist_del_init(&n->link);
+		if (waitqueue_active(&n->wq))
+			wake_up(&n->wq);
+	}
+	spin_unlock(&b->lock);
+	return;
+}
+
+int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
+{
+	u32 reason, token;
+
+	if (!per_cpu(apf_reason, smp_processor_id()).enabled)
+		return 0;
+
+	reason = per_cpu(apf_reason, smp_processor_id()).reason;
+	per_cpu(apf_reason, smp_processor_id()).reason = 0;
+
+	token = (u32)read_cr2();
+
+	switch (reason) {
+	default:
+		return 0;
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		/* page is swapped out by the host. */
+		apf_task_wait(current, token);
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		apf_task_wake(token);
+		break;
+	}
+
+	return 1;
+}
+
 static void kvm_mmu_op(void *buffer, unsigned len)
 {
 	int r;
@@ -207,6 +333,9 @@ static void __init paravirt_ops_setup(void)
 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
 		pv_cpu_ops.io_delay = kvm_io_delay;
 
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+		pv_cpu_ops.handle_pf = kvm_handle_pf;
+
 	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
 		pv_mmu_ops.set_pte = kvm_set_pte;
 		pv_mmu_ops.set_pte_at = kvm_set_pte_at;
@@ -270,11 +399,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 
 void __init kvm_guest_init(void)
 {
+	int i;
 	if (!kvm_para_available())
 		return;
 
 	paravirt_ops_setup();
 	register_reboot_notifier(&kvm_pv_reboot_nb);
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+		spin_lock_init(&async_pf_sleepers[i].lock);
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #else
--
1.6.5
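
A note for readers tracing the locking in apf_task_wait()/apf_task_wake():
the subtle case is that the PAGE_READY wake-up for a token can be delivered
before the faulting task has gone to sleep on it, so the waker leaves a
dummy node behind and the would-be sleeper consumes it instead of blocking.
The sketch below is not part of the patch; it is a minimal userspace model
of that ordering, assuming a single list under one mutex in place of the
hashed buckets of kernel wait queues, with hypothetical names
model_wait()/model_wake(). Build with gcc -pthread.

/*
 * Not part of the patch: a userspace sketch of the wait/wake protocol
 * above.  One list under one mutex stands in for the hashed buckets of
 * wait queues; model_wait()/model_wake() are hypothetical names.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct sleeper {
	struct sleeper *next;
	unsigned int token;
	int woken;			/* wake-up already delivered */
	pthread_cond_t cond;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct sleeper *sleepers;

static struct sleeper **find(unsigned int token)
{
	struct sleeper **p;

	for (p = &sleepers; *p; p = &(*p)->next)
		if ((*p)->token == token)
			break;
	return p;			/* the match, or the NULL tail */
}

/* PAGE_NOT_PRESENT path: put the faulting "task" to sleep on its token. */
static void model_wait(unsigned int token)
{
	struct sleeper **p, *n;

	pthread_mutex_lock(&lock);
	p = find(token);
	if (*p) {			/* dummy entry: the wake-up arrived first */
		n = *p;
		*p = n->next;
		pthread_mutex_unlock(&lock);
		pthread_cond_destroy(&n->cond);
		free(n);
		return;
	}
	n = calloc(1, sizeof(*n));
	n->token = token;
	pthread_cond_init(&n->cond, NULL);
	n->next = sleepers;
	sleepers = n;
	while (!n->woken)		/* sleep until model_wake() unlinks us */
		pthread_cond_wait(&n->cond, &lock);
	pthread_mutex_unlock(&lock);
	pthread_cond_destroy(&n->cond);
	free(n);
}

/* PAGE_READY path: wake the sleeper, or leave a dummy entry if none yet. */
static void model_wake(unsigned int token)
{
	struct sleeper **p, *n;

	pthread_mutex_lock(&lock);
	p = find(token);
	if (!*p) {			/* fault not seen yet: record the wake-up */
		n = calloc(1, sizeof(*n));
		n->token = token;
		n->woken = 1;
		pthread_cond_init(&n->cond, NULL);
		n->next = sleepers;
		sleepers = n;
	} else {			/* unlink the sleeper and wake it */
		n = *p;
		*p = n->next;
		n->woken = 1;
		pthread_cond_signal(&n->cond);
	}
	pthread_mutex_unlock(&lock);
}

static void *waker(void *unused)
{
	usleep(10000);
	model_wake(42);			/* normal order: sleeper already waiting */
	model_wake(43);			/* reversed order: nobody waits yet */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);
	model_wait(42);			/* blocks until token 42 is woken */
	pthread_join(t, NULL);
	model_wait(43);			/* finds the dummy entry, returns at once */
	printf("both tokens handled\n");
	return 0;
}

Either ordering of the two calls for a given token resolves the same way;
that is also why apf_task_wake() above allocates its dummy node with
GFP_ATOMIC under the bucket spinlock and busy-waits on allocation failure
rather than dropping the wake-up.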