An asynchronous page fault notifies the vcpu that the page it is trying to
access has been swapped out by the host. In response, the guest puts the
task that caused the fault to sleep until the page is swapped in again.
When the missing page is brought back into memory, the guest is notified
and the task resumes execution.

Signed-off-by: Gleb Natapov <gleb@xxxxxxxxxx>
---
 arch/x86/include/asm/kvm_para.h |    3 +
 arch/x86/kernel/kvm.c           |  120 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 120 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 90708b7..61e2aa3 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -52,6 +52,9 @@ struct kvm_mmu_op_release_pt {
 
 #define KVM_PV_SHM_FEATURES_ASYNC_PF (1 << 0)
 
+#define KVM_PV_REASON_PAGE_NP 1
+#define KVM_PV_REASON_PAGE_READY 2
+
 struct kvm_vcpu_pv_shm {
 	__u64 features;
 	__u64 reason;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d03f33c..79d291f 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -30,6 +30,8 @@
 #include <linux/bootmem.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
 
 #include <asm/timer.h>
 #include <asm/cpu.h>
@@ -55,15 +57,121 @@ static void kvm_io_delay(void)
 {
 }
 
-static void kvm_end_context_switch(struct task_struct *next)
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1 << KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+	struct hlist_node link;
+	wait_queue_head_t wq;
+	u64 token;
+};
+
+static struct kvm_task_sleep_head {
+	spinlock_t lock;
+	struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+						  u64 token)
+{
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		struct kvm_task_sleep_node *n =
+			hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+
+static void apf_task_wait(struct task_struct *tsk, u64 token)
 {
+	u64 key = hash_64(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node n, *e;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&b->lock);
+	e = _find_apf_task(b, token);
+	if (e) {
+		/* dummy entry exists -> wake up was delivered ahead of PF */
+		hlist_del(&e->link);
+		kfree(e);
+		spin_unlock(&b->lock);
+		return;
+	}
+
+	n.token = token;
+	init_waitqueue_head(&n.wq);
+	hlist_add_head(&n.link, &b->list);
+	spin_unlock(&b->lock);
+
+	for (;;) {
+		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (hlist_unhashed(&n.link))
+			break;
+		schedule();
+	}
+	finish_wait(&n.wq, &wait);
+
+	return;
+}
+
+static void apf_task_wake(u64 token)
+{
+	u64 key = hash_64(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node *n;
+
+	spin_lock(&b->lock);
+	n = _find_apf_task(b, token);
+	if (!n) {
+		/* PF was not yet handled. Add dummy entry for the token. */
+		n = kmalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			printk(KERN_EMERG "async PF can't allocate memory\n");
+		} else {
+			n->token = token;
+			hlist_add_head(&n->link, &b->list);
+		}
+	} else {
+		hlist_del_init(&n->link);
+		if (waitqueue_active(&n->wq))
+			wake_up(&n->wq);
+	}
+	spin_unlock(&b->lock);
+	return;
+}
+
+int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
+{
+	u64 reason, token;
 	struct kvm_vcpu_pv_shm *pv_shm =
 		per_cpu(kvm_vcpu_pv_shm, smp_processor_id());
 
 	if (!pv_shm)
-		return;
+		return 0;
+
+	reason = pv_shm->reason;
+	pv_shm->reason = 0;
+
+	token = pv_shm->param;
+
+	switch (reason) {
+	default:
+		return 0;
+	case KVM_PV_REASON_PAGE_NP:
+		/* real page is missing. */
+		apf_task_wait(current, token);
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		apf_task_wake(token);
+		break;
+	}
 
-	pv_shm->current_task = (u64)next;
+	return 1;
 }
 
 static void kvm_mmu_op(void *buffer, unsigned len)
@@ -219,6 +327,9 @@ static void __init paravirt_ops_setup(void)
 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
 		pv_cpu_ops.io_delay = kvm_io_delay;
 
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+		pv_cpu_ops.handle_pf = kvm_handle_pf;
+
 	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
 		pv_mmu_ops.set_pte = kvm_set_pte;
 		pv_mmu_ops.set_pte_at = kvm_set_pte_at;
@@ -272,11 +383,14 @@ static struct notifier_block kvm_pv_reboot_nb = {
 
 void __init kvm_guest_init(void)
 {
+	int i;
 	if (!kvm_para_available())
 		return;
 
 	paravirt_ops_setup();
 	register_reboot_notifier(&kvm_pv_reboot_nb);
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+		spin_lock_init(&async_pf_sleepers[i].lock);
 }
 
 void __cpuinit kvm_guest_cpu_init(void)
-- 
1.6.3.3
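
P.S. For anyone who wants to poke at the wait/wake race handling outside a
guest, here is a small userspace analogue (not part of the patch, purely
illustrative): pthread mutexes and condition variables stand in for the
per-bucket spinlocks and waitqueues, detach()/apf_wait()/apf_wake() are
made-up names, and the multiplicative hash is a crude substitute for
hash_64(). It exercises the same dummy-entry trick as apf_task_wake(): if
the PAGE_READY wakeup beats the fault to the hash table, a marker is left
behind so the later wait returns immediately.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HASHBITS 8
#define HASHSIZE (1 << HASHBITS)

struct sleep_node {
	struct sleep_node *next;
	uint64_t token;
	int woken;		/* set under the bucket lock by apf_wake() */
	pthread_cond_t cv;
};

static struct bucket {
	pthread_mutex_t lock;
	struct sleep_node *head;
} sleepers[HASHSIZE];

/* crude stand-in for the kernel's hash_64() */
static struct bucket *hash_bucket(uint64_t token)
{
	return &sleepers[(token * 0x9e3779b97f4a7c15ULL) >> (64 - HASHBITS)];
}

/* find the node for token in bucket b and unlink it; NULL if absent */
static struct sleep_node *detach(struct bucket *b, uint64_t token)
{
	struct sleep_node **pp, *n;

	for (pp = &b->head; *pp; pp = &(*pp)->next)
		if ((*pp)->token == token) {
			n = *pp;
			*pp = n->next;
			return n;
		}
	return NULL;
}

/* "page not present": sleep until the matching wake arrives */
static void apf_wait(uint64_t token)
{
	struct bucket *b = hash_bucket(token);
	struct sleep_node n = { .token = token }, *e;

	pthread_mutex_lock(&b->lock);
	e = detach(b, token);
	if (e) {
		/* dummy entry found: the wake raced ahead of the fault */
		free(e);
		pthread_mutex_unlock(&b->lock);
		return;
	}
	pthread_cond_init(&n.cv, NULL);
	n.next = b->head;
	b->head = &n;		/* stack node is safe: we outlive the wake */
	while (!n.woken)
		pthread_cond_wait(&n.cv, &b->lock); /* drops lock while asleep */
	pthread_mutex_unlock(&b->lock);
}

/* "page ready": wake the sleeper, or leave a dummy marker for it */
static void apf_wake(uint64_t token)
{
	struct bucket *b = hash_bucket(token);
	struct sleep_node *n;

	pthread_mutex_lock(&b->lock);
	n = detach(b, token);
	if (!n) {
		/* fault not seen yet: leave a heap-allocated dummy entry
		   (allocation-failure handling omitted in this sketch) */
		n = calloc(1, sizeof(*n));
		n->token = token;
		n->next = b->head;
		b->head = n;
	} else {
		n->woken = 1;
		pthread_cond_signal(&n->cv);
	}
	pthread_mutex_unlock(&b->lock);
}

static void *waiter(void *arg)
{
	apf_wait((uint64_t)(uintptr_t)arg);
	return NULL;
}

int main(void)
{
	pthread_t t;
	int i;

	for (i = 0; i < HASHSIZE; i++)
		pthread_mutex_init(&sleepers[i].lock, NULL);

	/* completes no matter which side wins the race */
	pthread_create(&t, NULL, waiter, (void *)(uintptr_t)7);
	apf_wake(7);
	pthread_join(t, NULL);
	printf("waiter resumed\n");
	return 0;
}

Build with gcc -pthread. The pthread_join() completes whichever side wins
the race, which is the guarantee the per-bucket lock plus dummy entry is
meant to provide in the kernel version.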