This adds support for asynchronous page faults in the guest. The design
is similar to the x86 one: on receiving a PAGE_NOT_PRESENT signal from
the hypervisor, the current task is either rescheduled or put into a
power-saving state, and it is woken up when the corresponding PAGE_READY
signal is received. Both signals are conveyed through a data abort with
a specific (IMPDEF) Data Fault Status Code (DFSC). In addition, a hash
table is introduced to track the tasks that have been put into the
waiting state, so that a wakeup delivered ahead of the corresponding
fault is not lost. The feature is placed under the CONFIG_KVM_GUEST
umbrella option, which is added by this patch.

Signed-off-by: Gavin Shan <gshan@xxxxxxxxxx>
---
 arch/arm64/Kconfig                 |  11 ++
 arch/arm64/include/asm/exception.h |   5 +
 arch/arm64/include/asm/kvm_para.h  |  42 ++++-
 arch/arm64/kernel/smp.c            |  47 ++++++
 arch/arm64/mm/fault.c              | 239 ++++++++++++++++++++++++++++-
 5 files changed, 336 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 40fb05d96c60..2d5e5ee62d6d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1045,6 +1045,17 @@ config PARAVIRT
 	  under a hypervisor, potentially improving performance significantly
 	  over full virtualization.
 
+config KVM_GUEST
+	bool "KVM Guest Support"
+	depends on PARAVIRT
+	default y
+	help
+	  This option enables various optimizations for running under the KVM
+	  hypervisor. Overhead for the kernel when not running inside KVM
+	  should be minimal.
+
+	  In case of doubt, say Y.
+
 config PARAVIRT_TIME_ACCOUNTING
 	bool "Paravirtual steal time accounting"
 	select PARAVIRT
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..17ac2db36472 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -46,4 +46,9 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr);
 void do_cp15instr(unsigned int esr, struct pt_regs *regs);
 void do_el0_svc(struct pt_regs *regs);
 void do_el0_svc_compat(struct pt_regs *regs);
+
+#ifdef CONFIG_KVM_GUEST
+void kvm_pv_async_pf_enable(void);
+void kvm_pv_async_pf_disable(void);
+#endif /* CONFIG_KVM_GUEST */
 #endif /* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h
index 0ea481dd1c7a..a43bed479c2b 100644
--- a/arch/arm64/include/asm/kvm_para.h
+++ b/arch/arm64/include/asm/kvm_para.h
@@ -3,6 +3,30 @@
 #define _ASM_ARM_KVM_PARA_H
 
 #include <uapi/asm/kvm_para.h>
+#include <linux/of.h>
+
+#ifdef CONFIG_KVM_GUEST
+static inline int kvm_para_available(void)
+{
+	struct device_node *hyper_node;
+	int ret = 0;
+
+	hyper_node = of_find_node_by_path("/hypervisor");
+	if (!hyper_node)
+		return 0;
+
+	if (of_device_is_compatible(hyper_node, "linux,kvm"))
+		ret = 1;
+
+	of_node_put(hyper_node);
+	return ret;
+}
+#else
+static inline int kvm_para_available(void)
+{
+	return 0;
+}
+#endif /* CONFIG_KVM_GUEST */
 
 static inline bool kvm_check_and_clear_guest_paused(void)
 {
@@ -11,17 +35,21 @@ static inline bool kvm_check_and_clear_guest_paused(void)
 
 static inline unsigned int kvm_arch_para_features(void)
 {
-	return 0;
+	struct device_node *hyper_node;
+	unsigned int features = 0;
+
+	if (!kvm_para_available())
+		return 0;
+
+	hyper_node = of_find_node_by_path("/hypervisor");
+	of_property_read_u32(hyper_node, "para-features", &features);
+	of_node_put(hyper_node);
+
+	return features;
 }
 
 static inline unsigned int kvm_arch_para_hints(void)
 {
 	return 0;
 }
-
-static inline bool kvm_para_available(void)
-{
-	return false;
-}
-
 #endif /* _ASM_ARM_KVM_PARA_H */
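For context, here is a minimal sketch of how a guest-side caller might
consume these helpers. This is hypothetical code, not part of the patch:
kvm_para_has_feature() is the existing generic wrapper around
kvm_arch_para_features() from include/linux/kvm_para.h, and the
KVM_FEATURE_ASYNC_PF bit is assumed to be defined elsewhere in this
series.

#include <linux/kvm_para.h>

/*
 * Hypothetical probe: kvm_para_available() checks for a /hypervisor DT
 * node compatible with "linux,kvm"; kvm_para_has_feature() then tests
 * one bit of the "para-features" property read above.
 */
static bool __init have_kvm_async_pf(void)
{
	if (!kvm_para_available())
		return false;

	return kvm_para_has_feature(KVM_FEATURE_ASYNC_PF);
}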
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 061f60fe452f..cc97a8462d7f 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -40,6 +40,7 @@
 #include <asm/cputype.h>
 #include <asm/cpu_ops.h>
 #include <asm/daifflags.h>
+#include <asm/exception.h>
 #include <asm/kvm_mmu.h>
 #include <asm/mmu_context.h>
 #include <asm/numa.h>
@@ -443,6 +444,38 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	mark_linear_text_alias_ro();
 }
 
+#ifdef CONFIG_KVM_GUEST
+static void kvm_cpu_reboot(void *unused)
+{
+	kvm_pv_async_pf_disable();
+}
+
+static int kvm_cpu_reboot_notify(struct notifier_block *nb,
+				 unsigned long code, void *unused)
+{
+	if (code == SYS_RESTART)
+		on_each_cpu(kvm_cpu_reboot, NULL, 1);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_cpu_reboot_nb = {
+	.notifier_call = kvm_cpu_reboot_notify,
+};
+
+static int kvm_cpu_online(unsigned int cpu)
+{
+	kvm_pv_async_pf_enable();
+	return 0;
+}
+
+static int kvm_cpu_offline(unsigned int cpu)
+{
+	kvm_pv_async_pf_disable();
+	return 0;
+}
+#endif /* CONFIG_KVM_GUEST */
+
 void __init smp_prepare_boot_cpu(void)
 {
 	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
@@ -458,6 +491,20 @@ void __init smp_prepare_boot_cpu(void)
 	/* Conditionally switch to GIC PMR for interrupt masking */
 	if (system_uses_irq_prio_masking())
 		init_gic_priority_masking();
+
+
+	/* Enable async page fault */
+#ifdef CONFIG_KVM_GUEST
+	register_reboot_notifier(&kvm_cpu_reboot_nb);
+	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+	    "arm/kvm:online", kvm_cpu_online, kvm_cpu_offline) < 0) {
+		pr_warn("%s: Failed to install cpu hotplug callbacks\n",
+			__func__);
+		return;
+	}
+
+	kvm_pv_async_pf_enable();
+#endif /* CONFIG_KVM_GUEST */
 }
 
 static u64 __init of_get_cpu_mpidr(struct device_node *dn)
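The mm/fault.c changes below read a per-CPU structure that is shared
with the host and registered through SYS_ASYNC_PF_EL1. Its definition,
struct kvm_vcpu_pv_apf_data, is not part of this patch; the sketch below
shows the assumed layout, mirroring the x86 ABI, with the authoritative
definition expected elsewhere in this series (uapi/asm/kvm_para.h):

#include <linux/types.h>

/* Assumed layout, mirroring x86; not part of this patch. */
struct kvm_vcpu_pv_apf_data {
	__u32 reason;	/* written by the host before injecting the data
			   abort: KVM_PV_REASON_PAGE_NOT_PRESENT or
			   KVM_PV_REASON_PAGE_READY */
	__u8 pad[60];
	__u32 enabled;	/* set by the guest once registration succeeds */
};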
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 1027851d469a..39c7570fe303 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -19,10 +19,12 @@
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/swait.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>
+#include <linux/kvm_para.h>
 
 #include <asm/acpi.h>
 #include <asm/bug.h>
@@ -48,8 +50,31 @@ struct fault_info {
 	const char *name;
 };
 
+#ifdef CONFIG_KVM_GUEST
+#define KVM_TASK_SLEEP_HASHBITS	8
+#define KVM_TASK_SLEEP_HASHSIZE	(1 << KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+	struct hlist_node link;
+	struct swait_queue_head wq;
+	u32 token;
+	int cpu;
+	bool halted;
+};
+
+struct kvm_task_sleep_head {
+	raw_spinlock_t lock;
+	struct hlist_head list;
+};
+#endif /* CONFIG_KVM_GUEST */
+
 static const struct fault_info fault_info[];
 static struct fault_info debug_fault_info[];
+#ifdef CONFIG_KVM_GUEST
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64);
+static struct kvm_task_sleep_head async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+static bool async_pf_initialized;
+#endif
 
 static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
 {
@@ -623,6 +648,178 @@ static int do_alignment_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
+#ifdef CONFIG_KVM_GUEST
+static struct kvm_task_sleep_node *kvm_pv_async_pf_find(
+		struct kvm_task_sleep_head *b, u32 token)
+{
+	struct kvm_task_sleep_node *n;
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		n = hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+
+static void kvm_pv_async_pf_wait(u32 token,
+				 int interrupt_kernel)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node n, *e;
+	DECLARE_SWAITQUEUE(wait);
+
+	raw_spin_lock(&b->lock);
+	e = kvm_pv_async_pf_find(b, token);
+	if (e) {
+		/* dummy entry exists: the wakeup was delivered ahead of the PF */
+		hlist_del(&e->link);
+		kfree(e);
+		raw_spin_unlock(&b->lock);
+
+		return;
+	}
+
+	n.token = token;
+	n.cpu = smp_processor_id();
+	n.halted = is_idle_task(current) ||
+		   (IS_ENABLED(CONFIG_PREEMPT_COUNT) ?
+		    preempt_count() > 1 || rcu_preempt_depth() :
+		    interrupt_kernel);
+	init_swait_queue_head(&n.wq);
+	hlist_add_head(&n.link, &b->list);
+	raw_spin_unlock(&b->lock);
+
+	for (;;) {
+		if (!n.halted) {
+			prepare_to_swait_exclusive(&n.wq, &wait,
+						   TASK_UNINTERRUPTIBLE);
+		}
+
+		if (hlist_unhashed(&n.link))
+			break;
+
+		/*
+		 * Enable IRQs explicitly. Otherwise, the task
+		 * won't be scheduled or woken up properly.
+		 */
+		local_irq_enable();
+
+		if (!n.halted) {
+			schedule();
+		} else {
+			dsb(sy);
+			wfi();
+		}
+
+		local_irq_disable();
+	}
+
+	if (!n.halted)
+		finish_swait(&n.wq, &wait);
+}
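The wait path above and the wake paths below pick a sleeper bucket the
same way; expressed as a hypothetical standalone helper (not in the
patch), the lookup amounts to:

#include <linux/hash.h>

/*
 * Hypothetical helper: hash the 32-bit token into one of the 256
 * buckets, each protected by its own raw spinlock, so unrelated
 * tokens rarely contend.
 */
static struct kvm_task_sleep_head *apf_sleeper_bucket(u32 token)
{
	return &async_pf_sleepers[hash_32(token, KVM_TASK_SLEEP_HASHBITS)];
}

The dummy-entry handling in kvm_pv_async_pf_wake() below covers the case
where PAGE_READY arrives before the faulting task has queued itself: the
wake side leaves a node behind, and the later wait side finds it, frees
it, and returns without sleeping.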
+
+static inline void kvm_pv_async_pf_wake_one(struct kvm_task_sleep_node *n)
+{
+	/* The task will be woken up once it is detached */
+	hlist_del_init(&n->link);
+
+	if (!n->halted)
+		swake_up_one(&n->wq);
+	else
+		smp_send_reschedule(n->cpu);
+}
+
+static void kvm_pv_async_pf_wake_all(void)
+{
+	struct kvm_task_sleep_head *b;
+	struct kvm_task_sleep_node *n;
+	struct hlist_node *p, *next;
+	int i;
+
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+		b = &async_pf_sleepers[i];
+
+		raw_spin_lock(&b->lock);
+
+		hlist_for_each_safe(p, next, &b->list) {
+			n = hlist_entry(p, typeof(*n), link);
+			if (n->cpu != smp_processor_id())
+				continue;
+
+			kvm_pv_async_pf_wake_one(n);
+		}
+
+		raw_spin_unlock(&b->lock);
+	}
+}
+
+static void kvm_pv_async_pf_wake(u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node *n;
+
+	if (token == ~0) {
+		kvm_pv_async_pf_wake_all();
+		return;
+	}
+
+again:
+	raw_spin_lock(&b->lock);
+
+	n = kvm_pv_async_pf_find(b, token);
+	if (!n) {
+		/*
+		 * The async PF was not handled yet. Add a dummy entry
+		 * for the token. On allocation failure, busy-wait until
+		 * another CPU handles the async PF.
+		 */
+		n = kzalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			raw_spin_unlock(&b->lock);
+			cpu_relax();
+			goto again;
+		}
+		n->token = token;
+		n->cpu = smp_processor_id();
+		init_swait_queue_head(&n->wq);
+		hlist_add_head(&n->link, &b->list);
+	} else {
+		kvm_pv_async_pf_wake_one(n);
+	}
+
+	raw_spin_unlock(&b->lock);
+}
+#endif /* CONFIG_KVM_GUEST */
+
+static int do_lockdown(unsigned long addr, unsigned int esr,
+		       struct pt_regs *regs)
+{
+#ifdef CONFIG_KVM_GUEST
+	u32 reason = 0;
+
+	if (__this_cpu_read(apf_data.enabled)) {
+		reason = __this_cpu_read(apf_data.reason);
+		__this_cpu_write(apf_data.reason, 0);
+	}
+
+	switch (reason) {
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		kvm_pv_async_pf_wait((u32)addr, !user_mode(regs));
+		return 0;
+	case KVM_PV_REASON_PAGE_READY:
+		kvm_pv_async_pf_wake((u32)addr);
+		return 0;
+	}
+#endif /* CONFIG_KVM_GUEST */
+
+	pr_info("%s: addr=0x%lx, esr=0x%x\n", __func__, addr, esr);
+	return 1;
+}
+
 static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
@@ -703,7 +900,8 @@ static const struct fault_info fault_info[] = {
 	{ do_bad, SIGKILL, SI_KERNEL, "Unsupported atomic hardware update fault" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 50" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 51" },
-	{ do_bad, SIGKILL, SI_KERNEL, "implementation fault (lockdown abort)" },
+	{ do_lockdown, SIGKILL, SI_KERNEL,
+	  "implementation fault (lockdown abort)" },
 	{ do_bad, SIGBUS, BUS_OBJERR, "implementation fault (unsupported exclusive)" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 54" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 55" },
@@ -878,3 +1076,42 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
 	debug_exception_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug_exception);
+
+#ifdef CONFIG_KVM_GUEST
+void kvm_pv_async_pf_enable(void)
+{
+	u64 pa;
+	int i;
+
+	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) ||
+	    __this_cpu_read(apf_data.enabled))
+		return;
+
+	if (!async_pf_initialized) {
+		async_pf_initialized = true;
+		for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+			raw_spin_lock_init(&async_pf_sleepers[i].lock);
+	}
+
+	/* FIXME: Enable KVM_ASYNC_PF_SEND_ALWAYS on CONFIG_PREEMPTION */
+	pa = virt_to_phys(this_cpu_ptr(&apf_data));
+	pa |= KVM_ASYNC_PF_ENABLED;
+
+	__this_cpu_write(apf_data.enabled, 1);
+	write_sysreg_s(pa, SYS_ASYNC_PF_EL1);
+
+	pr_info("Async PF enabled on CPU %d\n", smp_processor_id());
+}
+
+void kvm_pv_async_pf_disable(void)
+{
+	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) ||
+	    !__this_cpu_read(apf_data.enabled))
+		return;
+
+	write_sysreg_s(0, SYS_ASYNC_PF_EL1);
+	__this_cpu_write(apf_data.enabled, 0);
+
+	pr_info("Async PF disabled on CPU %d\n", smp_processor_id());
+}
+#endif /* CONFIG_KVM_GUEST */
--
2.23.0
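For reference, the dispatch path that makes do_lockdown() fire: the
existing esr_to_fault_info() helper (unchanged by this patch, quoted
here from arch/arm64/mm/fault.c) masks the DFSC out of the ESR, and the
IMPDEF "lockdown abort" DFSC (0b110100, i.e. entry 52) selects the
fault_info[] slot that this patch points at do_lockdown():

/* Existing helper in arch/arm64/mm/fault.c; ESR_ELx_FSC is the 6-bit
 * fault status mask, so the DFSC directly indexes the table above. */
static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
{
	return fault_info + (esr & ESR_ELx_FSC);
}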