On Fri, May 08, 2020 at 01:29:19PM +1000, Gavin Shan wrote: > This supports asynchronous page fault for the guest. The design is > similar to what x86 has: on receiving a PAGE_NOT_PRESENT signal from > the host, the current task is either rescheduled or put into power > saving mode. The task will be waken up when PAGE_READY signal is > received. The PAGE_READY signal might be received in the context > of the suspended process, to be waken up. That means the suspended > process has to wake up itself, but it's not safe and prone to cause > dead-lock on CPU runqueue lock. So the wakeup is delayed on returning > from kernel space to user space or idle process is picked for running. > > The signals are conveyed through the async page fault control block, > which was passed to host on enabling the functionality. On each page > fault, the control block is checked and switch to the async page fault > handling flow if any signals exist. > > The feature is put into the CONFIG_KVM_GUEST umbrella, which is added > by this patch. So we have inline functions implemented in kvm_para.h, > like other architectures do, to check if async page fault (one of the > KVM para-virtualized features) is available. Also, the kernel boot > parameter "no-kvmapf" can be specified to disable the feature. > > Signed-off-by: Gavin Shan <gshan@xxxxxxxxxx> > --- > arch/arm64/Kconfig | 11 + > arch/arm64/include/asm/exception.h | 3 + > arch/arm64/include/asm/kvm_para.h | 27 +- > arch/arm64/kernel/entry.S | 33 +++ > arch/arm64/kernel/process.c | 4 + > arch/arm64/mm/fault.c | 434 +++++++++++++++++++++++++++++ > 6 files changed, 505 insertions(+), 7 deletions(-) > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 40fb05d96c60..2d5e5ee62d6d 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -1045,6 +1045,17 @@ config PARAVIRT > under a hypervisor, potentially improving performance significantly > over full virtualization. > > +config KVM_GUEST > + bool "KVM Guest Support" > + depends on PARAVIRT > + default y > + help > + This option enables various optimizations for running under the KVM > + hypervisor. Overhead for the kernel when not running inside KVM should > + be minimal. > + > + In case of doubt, say Y > + > config PARAVIRT_TIME_ACCOUNTING > bool "Paravirtual steal time accounting" > select PARAVIRT > diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h > index 7a6e81ca23a8..d878afa42746 100644 > --- a/arch/arm64/include/asm/exception.h > +++ b/arch/arm64/include/asm/exception.h > @@ -46,4 +46,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr); > void do_cp15instr(unsigned int esr, struct pt_regs *regs); > void do_el0_svc(struct pt_regs *regs); > void do_el0_svc_compat(struct pt_regs *regs); > +#ifdef CONFIG_KVM_GUEST > +void kvm_async_pf_delayed_wake(void); > +#endif > #endif /* __ASM_EXCEPTION_H */ > diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h > index 0ea481dd1c7a..b2f8ef243df7 100644 > --- a/arch/arm64/include/asm/kvm_para.h > +++ b/arch/arm64/include/asm/kvm_para.h > @@ -3,6 +3,20 @@ > #define _ASM_ARM_KVM_PARA_H > > #include <uapi/asm/kvm_para.h> > +#include <linux/of.h> > +#include <asm/hypervisor.h> > + > +#ifdef CONFIG_KVM_GUEST > +static inline int kvm_para_available(void) > +{ > + return 1; > +} > +#else > +static inline int kvm_para_available(void) > +{ > + return 0; > +} > +#endif /* CONFIG_KVM_GUEST */ Please make these bool, and return true/false, as was the case with the existing stub. > > static inline bool kvm_check_and_clear_guest_paused(void) > { > @@ -11,17 +25,16 @@ static inline bool kvm_check_and_clear_guest_paused(void) > > static inline unsigned int kvm_arch_para_features(void) > { > - return 0; > + unsigned int features = 0; > + > + if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_APF)) > + features |= (1 << KVM_FEATURE_ASYNC_PF); > + > + return features; > } > > static inline unsigned int kvm_arch_para_hints(void) > { > return 0; > } > - > -static inline bool kvm_para_available(void) > -{ > - return false; > -} > - > #endif /* _ASM_ARM_KVM_PARA_H */ > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S > index ddcde093c433..15efd57129ff 100644 > --- a/arch/arm64/kernel/entry.S > +++ b/arch/arm64/kernel/entry.S > @@ -751,12 +751,45 @@ finish_ret_to_user: > enable_step_tsk x1, x2 > #ifdef CONFIG_GCC_PLUGIN_STACKLEAK > bl stackleak_erase > +#endif > +#ifdef CONFIG_KVM_GUEST > + bl kvm_async_pf_delayed_wake > #endif Yuck. I am very much not keen on this living in the entry assembly. What precisely is this needed for? > kernel_exit 0 > ENDPROC(ret_to_user) > > .popsection // .entry.text > > +#ifdef CONFIG_KVM_GUEST > + .pushsection ".rodata", "a" > +SYM_DATA_START(__exception_handlers_offset) > + .quad 0 > + .quad 0 > + .quad 0 > + .quad 0 > + .quad el1_sync - vectors > + .quad el1_irq - vectors > + .quad 0 > + .quad el1_error - vectors > + .quad el0_sync - vectors > + .quad el0_irq - vectors > + .quad 0 > + .quad el0_error - vectors > +#ifdef CONFIG_COMPAT > + .quad el0_sync_compat - vectors > + .quad el0_irq_compat - vectors > + .quad 0 > + .quad el0_error_compat - vectors > +#else > + .quad 0 > + .quad 0 > + .quad 0 > + .quad 0 > +#endif > +SYM_DATA_END(__exception_handlers_offset) > + .popsection // .rodata > +#endif /* CONFIG_KVM_GUEST */ This looks scary, and needs an introduction in the commit message. > + > #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 > /* > * Exception vectors trampoline. > diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c > index 56be4cbf771f..5e7ee553566d 100644 > --- a/arch/arm64/kernel/process.c > +++ b/arch/arm64/kernel/process.c > @@ -53,6 +53,7 @@ > #include <asm/processor.h> > #include <asm/pointer_auth.h> > #include <asm/stacktrace.h> > +#include <asm/exception.h> > > #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) > #include <linux/stackprotector.h> > @@ -70,6 +71,9 @@ void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); > > static void __cpu_do_idle(void) > { > +#ifdef CONFIG_KVM_GUEST > + kvm_async_pf_delayed_wake(); > +#endif > dsb(sy); > wfi(); > } This is meant to be a very low-level helper, so I don't think this should live here. If nothing else, this needs to have no overhead when async page faults are not in use, so this probably needs an inline with a static key. > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c > index c9cedc0432d2..cbf8b52135c9 100644 > --- a/arch/arm64/mm/fault.c > +++ b/arch/arm64/mm/fault.c > @@ -19,10 +19,12 @@ > #include <linux/page-flags.h> > #include <linux/sched/signal.h> > #include <linux/sched/debug.h> > +#include <linux/swait.h> > #include <linux/highmem.h> > #include <linux/perf_event.h> > #include <linux/preempt.h> > #include <linux/hugetlb.h> > +#include <linux/kvm_para.h> > > #include <asm/acpi.h> > #include <asm/bug.h> > @@ -48,8 +50,31 @@ struct fault_info { > const char *name; > }; > > +#ifdef CONFIG_KVM_GUEST > +struct kvm_task_sleep_node { > + struct hlist_node link; > + struct swait_queue_head wq; > + u32 token; > + struct task_struct *task; > + int cpu; > + bool halted; > + bool delayed; > +}; > + > +struct kvm_task_sleep_head { > + raw_spinlock_t lock; > + struct hlist_head list; > +}; > +#endif /* CONFIG_KVM_GUEST */ > + > static const struct fault_info fault_info[]; > static struct fault_info debug_fault_info[]; > +#ifdef CONFIG_KVM_GUEST > +extern char __exception_handlers_offset[]; > +static bool async_pf_available = true; > +static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64); > +static DEFINE_PER_CPU(struct kvm_task_sleep_head, apf_head); > +#endif > > static inline const struct fault_info *esr_to_fault_info(unsigned int esr) > { > @@ -717,10 +742,281 @@ static const struct fault_info fault_info[] = { > { do_bad, SIGKILL, SI_KERNEL, "unknown 63" }, > }; > > +#ifdef CONFIG_KVM_GUEST > +static inline unsigned int kvm_async_pf_read_enabled(void) > +{ > + return __this_cpu_read(apf_data.enabled); > +} > + > +static inline void kvm_async_pf_write_enabled(unsigned int val) > +{ > + __this_cpu_write(apf_data.enabled, val); > +} > + > +static inline unsigned int kvm_async_pf_read_reason(void) > +{ > + return __this_cpu_read(apf_data.reason); > +} > + > +static inline void kvm_async_pf_write_reason(unsigned int val) > +{ > + __this_cpu_write(apf_data.reason, val); > +} > + > +#define kvm_async_pf_lock(b, flags) \ > + raw_spin_lock_irqsave(&(b)->lock, (flags)) > +#define kvm_async_pf_trylock(b, flags) \ > + raw_spin_trylock_irqsave(&(b)->lock, (flags)) > +#define kvm_async_pf_unlock(b, flags) \ > + raw_spin_unlock_irqrestore(&(b)->lock, (flags)) > +#define kvm_async_pf_unlock_and_clear(b, flags) \ > + do { \ > + raw_spin_unlock_irqrestore(&(b)->lock, (flags)); \ > + kvm_async_pf_write_reason(0); \ > + } while (0) > + > +static struct kvm_task_sleep_node *kvm_async_pf_find( > + struct kvm_task_sleep_head *b, u32 token) > +{ > + struct kvm_task_sleep_node *n; > + struct hlist_node *p; > + > + hlist_for_each(p, &b->list) { > + n = hlist_entry(p, typeof(*n), link); > + if (n->token == token) > + return n; > + } > + > + return NULL; > +} > + > +static void kvm_async_pf_wait(u32 token, int in_kernel) > +{ > + struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head); > + struct kvm_task_sleep_node n, *e; > + DECLARE_SWAITQUEUE(wait); > + unsigned long flags; > + > + kvm_async_pf_lock(b, flags); > + e = kvm_async_pf_find(b, token); > + if (e) { > + /* dummy entry exist -> wake up was delivered ahead of PF */ > + hlist_del(&e->link); > + kfree(e); > + kvm_async_pf_unlock_and_clear(b, flags); > + > + return; > + } > + > + n.token = token; > + n.task = current; > + n.cpu = smp_processor_id(); > + n.halted = is_idle_task(current) || > + (IS_ENABLED(CONFIG_PREEMPT_COUNT) ? > + preempt_count() > 1 || rcu_preempt_depth() : in_kernel); > + n.delayed = false; > + init_swait_queue_head(&n.wq); > + hlist_add_head(&n.link, &b->list); > + kvm_async_pf_unlock_and_clear(b, flags); > + > + for (;;) { > + if (!n.halted) { > + prepare_to_swait_exclusive(&n.wq, &wait, > + TASK_UNINTERRUPTIBLE); > + } > + > + if (hlist_unhashed(&n.link)) > + break; > + > + if (!n.halted) { > + schedule(); > + } else { > + dsb(sy); > + wfi(); > + } Please don't open-code idle sequences. I strongly suspect this won't work with pseudo-nmi, and regardless we don't want to duplicate this. > + } > + > + if (!n.halted) > + finish_swait(&n.wq, &wait); > +} > + > +/* > + * There are two cases the suspended processed can't be waken up > + * immediately: The waker is exactly the suspended process, or > + * the current CPU runqueue has been locked. Otherwise, we might > + * run into dead-lock. > + */ > +static inline void kvm_async_pf_wake_one(struct kvm_task_sleep_node *n) > +{ > + if (n->task == current || > + cpu_rq_is_locked(smp_processor_id())) { > + n->delayed = true; > + return; > + } > + > + hlist_del_init(&n->link); > + if (n->halted) > + smp_send_reschedule(n->cpu); > + else > + swake_up_one(&n->wq); > +} > + > +void kvm_async_pf_delayed_wake(void) > +{ > + struct kvm_task_sleep_head *b; > + struct kvm_task_sleep_node *n; > + struct hlist_node *p, *next; > + unsigned int reason; > + unsigned long flags; > + > + if (!kvm_async_pf_read_enabled()) > + return; > + > + /* > + * We're running in the edging context, we need to complete > + * the work as quick as possible. So we have a preliminary > + * check without holding the lock. > + */ What is 'the edging context'? > + b = this_cpu_ptr(&apf_head); > + if (hlist_empty(&b->list)) > + return; > + > + /* > + * Set the async page fault reason to something to avoid > + * receiving the signals, which might cause lock contention > + * and possibly dead-lock. As we're in guest context, it's > + * safe to set the reason here. > + * > + * There might be pending signals. For that case, we needn't > + * do anything. Otherwise, the pending signal will be lost. > + */ > + reason = kvm_async_pf_read_reason(); > + if (!reason) { > + kvm_async_pf_write_reason(KVM_PV_REASON_PAGE_NOT_PRESENT + > + KVM_PV_REASON_PAGE_READY); > + } Huh? Are we doing this to prevent the host from writing tho this area? > + > + if (!kvm_async_pf_trylock(b, flags)) > + goto done; > + > + hlist_for_each_safe(p, next, &b->list) { > + n = hlist_entry(p, typeof(*n), link); > + if (n->cpu != smp_processor_id()) > + continue; > + if (!n->delayed) > + continue; > + > + kvm_async_pf_wake_one(n); > + } > + > + kvm_async_pf_unlock(b, flags); > + > +done: > + if (!reason) > + kvm_async_pf_write_reason(0); > +} > +NOKPROBE_SYMBOL(kvm_async_pf_delayed_wake); > + > +static void kvm_async_pf_wake_all(void) > +{ > + struct kvm_task_sleep_head *b; > + struct kvm_task_sleep_node *n; > + struct hlist_node *p, *next; > + unsigned long flags; > + > + b = this_cpu_ptr(&apf_head); > + kvm_async_pf_lock(b, flags); > + > + hlist_for_each_safe(p, next, &b->list) { > + n = hlist_entry(p, typeof(*n), link); > + kvm_async_pf_wake_one(n); > + } > + > + kvm_async_pf_unlock(b, flags); > + > + kvm_async_pf_write_reason(0); > +} > + > +static void kvm_async_pf_wake(u32 token) > +{ > + struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head); > + struct kvm_task_sleep_node *n; > + unsigned long flags; > + > + if (token == ~0) { > + kvm_async_pf_wake_all(); > + return; > + } > + > +again: > + kvm_async_pf_lock(b, flags); > + > + n = kvm_async_pf_find(b, token); > + if (!n) { > + /* > + * Async PF was not yet handled. Add dummy entry > + * for the token. Busy wait until other CPU handles > + * the async PF on allocation failure. > + */ > + n = kzalloc(sizeof(*n), GFP_ATOMIC); > + if (!n) { > + kvm_async_pf_unlock(b, flags); > + cpu_relax(); > + goto again; > + } > + n->token = token; > + n->task = current; > + n->cpu = smp_processor_id(); > + n->halted = false; > + n->delayed = false; > + init_swait_queue_head(&n->wq); > + hlist_add_head(&n->link, &b->list); > + } else { > + kvm_async_pf_wake_one(n); > + } > + > + kvm_async_pf_unlock_and_clear(b, flags); > +} > + > +static bool do_async_pf(unsigned long addr, unsigned int esr, > + struct pt_regs *regs) > +{ > + u32 reason; > + > + if (!kvm_async_pf_read_enabled()) > + return false; > + > + reason = kvm_async_pf_read_reason(); > + if (!reason) > + return false; > + > + switch (reason) { > + case KVM_PV_REASON_PAGE_NOT_PRESENT: > + kvm_async_pf_wait((u32)addr, !user_mode(regs)); > + break; > + case KVM_PV_REASON_PAGE_READY: > + kvm_async_pf_wake((u32)addr); > + break; > + default: > + if (reason) { > + pr_warn("%s: Illegal reason %d\n", __func__, reason); > + kvm_async_pf_write_reason(0); > + } > + } > + > + return true; > +} > +#endif /* CONFIG_KVM_GUEST */ > + > void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) > { > const struct fault_info *inf = esr_to_fault_info(esr); > > +#ifdef CONFIG_KVM_GUEST > + if (do_async_pf(addr, esr, regs)) > + return; > +#endif > + > if (!inf->fn(addr, esr, regs)) > return; > > @@ -878,3 +1174,141 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, > debug_exception_exit(regs); > } > NOKPROBE_SYMBOL(do_debug_exception); > + > +#ifdef CONFIG_KVM_GUEST > +static int __init kvm_async_pf_available(char *arg) > +{ > + async_pf_available = false; > + return 0; > +} > +early_param("no-kvmapf", kvm_async_pf_available); > + > +static void kvm_async_pf_enable(bool enable) > +{ > + struct arm_smccc_res res; > + unsigned long *offsets = (unsigned long *)__exception_handlers_offset; > + u32 enabled = kvm_async_pf_read_enabled(); > + u64 val; > + int i; > + > + if (enable == enabled) > + return; > + > + if (enable) { > + /* > + * Asychonous page faults will be prohibited when CPU runs > + * instructions between the vector base and the maximal > + * offset, plus 4096. The 4096 is the assumped maximal > + * length for individual handler. The hardware registers > + * should be saved to stack at the beginning of the handlers, > + * so 4096 shuld be safe enough. > + */ > + val = 0; > + for (i = 0; i < 16; i++) { > + if (offsets[i] > val) > + val = offsets[i]; > + } > + > + val += 4096; NAK. This assumption is not true, and regardless we should not make any assumptions of this sort; we should derive this from the code explicitly. Guessing is not ok. Given that non-reentrant exception handling code is scattered across at least: * kernel/debug-monitors.c * kernel/entry.S * kernel/entry-common.S * kernel/traps.c * mm/fault.c ... we *cannot* assume that fault handling code is virtually contiguous, and certainly cannot assume where this falls w.r.t. the architectural vectors that VBAR_ELx points to. How does x86 handle this? Thanks, Mark. _______________________________________________ kvmarm mailing list kvmarm@xxxxxxxxxxxxxxxxxxxxx https://lists.cs.columbia.edu/mailman/listinfo/kvmarm