Hi Gavin,

On 8/15/21 2:59 AM, Gavin Shan wrote:
> The requested page might be not resident in memory during the stage-2
> page fault. For example, the requested page could be resident in swap
> device (file). In this case, disk I/O is issued in order to fetch the
> requested page and it could take tens of milliseconds, even hundreds
> of milliseconds in extreme situation. During the period, the guest's
> vCPU is suspended until the requested page becomes ready. Actually,
> the something else on the guest's vCPU could be rescheduled during
s/the//
> the period, so that the time slice isn't wasted as the guest's vCPU
> can see. This is the primary goal of the feature (Asynchronous Page
> Fault).
>
> This supports delivery of page-not-present notification through SDEI
> event when the requested page isn't present. When the notification is
> received on the guest's vCPU, something else (another process) can be
> scheduled. The design is highlighted as below:
>
>    * There is dedicated memory region shared by host and guest. It's
>      represented by "struct kvm_vcpu_pv_apf_data". The field @reason
>      indicates the reason why the SDEI event is triggered, while the
>      unique @token is used by guest to associate the event with the
>      suspended process.
>
>    * One control block is associated with each guest's vCPU and it's
>      represented by "struct kvm_arch_async_pf_control". It allows the
>      guest to configure the functionality to indicate the situations
>      where the host can deliver the page-not-present notification to
>      kick off asyncrhonous page fault. Besides, runtime states are
asynchronous
>      also maintained in this struct.
>
>    * Before the page-not-present notification is sent to the guest's
>      vCPU, a worker is started and executed asynchronously on host,
>      to fetch the requested page. "struct kvm{_,_arch}async_pf" is
>      associated with the worker, to track the work.
>
> The feature isn't enabled by CONFIG_KVM_ASYNC_PF yet. Also, the
> page-ready notification delivery and control path isn't implemented
> and will be done in the subsequent patches.
>
> Signed-off-by: Gavin Shan <gshan@xxxxxxxxxx>
> ---
>   arch/arm64/include/asm/kvm_host.h      |  52 +++++++++
>   arch/arm64/include/uapi/asm/kvm_para.h |  15 +++
>   arch/arm64/kvm/Makefile                |   1 +
>   arch/arm64/kvm/arm.c                   |   3 +
>   arch/arm64/kvm/async_pf.c              | 145 +++++++++++++++++++++++++
>   arch/arm64/kvm/mmu.c                   |  33 +++++-
>   6 files changed, 247 insertions(+), 2 deletions(-)
>   create mode 100644 arch/arm64/kvm/async_pf.c
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 581825b9df77..6b98aef936b4 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -283,6 +283,31 @@ struct vcpu_reset_state {
>           bool reset;
>   };
>
> +/* Should be a power of two number */
> +#define ASYNC_PF_PER_VCPU 64
> +
> +/*
> + * The association of gfn and token. The token will be sent to guest as
> + * page fault address. Also, the guest could be in aarch32 mode. So its
s/as page fault address/together with page fault address?
> + * length should be 32-bits.
> + */
> +struct kvm_arch_async_pf {
> +        u32 token;
> +        gfn_t gfn;
> +        u32 esr;
> +};
> +
> +struct kvm_arch_async_pf_control {
> +        struct gfn_to_hva_cache cache;
> +        u64 control_block;
> +        bool send_user_only;
> +        u64 sdei_event_num;
> +
nit: spurious empty line
> +        u16 id;
> +        bool notpresent_pending;
> +        u32 notpresent_token;
> +};
> +
>   struct kvm_vcpu_arch {
>           struct kvm_cpu_context ctxt;
>           void *sve_state;
> @@ -346,6 +371,9 @@ struct kvm_vcpu_arch {
>           /* SDEI support */
>           struct kvm_sdei_vcpu *sdei;
>
> +        /* Asynchronous page fault support */
> +        struct kvm_arch_async_pf_control *apf;
> +
>           /*
>            * Guest registers we preserve during guest debugging.
>            *
> @@ -741,6 +769,30 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
>   long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
>                   struct kvm_arm_copy_mte_tags *copy_tags);
>
> +#ifdef CONFIG_KVM_ASYNC_PF
> +void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu);
> +bool kvm_arch_async_not_present_allowed(struct kvm_vcpu *vcpu);
> +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
> +                u32 esr, gpa_t gpa, gfn_t gfn);
> +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
> +                struct kvm_async_pf *work);
> +void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu);
> +#else
> +static inline void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu) { }
> +static inline void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu) { }
> +
> +static inline bool kvm_arch_async_not_present_allowed(struct kvm_vcpu *vcpu)
> +{
> +        return false;
> +}
> +
> +static inline bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
> +                u32 esr, gpa_t gpa, gfn_t gfn)
> +{
> +        return false;
> +}
> +#endif
> +
>   /* Guest/host FPSIMD coordination helpers */
>   int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
>   void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
> diff --git a/arch/arm64/include/uapi/asm/kvm_para.h b/arch/arm64/include/uapi/asm/kvm_para.h
> index cd212282b90c..3fa04006714e 100644
> --- a/arch/arm64/include/uapi/asm/kvm_para.h
> +++ b/arch/arm64/include/uapi/asm/kvm_para.h
> @@ -2,4 +2,19 @@
>   #ifndef _UAPI_ASM_ARM_KVM_PARA_H
>   #define _UAPI_ASM_ARM_KVM_PARA_H
>
> +#include <linux/types.h>
> +
> +/* Async PF */
> +#define KVM_ASYNC_PF_ENABLED (1 << 0)
> +#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)

The above define is not used in this patch. Besides, can you explain what
it aims at?

> +
> +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
> +
> +struct kvm_vcpu_pv_apf_data {
> +        __u32 reason;

On x86 it was renamed into flags. Should we do the same right now?
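For reference, this is roughly what the x86 variant looks like after that
rename (quoting from memory, so please double-check against the current
arch/x86/include/uapi/asm/kvm_para.h):

struct kvm_vcpu_pv_apf_data {
        /* Used for 'page not present' events */
        __u32 flags;

        /* Used for 'page ready' events delivered via interrupt */
        __u32 token;

        __u8 pad[56];
        __u32 enabled;
};

Starting with "flags" here would avoid another uapi rename later on.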
> +        __u32 token;
> +        __u8 pad[56];
> +        __u32 enabled;
> +};
> +
>   #endif /* _UAPI_ASM_ARM_KVM_PARA_H */
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index eefca8ca394d..c9aa307ea542 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -25,3 +25,4 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
>            vgic/vgic-its.o vgic/vgic-debug.o
>
>   kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o
> +kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o async_pf.o
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 7d9bbc888ae5..af251896b41d 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -342,6 +342,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>
>           kvm_sdei_create_vcpu(vcpu);
>
> +        kvm_arch_async_pf_create_vcpu(vcpu);
> +
>           vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
>
>           err = kvm_vgic_vcpu_init(vcpu);
> @@ -363,6 +365,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>           kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
>           kvm_timer_vcpu_terminate(vcpu);
>           kvm_pmu_vcpu_destroy(vcpu);
> +        kvm_arch_async_pf_destroy_vcpu(vcpu);
>           kvm_sdei_destroy_vcpu(vcpu);
>
>           kvm_arm_vcpu_destroy(vcpu);
> diff --git a/arch/arm64/kvm/async_pf.c b/arch/arm64/kvm/async_pf.c
> new file mode 100644
> index 000000000000..742bb8a0a8c0
> --- /dev/null
> +++ b/arch/arm64/kvm/async_pf.c
> @@ -0,0 +1,145 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Asynchronous page fault support.
> + *
> + * Copyright (C) 2021 Red Hat, Inc.
> + *
> + * Author(s): Gavin Shan <gshan@xxxxxxxxxx>
> + */
> +
> +#include <linux/arm-smccc.h>
> +#include <linux/kvm_host.h>
> +#include <asm/kvm_emulate.h>
> +#include <kvm/arm_hypercalls.h>
> +#include <kvm/arm_vgic.h>
> +#include <asm/kvm_sdei.h>
> +
> +static inline int read_cache(struct kvm_vcpu *vcpu, u32 offset, u32 *val)
> +{
> +        struct kvm *kvm = vcpu->kvm;
> +        struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;
> +
> +        return kvm_read_guest_offset_cached(kvm, &apf->cache,
> +                        val, offset, sizeof(*val));
> +}
> +
> +static inline int write_cache(struct kvm_vcpu *vcpu, u32 offset, u32 val)
> +{
> +        struct kvm *kvm = vcpu->kvm;
> +        struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;
> +
> +        return kvm_write_guest_offset_cached(kvm, &apf->cache,
> +                        &val, offset, sizeof(val));
> +}
> +
> +void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu)
> +{
> +        vcpu->arch.apf = kzalloc(sizeof(*(vcpu->arch.apf)), GFP_KERNEL);

Shouldn't we escalate the alloc failure and fail the vcpu creation instead
of checking everywhere that apf is !null, which is error prone? (rough
sketch below) By the way I saw that on x86 this is a struct included in the
vcpu one instead of a pointer.

> +}
> +
> +bool kvm_arch_async_not_present_allowed(struct kvm_vcpu *vcpu)
> +{
> +        struct kvm *kvm = vcpu->kvm;
> +        struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;
> +        struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
> +        u32 reason, token;
> +        int ret;
> +
> +        if (!apf || !(apf->control_block & KVM_ASYNC_PF_ENABLED))
> +                return false;
> +
> +        if (apf->send_user_only && vcpu_mode_priv(vcpu))
> +                return false;
> +
> +        if (!irqchip_in_kernel(vcpu->kvm))
> +                return false;

Can you explain why this is needed?

> +
> +        if (!vsdei || vsdei->critical_event || vsdei->normal_event)
> +                return false;

Don't you need some locking mechanism to avoid the vsdei fields changing
after that check? At the moment we may have a single SDEI num but nothing
prevents adding others in the future, right?
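Coming back to the allocation question above: I was thinking of something
along these lines (untested sketch only; kvm_arch_vcpu_create() would also
need to unwind whatever was set up before the failure):

int kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu)
{
        /* Fail vcpu creation instead of leaving apf as NULL behind */
        vcpu->arch.apf = kzalloc(sizeof(*vcpu->arch.apf), GFP_KERNEL);

        return vcpu->arch.apf ? 0 : -ENOMEM;
}

and in kvm_arch_vcpu_create():

        err = kvm_arch_async_pf_create_vcpu(vcpu);
        if (err)
                return err;

That would let you drop the "!apf" checks on the hot paths entirely.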
> +
> +        /* Pending page fault, which isn't acknowledged by guest */
> +        ret = read_cache(vcpu, offsetof(struct kvm_vcpu_pv_apf_data, reason),
> +                        &reason);
> +        if (ret) {
> +                kvm_err("%s: Error %d to read reason (%d-%d)\n",
> +                        __func__, ret, kvm->userspace_pid, vcpu->vcpu_idx);
> +                return false;
> +        }

x86 code does not have those kvm_err(). You may simply drop them.

> +
> +        ret = read_cache(vcpu, offsetof(struct kvm_vcpu_pv_apf_data, token),
> +                        &token);
> +        if (ret) {
> +                kvm_err("%s: Error %d to read token %d-%d\n",
> +                        __func__, ret, kvm->userspace_pid, vcpu->vcpu_idx);
> +                return false;
> +        }
> +
> +        if (reason || token)

Can't the token be null?

> +                return false;
> +
> +        return true;
> +}
> +
> +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
> +                u32 esr, gpa_t gpa, gfn_t gfn)
> +{
> +        struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;
> +        struct kvm_arch_async_pf arch;
> +        unsigned long hva = kvm_vcpu_gfn_to_hva(vcpu, gfn);
> +
> +        arch.token = (apf->id++ << 12) | vcpu->vcpu_id;
> +        arch.gfn = gfn;
> +        arch.esr = esr;
> +
> +        return kvm_setup_async_pf(vcpu, gpa, hva, &arch);
> +}
> +
> +/*
> + * It's guaranteed that no pending asynchronous page fault when this is
that no APF is pending
> + * called. It means all previous issued asynchronous page faults have
> + * been acknowledged.
> + */
> +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
> +                struct kvm_async_pf *work)
> +{
> +        struct kvm *kvm = vcpu->kvm;
> +        struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;
> +        int ret;
> +
> +        kvm_async_pf_add_slot(vcpu, work->arch.gfn);
> +
> +        ret = write_cache(vcpu, offsetof(struct kvm_vcpu_pv_apf_data, token),
> +                        work->arch.token);
> +        if (ret) {
> +                kvm_err("%s: Error %d to write token (%d-%d %08x)\n",

The kvm_err()s may be dropped.

> +                        __func__, ret, kvm->userspace_pid,
> +                        vcpu->vcpu_idx, work->arch.token);
> +                goto fail;
> +        }
> +
> +        ret = write_cache(vcpu, offsetof(struct kvm_vcpu_pv_apf_data, reason),
> +                        KVM_PV_REASON_PAGE_NOT_PRESENT);
> +        if (ret) {
> +                kvm_err("%s: Error %d to write reason (%d-%d %08x)\n",
> +                        __func__, ret, kvm->userspace_pid,
> +                        vcpu->vcpu_idx, work->arch.token);
> +                goto fail;
> +        }
> +
> +        apf->notpresent_pending = true;
> +        apf->notpresent_token = work->arch.token;
> +
> +        return !kvm_sdei_inject(vcpu, apf->sdei_event_num, true);
> +
> +fail:
> +        write_cache(vcpu, offsetof(struct kvm_vcpu_pv_apf_data, token), 0);
> +        write_cache(vcpu, offsetof(struct kvm_vcpu_pv_apf_data, reason), 0);
> +        kvm_async_pf_remove_slot(vcpu, work->arch.gfn);
> +        return false;
> +}
> +
> +void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu)
> +{
> +        kfree(vcpu->arch.apf);
> +        vcpu->arch.apf = NULL;
> +}
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index e4038c5e931d..4ba78bd1f18c 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -914,6 +914,33 @@ static inline bool is_write_fault(unsigned int esr)
>           return esr_dabt_is_wnr(esr);
>   }
>
> +static bool try_async_pf(struct kvm_vcpu *vcpu, unsigned int esr,
> +                gpa_t gpa, gfn_t gfn, kvm_pfn_t *pfn,
> +                bool write, bool *writable, bool prefault)
> +{
> +        struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;
> +        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> +        bool async = false;
> +
> +        if (apf) {

Checking apf each time for a potential lack of resources at vcpu creation
looks heavy to me.

> +                /* Bail if *pfn has correct page */

s/bail/bail out? Also, the comment is rather related to the !async check.
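Back to the "if (apf)" check above: if the control block were embedded in
the vcpu structure as x86 does, the NULL check would go away and the gate
here could simply be the enable bit. A sketch, reusing the field names from
this patch (illustration only):

        /* in struct kvm_vcpu_arch */
        struct kvm_arch_async_pf_control apf;

        /* in try_async_pf() */
        if (vcpu->arch.apf.control_block & KVM_ASYNC_PF_ENABLED) {
                ...
        }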
> +                *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
> +                                write, writable, NULL);
> +                if (!async)
> +                        return false;
> +
> +                if (!prefault && kvm_arch_async_not_present_allowed(vcpu)) {

The x86 kvm_can_do_async_pf() naming looks more straightforward than
kvm_arch_async_not_present_allowed().

> +                        if (kvm_async_pf_find_slot(vcpu, gfn) ||

x86 has some trace points. You may want to add some, maybe later on.

> +                            kvm_arch_setup_async_pf(vcpu, esr, gpa, gfn))
> +                                return true;
> +                }
> +        }
> +
> +        *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
> +                        write, writable, NULL);
> +        return false;
> +}
> +
>   int kvm_handle_user_mem_abort(struct kvm_vcpu *vcpu,
>                   struct kvm_memory_slot *memslot,
>                   phys_addr_t fault_ipa,
> @@ -1035,8 +1062,10 @@ int kvm_handle_user_mem_abort(struct kvm_vcpu *vcpu,
>            */
>           smp_rmb();
>
> -        pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
> -                        write_fault, &writable, NULL);
> +        if (try_async_pf(vcpu, esr, fault_ipa, gfn, &pfn,
> +                        write_fault, &writable, prefault))
> +                return 1;
> +
>           if (pfn == KVM_PFN_ERR_HWPOISON) {
>                   kvm_send_hwpoison_signal(hva, vma_shift);
>                   return 0;
>

Eric