On 4/19/2024 4:59 PM, Paolo Bonzini wrote:
From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the
memory range and calls the arch-specific function. Add stub arch function
as a weak symbol.
Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@xxxxxxxxx>
Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
include/linux/kvm_host.h | 5 ++++
include/uapi/linux/kvm.h | 10 +++++++
virt/kvm/Kconfig | 3 ++
virt/kvm/kvm_main.c | 63 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 81 insertions(+)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8dea11701ab2..9e9943e5e37c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
#endif
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range);
+#endif
+
#endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2190adbe3002..917d2964947d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -917,6 +917,7 @@ struct kvm_enable_cap {
#define KVM_CAP_MEMORY_ATTRIBUTES 233
#define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235
+#define KVM_CAP_PRE_FAULT_MEMORY 236
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
__u64 reserved[6];
};
+#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
+
+struct kvm_pre_fault_memory {
+ __u64 gpa;
+ __u64 size;
+ __u64 flags;
+ __u64 padding[5];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 754c6c923427..b14e14cdbfb9 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS
config KVM_GENERIC_DIRTYLOG_READ_PROTECT
bool
+config KVM_GENERIC_PRE_FAULT_MEMORY
+ bool
+
config KVM_COMPAT
def_bool y
depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 38b498669ef9..51d8dbe7e93b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4379,6 +4379,55 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
return fd;
}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ int idx;
+ long r;
+ u64 full_size;
+
+ if (range->flags)
+ return -EINVAL;
+
+ if (!PAGE_ALIGNED(range->gpa) ||
+ !PAGE_ALIGNED(range->size) ||
+ range->gpa + range->size <= range->gpa)
+ return -EINVAL;
+
+ if (!range->size)
+ return 0;
range->size equals 0 can be covered by "range->gpa + range->size <=
range->gpa"
If we want to return success when size is 0 (, though I am not sure it's
needed),
we need to use "range->gpa + range->size < range->gpa" instead.
+
+ vcpu_load(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ full_size = range->size;
+ do {
+ if (signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
+ if (r < 0)
+ break;
+
+ if (WARN_ON_ONCE(r == 0))
+ break;
+
+ range->size -= r;
+ range->gpa += r;
+ cond_resched();
+ } while (range->size);
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ vcpu_put(vcpu);
+
+ /* Return success if at least one page was mapped successfully. */
+ return full_size == range->size ? r : 0;
+}
+#endif
+
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4580,6 +4629,20 @@ static long kvm_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
break;
}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+ case KVM_PRE_FAULT_MEMORY: {
+ struct kvm_pre_fault_memory range;
+
+ r = -EFAULT;
+ if (copy_from_user(&range, argp, sizeof(range)))
+ break;
+ r = kvm_vcpu_pre_fault_memory(vcpu, &range);
+ /* Pass back leftover range. */
+ if (copy_to_user(argp, &range, sizeof(range)))
+ r = -EFAULT;
+ break;
+ }
+#endif
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}