On Thu, Sep 19, 2024 at 09:44:36AM +0000, Shivank Garg wrote:
>From: Shivansh Dhiman <shivansh.dhiman@xxxxxxx>
>
>Extend the API of creating guest-memfd to introduce proper NUMA support,
>allowing VMM to set memory policies effectively. The memory policy defines
>from which node memory is allocated.
>
>The current implementation of KVM guest-memfd does not honor the settings
>provided by VMM. While mbind() can be used for NUMA policy support in
>userspace applications, it is not functional for guest-memfd as the memory
>is not mapped to userspace.
>
>Currently, SEV-SNP guest use guest-memfd as a memory backend and would
>benefit from NUMA support. It enables fine-grained control over memory
>allocation, optimizing performance for specific workload requirements.
>
>To apply memory policy on a guest-memfd, extend the KVM_CREATE_GUEST_MEMFD
>IOCTL with additional fields related to mempolicy.
>- mpol_mode represents the policy mode (default, bind, interleave, or
>  preferred).
>- host_nodes_addr denotes the userspace address of the nodemask, a bit
>  mask of nodes containing up to maxnode bits.
>- First bit of flags must be set to use mempolicy.

Do you need a way for userspace to enumerate the supported flags?

The direction was to implement an fbind() syscall [1]. I am not sure if
that has changed. What are the benefits of this proposal compared to the
fbind() syscall?

I believe one limitation of this proposal is that the policy must be set
when the guest-memfd is created, i.e. the policy cannot be changed at
runtime. Is that a practical problem?

[1]: https://lore.kernel.org/kvm/ZOjpIL0SFH+E3Dj4@xxxxxxxxxx/
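As a concrete reading of the proposed uAPI, here is a rough, untested
sketch of how a VMM might drive the extension (the struct fields come from
the patch below; GUEST_MEMFD_NUMA_ENABLE is my own name for bit 0 of flags
per the commit message, and the patched <linux/kvm.h> is assumed):

  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <numaif.h>           /* MPOL_BIND */
  #include <linux/kvm.h>        /* patched header with the new fields assumed */

  /* Bit 0 of flags enables the mempolicy fields (name made up here). */
  #define GUEST_MEMFD_NUMA_ENABLE       (1ULL << 0)

  /* Create a guest-memfd whose backing memory is bound to host node 1. */
  static int create_gmem_on_node1(int vm_fd, uint64_t size)
  {
          unsigned long nodemask = 1UL << 1;      /* node 1 only */
          struct kvm_create_guest_memfd args;

          memset(&args, 0, sizeof(args));
          args.size            = size;
          args.flags           = GUEST_MEMFD_NUMA_ENABLE;
          args.mpol_mode       = MPOL_BIND;
          args.host_nodes_addr = (uintptr_t)&nodemask;
          args.maxnode         = 8 * sizeof(nodemask);  /* bits in the mask */

          /* Returns the new guest-memfd on success, -1 on error. */
          return ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
  }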
>
>Store the mempolicy struct in i_private_data of the memfd's inode, which
>is currently unused in the context of guest-memfd.
>
>Signed-off-by: Shivansh Dhiman <shivansh.dhiman@xxxxxxx>
>Signed-off-by: Shivank Garg <shivankg@xxxxxxx>
>---
> Documentation/virt/kvm/api.rst | 13 ++++++++-
> include/linux/mempolicy.h      |  4 +++
> include/uapi/linux/kvm.h       |  5 +++-
> mm/mempolicy.c                 | 52 ++++++++++++++++++++++++++++++++++
> tools/include/uapi/linux/kvm.h |  5 +++-
> virt/kvm/guest_memfd.c         | 21 ++++++++++++--
> virt/kvm/kvm_mm.h              |  3 ++
> 7 files changed, 97 insertions(+), 6 deletions(-)
>
>diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>index b3be87489108..dcb61282c773 100644
>--- a/Documentation/virt/kvm/api.rst
>+++ b/Documentation/virt/kvm/api.rst
>@@ -6346,7 +6346,10 @@ and cannot be resized (guest_memfd files do however support PUNCH_HOLE).
>   struct kvm_create_guest_memfd {
> 	__u64 size;
> 	__u64 flags;
>-	__u64 reserved[6];
>+	__u64 host_nodes_addr;
>+	__u16 maxnode;
>+	__u8 mpol_mode;
>+	__u8 reserved[37];
>   };
> 
> Conceptually, the inode backing a guest_memfd file represents physical memory,
>@@ -6367,6 +6370,14 @@ a single guest_memfd file, but the bound ranges must not overlap).
> 
> See KVM_SET_USER_MEMORY_REGION2 for additional details.
> 
>+NUMA memory policy support for KVM guest_memfd allows the host to specify
>+memory allocation behavior for guest NUMA nodes, similar to mbind(). If
>+KVM_GUEST_MEMFD_NUMA_ENABLE flag is set, memory allocations from the guest
>+will use the specified policy and host-nodes for physical memory.
>+- mpol_mode refers to the policy mode: default, preferred, bind, interleave, or
>+  preferred.
>+- host_nodes_addr points to bitmask of nodes containing up to maxnode bits.
>+
> 4.143 KVM_PRE_FAULT_MEMORY
> ---------------------------
> 
>diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
>index 1add16f21612..468eeda2ec2f 100644
>--- a/include/linux/mempolicy.h
>+++ b/include/linux/mempolicy.h
>@@ -299,4 +299,8 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
> }
> 
> #endif /* CONFIG_NUMA */
>+
>+struct mempolicy *create_mpol_from_args(unsigned char mode,
>+			const unsigned long __user *nmask,
>+			unsigned short maxnode);
> #endif
>diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>index 637efc055145..fda6cbef0a1d 100644
>--- a/include/uapi/linux/kvm.h
>+++ b/include/uapi/linux/kvm.h
>@@ -1561,7 +1561,10 @@ struct kvm_memory_attributes {
> struct kvm_create_guest_memfd {
> 	__u64 size;
> 	__u64 flags;
>-	__u64 reserved[6];
>+	__u64 host_nodes_addr;
>+	__u16 maxnode;
>+	__u8 mpol_mode;
>+	__u8 reserved[37];
> };
> 
> #define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
>diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>index b858e22b259d..9e9450433fcc 100644
>--- a/mm/mempolicy.c
>+++ b/mm/mempolicy.c
>@@ -3557,3 +3557,55 @@ static int __init mempolicy_sysfs_init(void)
> 
> late_initcall(mempolicy_sysfs_init);
> #endif /* CONFIG_SYSFS */
>+
>+#ifdef CONFIG_KVM_PRIVATE_MEM
>+/**
>+ * create_mpol_from_args - create a mempolicy structure from args
>+ * @mode: NUMA memory policy mode
>+ * @nmask: bitmask of NUMA nodes
>+ * @maxnode: number of bits in the nodes bitmask
>+ *
>+ * Create a mempolicy from given nodemask and memory policy such as
>+ * default, preferred, interleave or bind.
>+ *
>+ * Return: error encoded in a pointer or memory policy on success.
>+ */
>+struct mempolicy *create_mpol_from_args(unsigned char mode,
>+			const unsigned long __user *nmask,
>+			unsigned short maxnode)
>+{
>+	struct mm_struct *mm = current->mm;
>+	unsigned short mode_flags;
>+	struct mempolicy *mpol;
>+	nodemask_t nodes;
>+	int lmode = mode;
>+	int err = -ENOMEM;
>+
>+	err = sanitize_mpol_flags(&lmode, &mode_flags);
>+	if (err)
>+		return ERR_PTR(err);
>+
>+	err = get_nodes(&nodes, nmask, maxnode);
>+	if (err)
>+		return ERR_PTR(err);
>+
>+	mpol = mpol_new(mode, mode_flags, &nodes);
>+	if (IS_ERR_OR_NULL(mpol))
>+		return mpol;
>+
>+	NODEMASK_SCRATCH(scratch);
>+	if (!scratch)
>+		return ERR_PTR(-ENOMEM);
>+
>+	mmap_write_lock(mm);
>+	err = mpol_set_nodemask(mpol, &nodes, scratch);
>+	mmap_write_unlock(mm);
>+	NODEMASK_SCRATCH_FREE(scratch);
>+
>+	if (err)
>+		return ERR_PTR(err);
>+
>+	return mpol;
>+}
>+EXPORT_SYMBOL(create_mpol_from_args);
>+#endif
>diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
>index e5af8c692dc0..e3effcd1e358 100644
>--- a/tools/include/uapi/linux/kvm.h
>+++ b/tools/include/uapi/linux/kvm.h
>@@ -1546,7 +1546,10 @@ struct kvm_memory_attributes {
> struct kvm_create_guest_memfd {
> 	__u64 size;
> 	__u64 flags;
>-	__u64 reserved[6];
>+	__u64 host_nodes_addr;
>+	__u16 maxnode;
>+	__u8 mpol_mode;
>+	__u8 reserved[37];
> };
> 
> #define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
>diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>index e930014b4bdc..8f1877be4976 100644
>--- a/virt/kvm/guest_memfd.c
>+++ b/virt/kvm/guest_memfd.c
>@@ -4,6 +4,7 @@
> #include <linux/kvm_host.h>
> #include <linux/pagemap.h>
> #include <linux/anon_inodes.h>
>+#include <linux/mempolicy.h>
> 
> #include "kvm_mm.h"
> 
>@@ -445,7 +446,8 @@ static const struct inode_operations kvm_gmem_iops = {
> 	.setattr = kvm_gmem_setattr,
> };
> 
>-static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags,
>+			     struct mempolicy *pol)
> {
> 	const char *anon_name = "[kvm-gmem]";
> 	struct kvm_gmem *gmem;
>@@ -478,6 +480,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> 	inode->i_private = (void *)(unsigned long)flags;
> 	inode->i_op = &kvm_gmem_iops;
> 	inode->i_mapping->a_ops = &kvm_gmem_aops;
>+	inode->i_mapping->i_private_data = (void *)pol;
> 	inode->i_mode |= S_IFREG;
> 	inode->i_size = size;
> 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
>@@ -505,7 +508,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> {
> 	loff_t size = args->size;
> 	u64 flags = args->flags;
>-	u64 valid_flags = 0;
>+	u64 valid_flags = GUEST_MEMFD_NUMA_ENABLE;
>+	struct mempolicy *mpol = NULL;
> 
> 	if (flags & ~valid_flags)
> 		return -EINVAL;
>@@ -513,7 +517,18 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> 	if (size <= 0 || !PAGE_ALIGNED(size))
> 		return -EINVAL;
> 
>-	return __kvm_gmem_create(kvm, size, flags);
>+	if (flags & GUEST_MEMFD_NUMA_ENABLE) {
>+		unsigned char mode = args->mpol_mode;
>+		unsigned short maxnode = args->maxnode;
>+		const unsigned long __user *user_nmask =
>+				(const unsigned long *)args->host_nodes_addr;
>+
>+		mpol = create_mpol_from_args(mode, user_nmask, maxnode);
>+		if (IS_ERR_OR_NULL(mpol))
>+			return PTR_ERR(mpol);
>+	}
>+
>+	return __kvm_gmem_create(kvm, size, flags, mpol);
> }
> 
> int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
>index 715f19669d01..3dd8495ae03d 100644
>--- a/virt/kvm/kvm_mm.h
>+++ b/virt/kvm/kvm_mm.h
>@@ -36,6 +36,9 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
> #endif /* HAVE_KVM_PFNCACHE */
> 
> #ifdef CONFIG_KVM_PRIVATE_MEM
>+/* Flag to check NUMA policy while creating KVM guest-memfd. */
>+#define GUEST_MEMFD_NUMA_ENABLE	BIT_ULL(0)
>+
> void kvm_gmem_init(struct module *module);
> int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
> int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>--
>2.34.1
>
>
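For contrast, and purely as an illustration (not part of the patch): the
commit message's point that mbind() cannot be used here is because
guest-memfd is never mapped into the VMM's address space. For an ordinary,
mappable memfd, the same node-1 placement as in the sketch above would
simply be:

  #include <sys/mman.h>
  #include <numaif.h>           /* mbind(), MPOL_BIND; link with -lnuma */

  /* Illustrative only, error handling omitted. */
  static void bind_plain_memfd_to_node1(int memfd, size_t size)
  {
          unsigned long nodemask = 1UL << 1;      /* node 1 only */
          void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                         memfd, 0);

          mbind(p, size, MPOL_BIND, &nodemask, 8 * sizeof(nodemask), 0);
  }

guest-memfd has no such mapping to point mbind() at, which is why the
policy has to be passed through KVM_CREATE_GUEST_MEMFD (or a future
fbind()-style interface) instead.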