On Thu, Sep 19, 2024 at 09:44:36AM +0000, Shivank Garg wrote:
>From: Shivansh Dhiman <shivansh.dhiman@xxxxxxx>
>
>Extend the API of creating guest-memfd to introduce proper NUMA support,
>allowing VMM to set memory policies effectively. The memory policy defines
>from which node memory is allocated.
>
>The current implementation of KVM guest-memfd does not honor the settings
>provided by VMM. While mbind() can be used for NUMA policy support in
>userspace applications, it is not functional for guest-memfd as the memory
>is not mapped to userspace.
>
>Currently, SEV-SNP guest use guest-memfd as a memory backend and would
>benefit from NUMA support. It enables fine-grained control over memory
>allocation, optimizing performance for specific workload requirements.
>
>To apply memory policy on a guest-memfd, extend the KVM_CREATE_GUEST_MEMFD
>IOCTL with additional fields related to mempolicy.
>- mpol_mode represents the policy mode (default, bind, interleave, or
>  preferred).
>- host_nodes_addr denotes the userspace address of the nodemask, a bit
>  mask of nodes containing up to maxnode bits.
>- First bit of flags must be set to use mempolicy.

Do you need a way for userspace to enumerate the supported flags?

The direction was to implement an fbind() syscall [1]. I am not sure if
that has changed. What are the benefits of this proposal compared to the
fbind() syscall?

I believe one limitation of this proposal is that the policy must be set
when the guest-memfd is created, i.e. the policy cannot be changed at
runtime. Is that a practical problem?

[1]: https://lore.kernel.org/kvm/ZOjpIL0SFH+E3Dj4@xxxxxxxxxx/
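As a concrete reading of the proposed uAPI, here is a rough, untested
sketch of how a VMM might drive the extension (the struct fields come from
the patch below; GUEST_MEMFD_NUMA_ENABLE is my own name for bit 0 of flags
per the commit message, and the patched <linux/kvm.h> is assumed):

  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <numaif.h>           /* MPOL_BIND */
  #include <linux/kvm.h>        /* patched header with the new fields assumed */

  /* Bit 0 of flags enables the mempolicy fields (name made up here). */
  #define GUEST_MEMFD_NUMA_ENABLE       (1ULL << 0)

  /* Create a guest-memfd whose backing memory is bound to host node 1. */
  static int create_gmem_on_node1(int vm_fd, uint64_t size)
  {
          unsigned long nodemask = 1UL << 1;      /* node 1 only */
          struct kvm_create_guest_memfd args;

          memset(&args, 0, sizeof(args));
          args.size            = size;
          args.flags           = GUEST_MEMFD_NUMA_ENABLE;
          args.mpol_mode       = MPOL_BIND;
          args.host_nodes_addr = (uintptr_t)&nodemask;
          args.maxnode         = 8 * sizeof(nodemask);  /* bits in the mask */

          /* Returns the new guest-memfd on success, -1 on error. */
          return ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
  }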
>
>Store the mempolicy struct in i_private_data of the memfd's inode, which
>is currently unused in the context of guest-memfd.
>
>Signed-off-by: Shivansh Dhiman <shivansh.dhiman@xxxxxxx>
>Signed-off-by: Shivank Garg <shivankg@xxxxxxx>
>---
> Documentation/virt/kvm/api.rst | 13 ++++++++-
> include/linux/mempolicy.h      |  4 +++
> include/uapi/linux/kvm.h       |  5 +++-
> mm/mempolicy.c                 | 52 ++++++++++++++++++++++++++++++++++
> tools/include/uapi/linux/kvm.h |  5 +++-
> virt/kvm/guest_memfd.c         | 21 ++++++++++++--
> virt/kvm/kvm_mm.h              |  3 ++
> 7 files changed, 97 insertions(+), 6 deletions(-)
>
>diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>index b3be87489108..dcb61282c773 100644
>--- a/Documentation/virt/kvm/api.rst
>+++ b/Documentation/virt/kvm/api.rst
>@@ -6346,7 +6346,10 @@ and cannot be resized (guest_memfd files do however support PUNCH_HOLE).
>   struct kvm_create_guest_memfd {
> 	__u64 size;
> 	__u64 flags;
>-	__u64 reserved[6];
>+	__u64 host_nodes_addr;
>+	__u16 maxnode;
>+	__u8 mpol_mode;
>+	__u8 reserved[37];
>   };
> 
> Conceptually, the inode backing a guest_memfd file represents physical memory,
>@@ -6367,6 +6370,14 @@ a single guest_memfd file, but the bound ranges must not overlap).
> 
> See KVM_SET_USER_MEMORY_REGION2 for additional details.
> 
>+NUMA memory policy support for KVM guest_memfd allows the host to specify
>+memory allocation behavior for guest NUMA nodes, similar to mbind(). If
>+KVM_GUEST_MEMFD_NUMA_ENABLE flag is set, memory allocations from the guest
>+will use the specified policy and host-nodes for physical memory.
>+- mpol_mode refers to the policy mode: default, preferred, bind, interleave, or
>+  preferred.
>+- host_nodes_addr points to bitmask of nodes containing up to maxnode bits.
>+
> 4.143 KVM_PRE_FAULT_MEMORY
> ---------------------------
> 
>diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
>index 1add16f21612..468eeda2ec2f 100644
>--- a/include/linux/mempolicy.h
>+++ b/include/linux/mempolicy.h
>@@ -299,4 +299,8 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
> }
> 
> #endif /* CONFIG_NUMA */
>+
>+struct mempolicy *create_mpol_from_args(unsigned char mode,
>+			const unsigned long __user *nmask,
>+			unsigned short maxnode);
> #endif
>diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>index 637efc055145..fda6cbef0a1d 100644
>--- a/include/uapi/linux/kvm.h
>+++ b/include/uapi/linux/kvm.h
>@@ -1561,7 +1561,10 @@ struct kvm_memory_attributes {
> struct kvm_create_guest_memfd {
> 	__u64 size;
> 	__u64 flags;
>-	__u64 reserved[6];
>+	__u64 host_nodes_addr;
>+	__u16 maxnode;
>+	__u8 mpol_mode;
>+	__u8 reserved[37];
> };
> 
> #define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
>diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>index b858e22b259d..9e9450433fcc 100644
>--- a/mm/mempolicy.c
>+++ b/mm/mempolicy.c
>@@ -3557,3 +3557,55 @@ static int __init mempolicy_sysfs_init(void)
> 
> late_initcall(mempolicy_sysfs_init);
> #endif /* CONFIG_SYSFS */
>+
>+#ifdef CONFIG_KVM_PRIVATE_MEM
>+/**
>+ * create_mpol_from_args - create a mempolicy structure from args
>+ * @mode: NUMA memory policy mode
>+ * @nmask: bitmask of NUMA nodes
>+ * @maxnode: number of bits in the nodes bitmask
>+ *
>+ * Create a mempolicy from given nodemask and memory policy such as
>+ * default, preferred, interleave or bind.
>+ *
>+ * Return: error encoded in a pointer or memory policy on success.
>+ */
>+struct mempolicy *create_mpol_from_args(unsigned char mode,
>+			const unsigned long __user *nmask,
>+			unsigned short maxnode)
>+{
>+	struct mm_struct *mm = current->mm;
>+	unsigned short mode_flags;
>+	struct mempolicy *mpol;
>+	nodemask_t nodes;
>+	int lmode = mode;
>+	int err = -ENOMEM;
>+
>+	err = sanitize_mpol_flags(&lmode, &mode_flags);
>+	if (err)
>+		return ERR_PTR(err);
>+
>+	err = get_nodes(&nodes, nmask, maxnode);
>+	if (err)
>+		return ERR_PTR(err);
>+
>+	mpol = mpol_new(mode, mode_flags, &nodes);
>+	if (IS_ERR_OR_NULL(mpol))
>+		return mpol;
>+
>+	NODEMASK_SCRATCH(scratch);
>+	if (!scratch)
>+		return ERR_PTR(-ENOMEM);
>+
>+	mmap_write_lock(mm);
>+	err = mpol_set_nodemask(mpol, &nodes, scratch);
>+	mmap_write_unlock(mm);
>+	NODEMASK_SCRATCH_FREE(scratch);
>+
>+	if (err)
>+		return ERR_PTR(err);
>+
>+	return mpol;
>+}
>+EXPORT_SYMBOL(create_mpol_from_args);
>+#endif
>diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
>index e5af8c692dc0..e3effcd1e358 100644
>--- a/tools/include/uapi/linux/kvm.h
>+++ b/tools/include/uapi/linux/kvm.h
>@@ -1546,7 +1546,10 @@ struct kvm_memory_attributes {
> struct kvm_create_guest_memfd {
> 	__u64 size;
> 	__u64 flags;
>-	__u64 reserved[6];
>+	__u64 host_nodes_addr;
>+	__u16 maxnode;
>+	__u8 mpol_mode;
>+	__u8 reserved[37];
> };
> 
> #define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
>diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>index e930014b4bdc..8f1877be4976 100644
>--- a/virt/kvm/guest_memfd.c
>+++ b/virt/kvm/guest_memfd.c
>@@ -4,6 +4,7 @@
> #include <linux/kvm_host.h>
> #include <linux/pagemap.h>
> #include <linux/anon_inodes.h>
>+#include <linux/mempolicy.h>
> 
> #include "kvm_mm.h"
> 
>@@ -445,7 +446,8 @@ static const struct inode_operations kvm_gmem_iops = {
> 	.setattr = kvm_gmem_setattr,
> };
> 
>-static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags,
>+			     struct mempolicy *pol)
> {
> 	const char *anon_name = "[kvm-gmem]";
> 	struct kvm_gmem *gmem;
>@@ -478,6 +480,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> 	inode->i_private = (void *)(unsigned long)flags;
> 	inode->i_op = &kvm_gmem_iops;
> 	inode->i_mapping->a_ops = &kvm_gmem_aops;
>+	inode->i_mapping->i_private_data = (void *)pol;
> 	inode->i_mode |= S_IFREG;
> 	inode->i_size = size;
> 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
>@@ -505,7 +508,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> {
> 	loff_t size = args->size;
> 	u64 flags = args->flags;
>-	u64 valid_flags = 0;
>+	u64 valid_flags = GUEST_MEMFD_NUMA_ENABLE;
>+	struct mempolicy *mpol = NULL;
> 
> 	if (flags & ~valid_flags)
> 		return -EINVAL;
>@@ -513,7 +517,18 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> 	if (size <= 0 || !PAGE_ALIGNED(size))
> 		return -EINVAL;
> 
>-	return __kvm_gmem_create(kvm, size, flags);
>+	if (flags & GUEST_MEMFD_NUMA_ENABLE) {
>+		unsigned char mode = args->mpol_mode;
>+		unsigned short maxnode = args->maxnode;
>+		const unsigned long __user *user_nmask =
>+				(const unsigned long *)args->host_nodes_addr;
>+
>+		mpol = create_mpol_from_args(mode, user_nmask, maxnode);
>+		if (IS_ERR_OR_NULL(mpol))
>+			return PTR_ERR(mpol);
>+	}
>+
>+	return __kvm_gmem_create(kvm, size, flags, mpol);
> }
> 
> int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
>index 715f19669d01..3dd8495ae03d 100644
>--- a/virt/kvm/kvm_mm.h
>+++ b/virt/kvm/kvm_mm.h
>@@ -36,6 +36,9 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
> #endif /* HAVE_KVM_PFNCACHE */
> 
> #ifdef CONFIG_KVM_PRIVATE_MEM
>+/* Flag to check NUMA policy while creating KVM guest-memfd. */
>+#define GUEST_MEMFD_NUMA_ENABLE	BIT_ULL(0)
>+
> void kvm_gmem_init(struct module *module);
> int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
> int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>--
>2.34.1
>
>
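For contrast, and purely as an illustration (not part of the patch): the
commit message's point that mbind() cannot be used here is because
guest-memfd is never mapped into the VMM's address space. For an ordinary,
mappable memfd, the same node-1 placement as in the sketch above would
simply be:

  #include <sys/mman.h>
  #include <numaif.h>           /* mbind(), MPOL_BIND; link with -lnuma */

  /* Illustrative only, error handling omitted. */
  static void bind_plain_memfd_to_node1(int memfd, size_t size)
  {
          unsigned long nodemask = 1UL << 1;      /* node 1 only */
          void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                         memfd, 0);

          mbind(p, size, MPOL_BIND, &nodemask, 8 * sizeof(nodemask), 0);
  }

guest-memfd has no such mapping to point mbind() at, which is why the
policy has to be passed through KVM_CREATE_GUEST_MEMFD (or a future
fbind()-style interface) instead.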