From: Shivansh Dhiman <shivansh.dhiman@xxxxxxx> Extend the guest-memfd creation API to introduce proper NUMA support, allowing the VMM to set memory policies effectively. The memory policy defines from which node memory is allocated. The current implementation of KVM guest-memfd does not honor the settings provided by the VMM. While mbind() can be used for NUMA policy support in userspace applications, it is not functional for guest-memfd as the memory is not mapped to userspace. Currently, SEV-SNP guests use guest-memfd as a memory backend and would benefit from NUMA support. NUMA support enables fine-grained control over memory allocation, optimizing performance for specific workload requirements. To apply a memory policy to a guest-memfd, extend the KVM_CREATE_GUEST_MEMFD IOCTL with additional fields related to mempolicy. - mpol_mode represents the policy mode (default, bind, interleave, or preferred). - host_nodes_addr denotes the userspace address of the nodemask, a bit mask of nodes containing up to maxnode bits. - First bit of flags must be set to use mempolicy. Store the mempolicy struct in i_private_data of the address_space (i_mapping) of the memfd's inode, which is currently unused in the context of guest-memfd. Signed-off-by: Shivansh Dhiman <shivansh.dhiman@xxxxxxx> Signed-off-by: Shivank Garg <shivankg@xxxxxxx> --- Documentation/virt/kvm/api.rst | 13 ++++++++- include/linux/mempolicy.h | 4 +++ include/uapi/linux/kvm.h | 5 +++- mm/mempolicy.c | 52 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/kvm.h | 5 +++- virt/kvm/guest_memfd.c | 21 ++++++++++++-- virt/kvm/kvm_mm.h | 3 ++ 7 files changed, 97 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b3be87489108..dcb61282c773 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6346,7 +6346,10 @@ and cannot be resized (guest_memfd files do however support PUNCH_HOLE). 
struct kvm_create_guest_memfd { __u64 size; __u64 flags; - __u64 reserved[6]; + __u64 host_nodes_addr; + __u16 maxnode; + __u8 mpol_mode; + __u8 reserved[37]; }; Conceptually, the inode backing a guest_memfd file represents physical memory, @@ -6367,6 +6370,14 @@ a single guest_memfd file, but the bound ranges must not overlap). See KVM_SET_USER_MEMORY_REGION2 for additional details. +NUMA memory policy support for KVM guest_memfd allows the host to specify +memory allocation behavior for guest NUMA nodes, similar to mbind(). If the +KVM_GUEST_MEMFD_NUMA_ENABLE flag is set, memory allocations from the guest +will use the specified policy and host-nodes for physical memory. +- mpol_mode refers to the policy mode: default, bind, interleave, or + preferred. +- host_nodes_addr points to a bitmask of nodes containing up to maxnode bits. + 4.143 KVM_PRE_FAULT_MEMORY --------------------------- diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 1add16f21612..468eeda2ec2f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -299,4 +299,8 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) } #endif /* CONFIG_NUMA */ + +struct mempolicy *create_mpol_from_args(unsigned char mode, + const unsigned long __user *nmask, + unsigned short maxnode); #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 637efc055145..fda6cbef0a1d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1561,7 +1561,10 @@ struct kvm_memory_attributes { struct kvm_create_guest_memfd { __u64 size; __u64 flags; - __u64 reserved[6]; + __u64 host_nodes_addr; + __u16 maxnode; + __u8 mpol_mode; + __u8 reserved[37]; }; #define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b858e22b259d..9e9450433fcc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3557,3 +3557,55 @@ static int __init mempolicy_sysfs_init(void) 
late_initcall(mempolicy_sysfs_init); #endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_KVM_PRIVATE_MEM +/** + * create_mpol_from_args - create a mempolicy structure from args + * @mode: NUMA memory policy mode + * @nmask: bitmask of NUMA nodes + * @maxnode: number of bits in the nodes bitmask + * + * Create a mempolicy from given nodemask and memory policy such as + * default, preferred, interleave or bind. + * + * Return: error encoded in a pointer or memory policy on success. + */ +struct mempolicy *create_mpol_from_args(unsigned char mode, + const unsigned long __user *nmask, + unsigned short maxnode) +{ + struct mm_struct *mm = current->mm; + unsigned short mode_flags; + struct mempolicy *mpol; + nodemask_t nodes; + int lmode = mode; + int err = -ENOMEM; + + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return ERR_PTR(err); + + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return ERR_PTR(err); + + mpol = mpol_new(mode, mode_flags, &nodes); + if (IS_ERR_OR_NULL(mpol)) + return mpol; + + NODEMASK_SCRATCH(scratch); + if (!scratch) + return ERR_PTR(-ENOMEM); + + mmap_write_lock(mm); + err = mpol_set_nodemask(mpol, &nodes, scratch); + mmap_write_unlock(mm); + NODEMASK_SCRATCH_FREE(scratch); + + if (err) + return ERR_PTR(err); + + return mpol; +} +EXPORT_SYMBOL(create_mpol_from_args); +#endif diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index e5af8c692dc0..e3effcd1e358 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1546,7 +1546,10 @@ struct kvm_memory_attributes { struct kvm_create_guest_memfd { __u64 size; __u64 flags; - __u64 reserved[6]; + __u64 host_nodes_addr; + __u16 maxnode; + __u8 mpol_mode; + __u8 reserved[37]; }; #define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e930014b4bdc..8f1877be4976 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -4,6 +4,7 @@ #include 
<linux/kvm_host.h> #include <linux/pagemap.h> #include <linux/anon_inodes.h> +#include <linux/mempolicy.h> #include "kvm_mm.h" @@ -445,7 +446,8 @@ static const struct inode_operations kvm_gmem_iops = { .setattr = kvm_gmem_setattr, }; -static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) +static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags, + struct mempolicy *pol) { const char *anon_name = "[kvm-gmem]"; struct kvm_gmem *gmem; @@ -478,6 +480,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; + inode->i_mapping->i_private_data = (void *)pol; inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); @@ -505,7 +508,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) { loff_t size = args->size; u64 flags = args->flags; - u64 valid_flags = 0; + u64 valid_flags = GUEST_MEMFD_NUMA_ENABLE; + struct mempolicy *mpol = NULL; if (flags & ~valid_flags) return -EINVAL; @@ -513,7 +517,18 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) if (size <= 0 || !PAGE_ALIGNED(size)) return -EINVAL; - return __kvm_gmem_create(kvm, size, flags); + if (flags & GUEST_MEMFD_NUMA_ENABLE) { + unsigned char mode = args->mpol_mode; + unsigned short maxnode = args->maxnode; + const unsigned long __user *user_nmask = + (const unsigned long *)args->host_nodes_addr; + + mpol = create_mpol_from_args(mode, user_nmask, maxnode); + if (IS_ERR_OR_NULL(mpol)) + return PTR_ERR(mpol); + } + + return __kvm_gmem_create(kvm, size, flags, mpol); } int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 715f19669d01..3dd8495ae03d 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -36,6 +36,9 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, #endif 
/* HAVE_KVM_PFNCACHE */ #ifdef CONFIG_KVM_PRIVATE_MEM +/* Flag to check NUMA policy while creating KVM guest-memfd. */ +#define GUEST_MEMFD_NUMA_ENABLE BIT_ULL(0) + void kvm_gmem_init(struct module *module); int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, -- 2.34.1