Shivank Garg <shivankg@xxxxxxx> writes:

> Previously, guest-memfd allocations followed local NUMA node id in absence
> of process mempolicy, resulting in arbitrary memory allocation.
> Moreover, mbind() couldn't be used since memory wasn't mapped to userspace
> in the VMM.
>
> Enable NUMA policy support by implementing vm_ops for guest-memfd mmap
> operation. This allows the VMM to map the memory and use mbind() to set
> the desired NUMA policy. The policy is then retrieved via
> mpol_shared_policy_lookup() and passed to filemap_grab_folio_mpol() to
> ensure that allocations follow the specified memory policy.
>
> This enables the VMM to control guest memory NUMA placement by calling
> mbind() on the mapped memory regions, providing fine-grained control over
> guest memory allocation across NUMA nodes.
>
> The policy change only affect future allocations and does not migrate
> existing memory. This matches mbind(2)'s default behavior which affects
> only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
> flags, which are not supported for guest_memfd as it is unmovable.
>
> Suggested-by: David Hildenbrand <david@xxxxxxxxxx>
> Signed-off-by: Shivank Garg <shivankg@xxxxxxx>
> ---
>  virt/kvm/guest_memfd.c | 76 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 75 insertions(+), 1 deletion(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index f18176976ae3..b3a8819117a0 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -2,6 +2,7 @@
>  #include <linux/backing-dev.h>
>  #include <linux/falloc.h>
>  #include <linux/kvm_host.h>
> +#include <linux/mempolicy.h>
>  #include <linux/pagemap.h>
>  #include <linux/anon_inodes.h>
>
> @@ -11,8 +12,12 @@ struct kvm_gmem {
>  	struct kvm *kvm;
>  	struct xarray bindings;
>  	struct list_head entry;
> +	struct shared_policy policy;
>  };
>

struct shared_policy should be stored on the inode rather than the file,
since the memory policy is a property of the memory (struct inode), rather
than a property of how the memory is used for a given VM (struct file).

When the shared_policy is stored on the inode, intra-host migration [1]
will work correctly, since, while the inode will be transferred from one VM
(struct kvm) to another, the file (a VM's view/bindings of the memory) will
be recreated for the new VM.

I'm thinking of having a patch like this [2] to introduce inodes. With
this, we shouldn't need to pass file pointers instead of inode pointers.
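Roughly what I have in mind (purely illustrative; the structure and field
names below are made up and would really come from the inodes patch at [2])
is to hang the shared_policy off the gmem inode and have the vm_ops reach
it through file_inode(), e.g.:

/* Hypothetical inode-scoped state; the actual layout would come from [2]. */
struct kvm_gmem_inode {
	struct shared_policy policy;
};

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
					     unsigned long addr, pgoff_t *pgoff)
{
	struct kvm_gmem_inode *i_gmem = file_inode(vma->vm_file)->i_private;

	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
	return mpol_shared_policy_lookup(&i_gmem->policy, *pgoff);
}

static int kvm_gmem_set_policy(struct vm_area_struct *vma,
			       struct mempolicy *new)
{
	struct kvm_gmem_inode *i_gmem = file_inode(vma->vm_file)->i_private;

	return mpol_set_shared_policy(&i_gmem->policy, vma, new);
}

mpol_shared_policy_init()/mpol_free_shared_policy() would then move from
__kvm_gmem_create()/kvm_gmem_release() to inode creation/eviction, so the
policy follows the memory rather than a particular VM's view of it.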
> +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
> +						   pgoff_t index);
> +
>  /**
>   * folio_file_pfn - like folio_file_page, but return a pfn.
>   * @folio: The folio which contains this index.
> @@ -99,7 +104,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
>  {
>  	/* TODO: Support huge pages. */
> -	return filemap_grab_folio(file_inode(file)->i_mapping, index);
> +	struct kvm_gmem *gmem = file->private_data;
> +	struct inode *inode = file_inode(file);
> +	struct mempolicy *policy;
> +	struct folio *folio;
> +
> +	/*
> +	 * Fast-path: See if folio is already present in mapping to avoid
> +	 * policy_lookup.
> +	 */
> +	folio = __filemap_get_folio(inode->i_mapping, index,
> +				    FGP_LOCK | FGP_ACCESSED, 0);
> +	if (!IS_ERR(folio))
> +		return folio;
> +
> +	policy = kvm_gmem_get_pgoff_policy(gmem, index);
> +	folio = filemap_grab_folio_mpol(inode->i_mapping, index, policy);
> +	mpol_cond_put(policy);
> +
> +	return folio;
>  }
>
>  static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> @@ -291,6 +314,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
>  	mutex_unlock(&kvm->slots_lock);
>
>  	xa_destroy(&gmem->bindings);
> +	mpol_free_shared_policy(&gmem->policy);
>  	kfree(gmem);
>
>  	kvm_put_kvm(kvm);
> @@ -312,8 +336,57 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
>  {
>  	return gfn - slot->base_gfn + slot->gmem.pgoff;
>  }
> +#ifdef CONFIG_NUMA
> +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
> +{
> +	struct file *file = vma->vm_file;
> +	struct kvm_gmem *gmem = file->private_data;
> +
> +	return mpol_set_shared_policy(&gmem->policy, vma, new);
> +}
> +
> +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> +					     unsigned long addr, pgoff_t *pgoff)
> +{
> +	struct file *file = vma->vm_file;
> +	struct kvm_gmem *gmem = file->private_data;
> +
> +	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> +	return mpol_shared_policy_lookup(&gmem->policy, *pgoff);
> +}
> +
> +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
> +						   pgoff_t index)
> +{
> +	struct mempolicy *mpol;
> +
> +	mpol = mpol_shared_policy_lookup(&gmem->policy, index);
> +	return mpol ? mpol : get_task_policy(current);
> +}
> +#else
> +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
> +						   pgoff_t index)
> +{
> +	return NULL;
> +}
> +#endif /* CONFIG_NUMA */
> +
> +static const struct vm_operations_struct kvm_gmem_vm_ops = {
> +#ifdef CONFIG_NUMA
> +	.get_policy	= kvm_gmem_get_policy,
> +	.set_policy	= kvm_gmem_set_policy,
> +#endif
> +};
> +
> +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	file_accessed(file);
> +	vma->vm_ops = &kvm_gmem_vm_ops;
> +	return 0;
> +}
>
>  static struct file_operations kvm_gmem_fops = {
> +	.mmap		= kvm_gmem_mmap,
>  	.open		= generic_file_open,
>  	.release	= kvm_gmem_release,
>  	.fallocate	= kvm_gmem_fallocate,
> @@ -446,6 +519,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  	kvm_get_kvm(kvm);
>  	gmem->kvm = kvm;
>  	xa_init(&gmem->bindings);
> +	mpol_shared_policy_init(&gmem->policy, NULL);
>  	list_add(&gmem->entry, &inode->i_mapping->i_private_list);
>
>  	fd_install(fd, file);

[1] https://lore.kernel.org/lkml/cover.1691446946.git.ackerleytng@xxxxxxxxxx/T/
[2] https://lore.kernel.org/all/d1940d466fc69472c8b6dda95df2e0522b2d8744.1726009989.git.ackerleytng@xxxxxxxxxx/
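Separately, to make the userspace flow described in the commit message
concrete, the VMM side would look roughly like the sketch below. This is
illustrative only: it assumes a kernel with this patch applied, obtains the
gmem fd via KVM_CREATE_GUEST_MEMFD, and the helper name and (minimal) error
handling are made up.

#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <numaif.h>		/* mbind(), MPOL_BIND; link with -lnuma */
#include <linux/kvm.h>

/* Create a guest_memfd and bind its future allocations to one NUMA node. */
static int gmem_create_bound_to_node(int vm_fd, size_t size, int node)
{
	struct kvm_create_guest_memfd args = { .size = size };
	unsigned long nodemask = 1UL << node;
	void *addr;
	int gmem_fd;

	gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
	if (gmem_fd < 0)
		return -1;

	/* The mapping exists only so mbind() has a VMA to attach policy to. */
	addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gmem_fd, 0);
	if (addr == MAP_FAILED)
		return -1;

	/* flags == 0: only future allocations are affected, nothing migrates. */
	if (mbind(addr, size, MPOL_BIND, &nodemask, 8 * sizeof(nodemask), 0))
		return -1;

	return gmem_fd;
}

mbind() reaches kvm_gmem_set_policy() via vma->vm_ops->set_policy, and
subsequent kvm_gmem_get_folio() calls pick the policy up through the
shared_policy lookup.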