Re: [PATCH v3 04/11] KVM: guest_memfd: Allow host to mmap guest_memfd() pages when shared

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Oct 10, 2024 at 09:59:23AM +0100, Fuad Tabba wrote:
> Add support for mmap() and fault() for guest_memfd in the host.
> The ability to fault in a guest page is contingent on that page
> being shared with the host.
> 
> The guest_memfd PRIVATE memory attribute is not used for two
> reasons. First because it reflects the userspace expectation for
> that memory location, and therefore can be toggled by userspace.
> The second is, although each guest_memfd file has a 1:1 binding
> with a KVM instance, the plan is to allow multiple files per
> inode, e.g. to allow intra-host migration to a new KVM instance,
> without destroying guest_memfd.
> 
> The mapping is restricted to only memory explicitly shared with
> the host. KVM checks that the host doesn't have any mappings for
> private memory via the folio's refcount. To avoid races between
> paths that check mappability and paths that check whether the
> host has any mappings (via the refcount), the folio lock is held
> in while either check is being performed.
> 
> This new feature is gated with a new configuration option,
> CONFIG_KVM_GMEM_MAPPABLE.
> 
> Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> Co-developed-by: Elliot Berman <quic_eberman@xxxxxxxxxxx>
> Signed-off-by: Elliot Berman <quic_eberman@xxxxxxxxxxx>
> Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> 
> ---
> 
> Note that the functions kvm_gmem_is_mapped(),
> kvm_gmem_set_mappable(), and int kvm_gmem_clear_mappable() are
> not used in this patch series. They are intended to be used in
> future patches [*], which check and toggle mapability when the
> guest shares/unshares pages with the host.
> 
> [*] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/guestmem-6.12-v3-pkvm
> 
> ---
>  include/linux/kvm_host.h |  52 +++++++++++
>  virt/kvm/Kconfig         |   4 +
>  virt/kvm/guest_memfd.c   | 185 +++++++++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c      | 138 +++++++++++++++++++++++++++++
>  4 files changed, 379 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index acf85995b582..bda7fda9945e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2527,4 +2527,56 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
>  				    struct kvm_pre_fault_memory *range);
>  #endif
>  
> +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> +bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end);
> +bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end);
> +int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
> +int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
> +int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start,
> +			       gfn_t end);
> +int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start,
> +				 gfn_t end);
> +bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn);
> +#else
> +static inline bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return false;
> +}
> +static inline bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return false;
> +}
> +static inline int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start,
> +					  gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot,
> +					     gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot,
> +					       gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot,
> +					     gfn_t gfn)
> +{
> +	WARN_ON_ONCE(1);
> +	return false;
> +}
> +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> +
>  #endif
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index fd6a3010afa8..2cfcb0848e37 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -120,3 +120,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
>  config HAVE_KVM_ARCH_GMEM_INVALIDATE
>         bool
>         depends on KVM_PRIVATE_MEM
> +
> +config KVM_GMEM_MAPPABLE
> +       select KVM_PRIVATE_MEM
> +       bool
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index f414646c475b..df3a6f05a16e 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -370,7 +370,184 @@ static void kvm_gmem_init_mount(void)
>  	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> +static struct folio *
> +__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
> +		   gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared,
> +		   int *max_order);
> +
> +static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> +{
> +	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> +	void *xval = xa_mk_value(true);
> +	pgoff_t i;
> +	bool r;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +	for (i = start; i < end; i++) {
> +		r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL));

I think it might not be strictly necessary,

> +		if (r)
> +			break;
> +	}
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	return r;
> +}
> +
> +static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> +{
> +	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> +	pgoff_t i;
> +	int r = 0;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +	for (i = start; i < end; i++) {
> +		struct folio *folio;
> +
> +		/*
> +		 * Holds the folio lock until after checking its refcount,
> +		 * to avoid races with paths that fault in the folio.
> +		 */
> +		folio = kvm_gmem_get_folio(inode, i);

We don't need to allocate the folio here. I think we can use

		folio = filemap_lock_folio(inode, i);
		if (!folio || WARN_ON_ONCE(IS_ERR(folio)))
			continue;

> +		if (WARN_ON_ONCE(IS_ERR(folio)))
> +			continue;
> +
> +		/*
> +		 * Check that the host doesn't have any mappings on clearing
> +		 * the mappable flag, because clearing the flag implies that the
> +		 * memory will be unshared from the host. Therefore, to maintain
> +		 * the invariant that the host cannot access private memory, we
> +		 * need to check that it doesn't have any mappings to that
> +		 * memory before making it private.
> +		 *
> +		 * Two references are expected because of kvm_gmem_get_folio().
> +		 */
> +		if (folio_ref_count(folio) > 2)

If we'd like to be prepared for large folios, it should be
folio_nr_pages(folio) + 1. 

> +			r = -EPERM;
> +		else
> +			xa_erase(mappable_offsets, i);
> +
> +		folio_put(folio);
> +		folio_unlock(folio);
> +
> +		if (r)
> +			break;
> +	}
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	return r;
> +}
> +
> +static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
> +{
> +	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> +	bool r;
> +
> +	filemap_invalidate_lock_shared(inode->i_mapping);
> +	r = xa_find(mappable_offsets, &pgoff, pgoff, XA_PRESENT);
> +	filemap_invalidate_unlock_shared(inode->i_mapping);
> +
> +	return r;
> +}
> +
> +int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> +{
> +	struct inode *inode = file_inode(slot->gmem.file);
> +	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> +	pgoff_t end_off = start_off + end - start;
> +
> +	return gmem_set_mappable(inode, start_off, end_off);
> +}
> +
> +int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> +{
> +	struct inode *inode = file_inode(slot->gmem.file);
> +	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> +	pgoff_t end_off = start_off + end - start;
> +
> +	return gmem_clear_mappable(inode, start_off, end_off);
> +}
> +
> +bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn)
> +{
> +	struct inode *inode = file_inode(slot->gmem.file);
> +	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> +
> +	return gmem_is_mappable(inode, pgoff);
> +}
> +
> +static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
> +{
> +	struct inode *inode = file_inode(vmf->vma->vm_file);
> +	struct folio *folio;
> +	vm_fault_t ret = VM_FAULT_LOCKED;
> +
> +	/*
> +	 * Holds the folio lock until after checking whether it can be faulted
> +	 * in, to avoid races with paths that change a folio's mappability.
> +	 */
> +	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> +	if (!folio)
> +		return VM_FAULT_SIGBUS;
> +
> +	if (folio_test_hwpoison(folio)) {
> +		ret = VM_FAULT_HWPOISON;
> +		goto out;
> +	}
> +
> +	if (!gmem_is_mappable(inode, vmf->pgoff)) {
> +		ret = VM_FAULT_SIGBUS;
> +		goto out;
> +	}
> +
> +	if (!folio_test_uptodate(folio)) {
> +		unsigned long nr_pages = folio_nr_pages(folio);
> +		unsigned long i;
> +
> +		for (i = 0; i < nr_pages; i++)
> +			clear_highpage(folio_page(folio, i));
> +
> +		folio_mark_uptodate(folio);
> +	}
> +
> +	vmf->page = folio_file_page(folio, vmf->pgoff);
> +out:
> +	if (ret != VM_FAULT_LOCKED) {
> +		folio_put(folio);
> +		folio_unlock(folio);
> +	}
> +
> +	return ret;
> +}
> +
> +static const struct vm_operations_struct kvm_gmem_vm_ops = {
> +	.fault = kvm_gmem_fault,
> +};
> +
> +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
> +	    (VM_SHARED | VM_MAYSHARE)) {
> +		return -EINVAL;
> +	}
> +
> +	file_accessed(file);
> +	vm_flags_set(vma, VM_DONTDUMP);
> +	vma->vm_ops = &kvm_gmem_vm_ops;
> +
> +	return 0;
> +}
> +#else
> +static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +#define kvm_gmem_mmap NULL
> +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> +
>  static struct file_operations kvm_gmem_fops = {
> +	.mmap		= kvm_gmem_mmap,
>  	.open		= generic_file_open,
>  	.release	= kvm_gmem_release,
>  	.fallocate	= kvm_gmem_fallocate,
> @@ -557,6 +734,14 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  		goto err_gmem;
>  	}
>  
> +	if (IS_ENABLED(CONFIG_KVM_GMEM_MAPPABLE)) {
> +		err = gmem_set_mappable(file_inode(file), 0, size >> PAGE_SHIFT);
> +		if (err) {
> +			fput(file);
> +			goto err_gmem;
> +		}
> +	}
> +
>  	kvm_get_kvm(kvm);
>  	gmem->kvm = kvm;
>  	xa_init(&gmem->bindings);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 05cbb2548d99..aed9cf2f1685 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3263,6 +3263,144 @@ static int next_segment(unsigned long len, int offset)
>  		return len;
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> +static bool __kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	struct kvm_memslot_iter iter;
> +
> +	lockdep_assert_held(&kvm->slots_lock);
> +
> +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> +		struct kvm_memory_slot *memslot = iter.slot;
> +		gfn_t gfn_start, gfn_end, i;
> +
> +		gfn_start = max(start, memslot->base_gfn);
> +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> +		if (WARN_ON_ONCE(gfn_start >= gfn_end))
> +			continue;
> +
> +		for (i = gfn_start; i < gfn_end; i++) {
> +			if (!kvm_slot_gmem_is_mappable(memslot, i))
> +				return false;
> +		}
> +	}
> +
> +	return true;
> +}
> +
> +bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	bool r;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	r = __kvm_gmem_is_mappable(kvm, start, end);
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return r;
> +}
> +
> +static bool kvm_gmem_is_pfn_mapped(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn_idx)
> +{
> +	struct page *page;
> +	bool is_mapped;
> +	kvm_pfn_t pfn;
> +
> +	/*
> +	 * Holds the folio lock until after checking its refcount,
> +	 * to avoid races with paths that fault in the folio.
> +	 */
> +	if (WARN_ON_ONCE(kvm_gmem_get_pfn_locked(kvm, memslot, gfn_idx, &pfn, NULL)))
> +		return false;
> +
> +	page = pfn_to_page(pfn);
> +
> +	/* Two references are expected because of kvm_gmem_get_pfn_locked(). */
> +	is_mapped = page_ref_count(page) > 2;
> +
> +	put_page(page);
> +	unlock_page(page);
> +
> +	return is_mapped;
> +}
> +
> +static bool __kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	struct kvm_memslot_iter iter;
> +
> +	lockdep_assert_held(&kvm->slots_lock);
> +
> +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> +		struct kvm_memory_slot *memslot = iter.slot;
> +		gfn_t gfn_start, gfn_end, i;
> +
> +		gfn_start = max(start, memslot->base_gfn);
> +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> +		if (WARN_ON_ONCE(gfn_start >= gfn_end))
> +			continue;
> +
> +		for (i = gfn_start; i < gfn_end; i++) {
> +			if (kvm_gmem_is_pfn_mapped(kvm, memslot, i))
> +				return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	bool r;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	r = __kvm_gmem_is_mapped(kvm, start, end);
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return r;
> +}
> +
> +static int kvm_gmem_toggle_mappable(struct kvm *kvm, gfn_t start, gfn_t end,
> +				    bool is_mappable)
> +{
> +	struct kvm_memslot_iter iter;
> +	int r = 0;
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> +		struct kvm_memory_slot *memslot = iter.slot;
> +		gfn_t gfn_start, gfn_end;
> +
> +		gfn_start = max(start, memslot->base_gfn);
> +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> +		if (WARN_ON_ONCE(start >= end))
> +			continue;
> +
> +		if (is_mappable)
> +			r = kvm_slot_gmem_set_mappable(memslot, gfn_start, gfn_end);
> +		else
> +			r = kvm_slot_gmem_clear_mappable(memslot, gfn_start, gfn_end);
> +
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return r;
> +}
> +
> +int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	return kvm_gmem_toggle_mappable(kvm, start, end, true);
> +}
> +
> +int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	return kvm_gmem_toggle_mappable(kvm, start, end, false);
> +}
> +
> +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> +
>  /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
>  static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
>  				 void *data, int offset, int len)
> -- 
> 2.47.0.rc0.187.ge670bccf7e-goog
> 




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux