Re: [PATCH 2/3] drm/xe/hmm: Don't dereference struct page pointers without notifier lock

On 28/02/2025 13:08, Thomas Hellström wrote:
On Fri, 2025-02-28 at 12:55 +0000, Matthew Auld wrote:
On 28/02/2025 10:44, Thomas Hellström wrote:
The pfns that we obtain from hmm_range_fault() point to pages that
we don't have a reference on, and the guarantee that they are still
in the cpu page-tables is that the notifier lock must be held and the
notifier seqno is still valid.

So while building the sg table and marking the pages accessed / dirty
we need to hold this lock with a validated seqno.

However, the lock is reclaim tainted which makes
sg_alloc_table_from_pages_segment() unusable, since it internally
allocates memory.

Instead build the sg-table manually. For the non-iommu case
this might lead to fewer coalesces, but if that's a problem it can
be fixed up later in the resource cursor code. For the iommu case,
the whole sg-table may still be coalesced to a single contiguous
device va region.

This avoids marking pages that we don't own dirty and accessed, and
it also avoids dereferencing struct pages that we don't own.
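
For reference, the lifecycle relied on above is roughly the following.
This is an illustrative sketch of the generic hmm /
mmu_interval_notifier pattern only, not the exact xe code paths:

	/* Snapshot the notifier sequence count before faulting. */
	range.notifier_seq = mmu_interval_read_begin(range.notifier);

	err = hmm_range_fault(&range);
	if (err)
		return err;	/* or retry on -EBUSY */

	down_read(&vm->userptr.notifier_lock);
	if (mmu_interval_read_retry(range.notifier, range.notifier_seq)) {
		/* The cpu page-tables changed under us; pfns are stale. */
		up_read(&vm->userptr.notifier_lock);
		goto retry;
	}
	/* range.hmm_pfns[] may only be dereferenced inside this section. */
	...
	up_read(&vm->userptr.notifier_lock);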

Fixes: 81e058a3e7fd ("drm/xe: Introduce helper to populate userptr")
Cc: Oak Zeng <oak.zeng@xxxxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx> # v6.10+
Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx>
---
   drivers/gpu/drm/xe/xe_hmm.c | 115 ++++++++++++++++++++++++++----------
   1 file changed, 85 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_hmm.c b/drivers/gpu/drm/xe/xe_hmm.c
index c56738fa713b..d3b5551496d0 100644
--- a/drivers/gpu/drm/xe/xe_hmm.c
+++ b/drivers/gpu/drm/xe/xe_hmm.c
@@ -42,6 +42,36 @@ static void xe_mark_range_accessed(struct hmm_range *range, bool write)
   	}
   }
+static int xe_alloc_sg(struct sg_table *st, struct hmm_range *range,
+		       struct rw_semaphore *notifier_sem)
+{
+	unsigned long i, npages, hmm_pfn;
+	unsigned long num_chunks = 0;
+	int ret;
+
+	/* HMM docs says this is needed. */
+	ret = down_read_interruptible(notifier_sem);
+	if (ret)
+		return ret;
+
+	if (mmu_interval_read_retry(range->notifier, range->notifier_seq))
+		return -EAGAIN;
+
+	npages = xe_npages_in_range(range->start, range->end);
+	for (i = 0; i < npages;) {
+		hmm_pfn = range->hmm_pfns[i];
+		if (!(hmm_pfn & HMM_PFN_VALID)) {

Is this possible? The default_flags are always REQ_FAULT, so that should
ensure PFN_VALID, or hmm_range_fault() would have returned an error?

From the docs:

"HMM_PFN_REQ_FAULT - The output must have HMM_PFN_VALID or
hmm_range_fault() will fail"

Should this be an assert?

Also probably dumb question, but why do we need to hold the notifier
lock over this loop? What is it protecting?

Docs for hmm_pfn_to_map_order():

/*
 * This must be called under the caller 'user_lock' after a successful
 * mmu_interval_read_begin(). The caller must have tested for HMM_PFN_VALID
 * already.
 */

I'm fine with changing to an assert, and I agree that the lock is
pointless: we're operating on thread-local data. But I also think that
not adhering to the doc requirements might cause problems in the
future, like if the map order encoding is dropped and the order is
instead grabbed from the underlying page.
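
I.e. something like this in the chunk-counting loop, assuming we also
pass the xe device down to xe_alloc_sg() so that xe_assert() can be
used there (untested sketch):

	for (i = 0; i < npages;) {
		hmm_pfn = range->hmm_pfns[i];

		/* REQ_FAULT in the default_flags guarantees this. */
		xe_assert(xe, hmm_pfn & HMM_PFN_VALID);

		num_chunks++;
		i += 1UL << hmm_pfn_to_map_order(hmm_pfn);
	}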

Makes sense. Keeping the lock indeed looks sensible.

Staring some more at hmm_pfn_to_map_order(), it also says:

"The caller must be careful with edge cases as the start and end VA of the given page may extend past the range used with hmm_range_fault()."

I take that to mean it will still return the huge page order even if the hmm_range is just some small sub-range of pfns covering part of the huge page. Say there is a 2M PTE but the range is only 8K, with the first 4K page sitting exactly at the end of the 2M page. Are there some potentially nasty surprises with stuff like:

i += 1UL << hmm_pfn_to_map_order(hmm_pfn);

Since the increment here (512) could go past even npages (2), and then num_chunks is too small (1)?
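
Would we need to clamp the advance to both the mapping boundary and what is left of the range, here and for the segment size in xe_build_sg()? Rough sketch of what I mean, untested:

	unsigned long order = hmm_pfn_to_map_order(hmm_pfn);
	unsigned long pfn = page_to_pfn(hmm_pfn_to_page(hmm_pfn));
	unsigned long chunk = 1UL << order;

	/*
	 * The huge mapping may start before and extend past the faulted
	 * range, so only count the pages from this pfn up to the mapping
	 * boundary, and never past the end of the requested range.
	 */
	chunk -= pfn & (chunk - 1);
	chunk = min(chunk, npages - i);

	num_chunks++;
	i += chunk;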


/Thomas



+			up_read(notifier_sem);
+			return -EFAULT;
+		}
+		num_chunks++;
+		i += 1UL << hmm_pfn_to_map_order(hmm_pfn);
+	}
+	up_read(notifier_sem);
+
+	return sg_alloc_table(st, num_chunks, GFP_KERNEL);
+}
+
   /**
    * xe_build_sg() - build a scatter gather table for all the physical pages/pfn
    * in a hmm_range. dma-map pages if necessary. dma-address is save in sg table
@@ -50,6 +80,7 @@ static void xe_mark_range_accessed(struct hmm_range *range, bool write)
    * @range: the hmm range that we build the sg table from. range->hmm_pfns[]
    * has the pfn numbers of pages that back up this hmm address range.
    * @st: pointer to the sg table.
+ * @notifier_sem: The xe notifier lock.
    * @write: whether we write to this range. This decides dma map direction
    * for system pages. If write we map it bi-diretional; otherwise
    * DMA_TO_DEVICE
@@ -76,38 +107,33 @@ static void xe_mark_range_accessed(struct hmm_range *range, bool write)
    * Returns 0 if successful; -ENOMEM if fails to allocate memory
    */
   static int xe_build_sg(struct xe_device *xe, struct hmm_range *range,
-		       struct sg_table *st, bool write)
+		       struct sg_table *st,
+		       struct rw_semaphore *notifier_sem,
+		       bool write)
   {
   	struct device *dev = xe->drm.dev;
-	struct page **pages;
-	u64 i, npages;
-	int ret;
-
-	npages = xe_npages_in_range(range->start, range->end);
-	pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
-	if (!pages)
-		return -ENOMEM;
-
-	for (i = 0; i < npages; i++) {
-		pages[i] = hmm_pfn_to_page(range->hmm_pfns[i]);
-		xe_assert(xe, !is_device_private_page(pages[i]));
-	}
-
-	ret = sg_alloc_table_from_pages_segment(st, pages, npages, 0, npages << PAGE_SHIFT,
-						xe_sg_segment_size(dev), GFP_KERNEL);
-	if (ret)
-		goto free_pages;
-
-	ret = dma_map_sgtable(dev, st, write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE,
-			      DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
-	if (ret) {
-		sg_free_table(st);
-		st = NULL;
+	unsigned long hmm_pfn, size;
+	struct scatterlist *sgl;
+	struct page *page;
+	unsigned long i, j;
+
+	lockdep_assert_held(notifier_sem);
+
+	i = 0;
+	for_each_sg(st->sgl, sgl, st->nents, j) {
+		hmm_pfn = range->hmm_pfns[i];
+		page = hmm_pfn_to_page(hmm_pfn);
+		xe_assert(xe, !is_device_private_page(page));
+		size = 1UL << hmm_pfn_to_map_order(hmm_pfn);
+		sg_set_page(sgl, page, size << PAGE_SHIFT, 0);
+		if (unlikely(j == st->nents - 1))
+			sg_mark_end(sgl);
+		i += size;
   	}
+	xe_assert(xe, i == xe_npages_in_range(range->start, range->end));
-free_pages:
-	kvfree(pages);
-	return ret;
+	return dma_map_sgtable(dev, st, write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE,
+			       DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
   }
  /**
@@ -235,16 +261,45 @@ int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma,
   	if (ret)
   		goto free_pfns;
-	ret = xe_build_sg(vm->xe, &hmm_range, &userptr->sgt, write);
+	if (unlikely(userptr->sg)) {
+		ret = down_write_killable(&vm->userptr.notifier_lock);
+		if (ret)
+			goto free_pfns;
+
+		xe_hmm_userptr_free_sg(uvma);
+		up_write(&vm->userptr.notifier_lock);
+	}
+
+	ret = xe_alloc_sg(&userptr->sgt, &hmm_range, &vm->userptr.notifier_lock);
   	if (ret)
   		goto free_pfns;
+	ret = down_read_interruptible(&vm->userptr.notifier_lock);
+	if (ret)
+		goto free_st;
+
+	if (mmu_interval_read_retry(hmm_range.notifier, hmm_range.notifier_seq)) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
+	ret = xe_build_sg(vm->xe, &hmm_range, &userptr->sgt,
+			  &vm->userptr.notifier_lock, write);
+	if (ret)
+		goto out_unlock;
+
   	xe_mark_range_accessed(&hmm_range, write);
   	userptr->sg = &userptr->sgt;
   	userptr->notifier_seq = hmm_range.notifier_seq;
+	up_read(&vm->userptr.notifier_lock);
+	kvfree(pfns);
+	return 0;
+out_unlock:
+	up_read(&vm->userptr.notifier_lock);
+free_st:
+	sg_free_table(&userptr->sgt);
   free_pfns:
   	kvfree(pfns);
   	return ret;
   }
-






