Re: [PATCH v2 11/13] mm: handling Non-LRU pages returned by vm_normal_pages

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 5/23/2022 7:02 AM, Alistair Popple wrote:
Technically I think this patch should be earlier in the series. As I
understand it patch 1 allows DEVICE_COHERENT pages to be inserted in the
page tables and therefore makes it possible for page table walkers to
see non-LRU pages.

Patch will reordered in V3.

Regards,
Alex Sierra


Some more comments below:

Alex Sierra <alex.sierra@xxxxxxx> writes:

With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
device-managed anonymous pages that are not LRU pages. Although they
behave like normal pages for purposes of mapping in CPU page, and for
COW. They do not support LRU lists, NUMA migration or THP.

We also introduced a FOLL_LRU flag that adds the same behaviour to
follow_page and related APIs, to allow callers to specify that they
expect to put pages on an LRU list.
This means by default GUP can return non-LRU pages. I didn't see
anywhere that would be a problem but I didn't check everything. Did you
check this or is there some other reason I've missed that makes this not
a problem?

I have double checked all gup and pin_user_pages callers and none of them seem to have interaction with LRU APIs.

Regards,
Alex Sierra


[...]

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a4e5eaf3eb01..eb3cfd679800 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
  			goto out;
  		}
  		page = vm_normal_page(vma, address, pteval);
-		if (unlikely(!page)) {
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
  			result = SCAN_PAGE_NULL;
  			goto out;
  		}
@@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
  			writable = true;

  		page = vm_normal_page(vma, _address, pteval);
-		if (unlikely(!page)) {
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
  			result = SCAN_PAGE_NULL;
  			goto out_unmap;
  		}
@@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
  			goto abort;

  		page = vm_normal_page(vma, addr, *pte);
-
+		if (page && is_zone_device_page(page))
+			page = NULL;
  		/*
  		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
  		 * page table, but the new page will not be a subpage of hpage.
@@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
  		if (pte_none(*pte))
  			continue;
  		page = vm_normal_page(vma, addr, *pte);
+		if (page && is_zone_device_page(page))
+			goto abort;
Are either of these two cases actually possible? DEVICE_COHERENT doesn't
currently support THP, so if I'm understanding correctly we couldn't
have a pte mapped DEVICE_COHERENT THP right? Assuming that's the case I
think WARN_ON_ONCE() would be better.

Correct, change included in V3 patch series.

Regards,
Alex


Otherwise I think everything else looks reasonable.

  		page_remove_rmap(page, vma, false);
  	}

diff --git a/mm/ksm.c b/mm/ksm.c
index 063a48eeb5ee..f16056efca21 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
  	do {
  		cond_resched();
  		page = follow_page(vma, addr,
-				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
+				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
  		if (IS_ERR_OR_NULL(page))
  			break;
  		if (PageKsm(page))
@@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
  	if (!vma)
  		goto out;

-	page = follow_page(vma, addr, FOLL_GET);
+	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
  	if (IS_ERR_OR_NULL(page))
  		goto out;
  	if (PageAnon(page)) {
@@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
  		while (ksm_scan.address < vma->vm_end) {
  			if (ksm_test_exit(mm))
  				break;
-			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
+			*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
  			if (IS_ERR_OR_NULL(*page)) {
  				ksm_scan.address += PAGE_SIZE;
  				cond_resched();
diff --git a/mm/madvise.c b/mm/madvise.c
index 1873616a37d2..e9c24c834e98 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
  			continue;

  		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
+		if (!page || is_zone_device_page(page))
  			continue;

  		/*
@@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
  		}

  		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
+		if (!page || is_zone_device_page(page))
  			continue;

  		/*
diff --git a/mm/memory.c b/mm/memory.c
index 76e3af9639d9..571a26805ee1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
  		if (is_zero_pfn(pfn))
  			return NULL;
  		if (pte_devmap(pte))
+/*
+ * NOTE: New uers of ZONE_DEVICE will not set pte_devmap() and will have
+ * refcounts incremented on their struct pages when they are inserted into
+ * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
+ * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
+ * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+ */
  			return NULL;

  		print_bad_pte(vma, addr, pte, NULL);
@@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
  	pte = pte_modify(old_pte, vma->vm_page_prot);

  	page = vm_normal_page(vma, vmf->address, pte);
-	if (!page)
+	if (!page || is_zone_device_page(page))
  		goto out_map;

  	/* TODO: handle PTE-mapped THP */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8c74107a2b15..e32edbecb0cd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
  		if (!pte_present(*pte))
  			continue;
  		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		if (!page || is_zone_device_page(page))
  			continue;
  		/*
  		 * vm_normal_page() filters out zero pages, but there might
diff --git a/mm/migrate.c b/mm/migrate.c
index 6c31ee1e1c9b..c5d50e96ecd7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
  		goto out;

  	/* FOLL_DUMP to ignore special (like zero) pages */
-	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);

  	err = PTR_ERR(page);
  	if (IS_ERR(page))
@@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
  			goto set_status;

  		/* FOLL_DUMP to ignore special (like zero) pages */
-		page = follow_page(vma, addr, FOLL_DUMP);
+		page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);

  		err = PTR_ERR(page);
  		if (IS_ERR(page))
diff --git a/mm/mlock.c b/mm/mlock.c
index 716caf851043..b14e929084cc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
  		if (!pte_present(*pte))
  			continue;
  		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		if (!page || is_zone_device_page(page))
  			continue;
  		if (PageTransCompound(page))
  			continue;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b69ce7a7b2b7..a6f3587ea29a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
  					continue;

  				page = vm_normal_page(vma, addr, oldpte);
-				if (!page || PageKsm(page))
+				if (!page || is_zone_device_page(page) || PageKsm(page))
  					continue;

  				/* Also skip shared copy-on-write pages */



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux