Signed-off-by: Alex Sierra <alex.sierra@xxxxxxx>
Acked-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
---
fs/proc/task_mmu.c | 2 +-
include/linux/mm.h | 3 ++-
mm/gup.c | 2 ++
mm/huge_memory.c | 2 +-
mm/khugepaged.c | 9 ++++++---
mm/ksm.c | 6 +++---
mm/madvise.c | 4 ++--
mm/memory.c | 9 ++++++++-
mm/mempolicy.c | 2 +-
mm/migrate.c | 4 ++--
mm/mlock.c | 2 +-
mm/mprotect.c | 2 +-
12 files changed, 30 insertions(+), 17 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f46060eb91b5..5d620733f173 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1785,7 +1785,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return NULL;
page = vm_normal_page(vma, addr, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
return NULL;
if (PageReserved(page))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9f44254af8ce..d7f253a0c41e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -601,7 +601,7 @@ struct vm_operations_struct {
#endif
/*
* Called by vm_normal_page() for special PTEs to find the
- * page for @addr. This is useful if the default behavior
+ * page for @addr. This is useful if the default behavior
* (using pte_page()) would not find the correct page.
*/
struct page *(*find_special_page)(struct vm_area_struct *vma,
@@ -2929,6 +2929,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
+#define FOLL_LRU 0x1000 /* return only LRU (anon or page cache) */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
diff --git a/mm/gup.c b/mm/gup.c
index 501bc150792c..c9cbac06bcc5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -479,6 +479,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
page = vm_normal_page(vma, address, pte);
+ if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
+ page = NULL;
if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 910a138e9859..eed80696c5fd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2856,7 +2856,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
}
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
if (IS_ERR(page))
continue;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a4e5eaf3eb01..8bf4126b6b9c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page)) {
+ if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
@@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page)) {
+ if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
@@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
goto abort;
page = vm_normal_page(vma, addr, *pte);
-
+ if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ page = NULL;
/*
* Note that uprobe, debugger, or MAP_PRIVATE may change the
* page table, but the new page will not be a subpage of hpage.
@@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
+ if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ goto abort;
page_remove_rmap(page, vma, false);
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 063a48eeb5ee..f16056efca21 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
page = follow_page(vma, addr,
- FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
+ FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
@@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
if (!vma)
goto out;
- page = follow_page(vma, addr, FOLL_GET);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
if (IS_ERR_OR_NULL(page))
goto out;
if (PageAnon(page)) {
@@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
while (ksm_scan.address < vma->vm_end) {
if (ksm_test_exit(mm))
break;
- *page = follow_page(vma, ksm_scan.address, FOLL_GET);
+ *page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
if (IS_ERR_OR_NULL(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
diff --git a/mm/madvise.c b/mm/madvise.c
index 1873616a37d2..e9c24c834e98 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
continue;
page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
@@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
diff --git a/mm/memory.c b/mm/memory.c
index 76e3af9639d9..571a26805ee1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (is_zero_pfn(pfn))
return NULL;
if (pte_devmap(pte))
+/*
+ * NOTE: New uers of ZONE_DEVICE will not set pte_devmap() and will have
+ * refcounts incremented on their struct pages when they are inserted into
+ * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
+ * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
+ * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+ */
return NULL;
print_bad_pte(vma, addr, pte, NULL);
@@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
pte = pte_modify(old_pte, vma->vm_page_prot);
page = vm_normal_page(vma, vmf->address, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
goto out_map;
/* TODO: handle PTE-mapped THP */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8c74107a2b15..e32edbecb0cd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
* vm_normal_page() filters out zero pages, but there might
diff --git a/mm/migrate.c b/mm/migrate.c
index 6c31ee1e1c9b..c5d50e96ecd7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
goto out;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
err = PTR_ERR(page);
if (IS_ERR(page))
diff --git a/mm/mlock.c b/mm/mlock.c
index 716caf851043..b14e929084cc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
if (PageTransCompound(page))
continue;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b69ce7a7b2b7..a6f3587ea29a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
continue;
page = vm_normal_page(vma, addr, oldpte);
- if (!page || PageKsm(page))
+ if (!page || is_zone_device_page(page) || PageKsm(page))
continue;
/* Also skip shared copy-on-write pages */