Re: [PATCH] mm/gup: restore the ability to pin more than 2GB at a time

David Hildenbrand <david@xxxxxxxxxx> · Wed, 30 Oct 2024 10:01:37 +0100

On 30.10.24 09:34, David Hildenbrand wrote:
On 30.10.24 07:50, John Hubbard wrote:
On 10/29/24 11:18 PM, Alistair Popple wrote:
John Hubbard <jhubbard@xxxxxxxxxx> writes:
On 10/29/24 9:42 PM, Christoph Hellwig wrote:
On Tue, Oct 29, 2024 at 09:39:15PM -0700, John Hubbard wrote:
...
Because pinning down these amounts of memoryt is completely insane.
I don't mind the switch to kvmalloc, but we need to put in an upper
bound of what can be pinned.

I'm wondering though, how it is that we decide how much of the user's
system we prevent them from using? :)  People with hardware accelerators
do not always have page fault capability, and yet these troublesome
users insist on stacking their system full of DRAM and then pointing
the accelerator to it.

How would we choose a value? Memory sizes keep going up...

The obvious answer is you let users decide. I did have a patch series to
do that via a cgroup[1]. However I dropped that series mostly because I
couldn't find any users of such a limit to provide feedback on how they
would use it or how they wanted it to work.


Trawling through the discussion there, I see that Jason Gunthorpe mentioned:

"Things like VFIO & KVM use cases effectively pin 90% of all system memory"

The unusual thing is not the amount of system memory we are pinning but
*how many* pages we try pinning in the single call.

If you stare at vfio_pin_pages_remote, we seem to be batching it.

long req_pages = min_t(long, npage, batch->capacity);

Which is

#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))


So you can fix this in your driver ;)


We should maybe try a similar limit internally: if you call
pin_user_pages_remote() with a large number, we'll cap it at some magic
value (similar to above). The caller will simply realize that not all
pages were pinned and will retry.

See get_user_pages_remote(): "Returns either number of pages pinned
(which may be less than the number requested), or an error. Details
about the return value:"


Alternatively, I recall there was a way to avoid the temporary
allocation ... let me hack up a prototype real quick.

Completely untested (also note the interesting TODO):


From a23984b4f1a39ec984489fbe16708aedf4f9db95 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@xxxxxxxxxx>
Date: Wed, 30 Oct 2024 10:00:50 +0100
Subject: [PATCH] tmp

Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
---
 mm/gup.c | 120 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 78 insertions(+), 42 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index a82890b46a36..8807b36c2363 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2273,20 +2273,56 @@ struct page *get_dump_page(unsigned long addr)
 #endif /* CONFIG_ELF_CORE */
 
 #ifdef CONFIG_MIGRATION
+
+/*
+ * An array of either pages or folios. While we could currently interpret a
+ * list of folios like a list of pages, it would only work as long as
+ * "struct folio" overlays "struct page" -- and it would not allow for
+ * avoiding page_folio() calls.
+ */
+struct pages_or_folios {
+	union {
+		struct page **pages;
+		struct folio **folios;
+		void **entries;
+	};
+	bool has_folios;
+	long nr_entries;
+};
+
+static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
+{
+	if (pofs->has_folios)
+		return pofs->folios[i];
+	return page_folio(pofs->pages[i]);
+}
+
+static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
+{
+	pofs->entries[i] = NULL;
+}
+
+static void pofs_unpin(struct pages_or_folios *pofs)
+{
+	if (pofs->has_folios)
+		unpin_folios(pofs->folios, pofs->nr_entries);
+	else
+		unpin_user_pages(pofs->pages, pofs->nr_entries);
+}
+
 /*
  * Returns the number of collected folios. Return value is always >= 0.
  */
 static unsigned long collect_longterm_unpinnable_folios(
-					struct list_head *movable_folio_list,
-					unsigned long nr_folios,
-					struct folio **folios)
+		struct list_head *movable_folio_list,
+		struct pages_or_folios *pofs)
 {
 	unsigned long i, collected = 0;
 	struct folio *prev_folio = NULL;
 	bool drain_allow = true;
 
-	for (i = 0; i < nr_folios; i++) {
-		struct folio *folio = folios[i];
+	for (i = 0; i < pofs->nr_entries; i++) {
+		struct folio *folio = pofs_get_folio(pofs, i);
 
 		if (folio == prev_folio)
 			continue;
@@ -2310,6 +2346,11 @@ static unsigned long collect_longterm_unpinnable_folios(
 			drain_allow = false;
 		}
 
+		/*
+		 * TODO: if isolation fails we might already have it in the
+		 * list, if pages of different folios are interleaved
+		 * (e.g., COW). We might want to check all entries in the list.
+		 */
 		if (!folio_isolate_lru(folio))
 			continue;
 
@@ -2328,15 +2369,14 @@ static unsigned long collect_longterm_unpinnable_folios(
  * failure (or partial success).
  */
 static int migrate_longterm_unpinnable_folios(
-					struct list_head *movable_folio_list,
-					unsigned long nr_folios,
-					struct folio **folios)
+		struct list_head *movable_folio_list,
+		struct pages_or_folios *pofs)
 {
 	int ret;
 	unsigned long i;
 
-	for (i = 0; i < nr_folios; i++) {
-		struct folio *folio = folios[i];
+	for (i = 0; i < pofs->nr_entries; i++) {
+		struct folio *folio = pofs_get_folio(pofs, i);
 
 		if (folio_is_device_coherent(folio)) {
 			/*
@@ -2344,7 +2384,7 @@ static int migrate_longterm_unpinnable_folios(
 			 * convert the pin on the source folio to a normal
 			 * reference.
 			 */
-			folios[i] = NULL;
+			pofs_clear_entry(pofs, i);
 			folio_get(folio);
 			gup_put_folio(folio, 1, FOLL_PIN);
 
@@ -2363,8 +2403,8 @@ static int migrate_longterm_unpinnable_folios(
 		 * calling folio_isolate_lru() which takes a reference so the
 		 * folio won't be freed if it's migrating.
 		 */
-		unpin_folio(folios[i]);
-		folios[i] = NULL;
+		unpin_folio(pofs_get_folio(pofs, i));
+		pofs_clear_entry(pofs, i);
 	}
 
 	if (!list_empty(movable_folio_list)) {
@@ -2387,12 +2427,24 @@ static int migrate_longterm_unpinnable_folios(
 	return -EAGAIN;
 
 err:
-	unpin_folios(folios, nr_folios);
+	pofs_unpin(pofs);
 	putback_movable_pages(movable_folio_list);
 
 	return ret;
 }
 
+static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
+{
+	LIST_HEAD(movable_folio_list);
+	unsigned long collected;
+
+	collected = collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
+	if (!collected)
+		return 0;
+
+	return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
+}
+
 /*
  * Check whether all folios are *allowed* to be pinned indefinitely (longterm).
  * Rather confusingly, all folios in the range are required to be pinned via
@@ -2412,41 +2464,25 @@ static int migrate_longterm_unpinnable_folios(
 static long check_and_migrate_movable_folios(unsigned long nr_folios,
 					     struct folio **folios)
 {
-	unsigned long collected;
-	LIST_HEAD(movable_folio_list);
-
-	collected = collect_longterm_unpinnable_folios(&movable_folio_list,
-						       nr_folios, folios);
-	if (!collected)
-		return 0;
+	struct pages_or_folios pofs = {
+		.folios = folios,
+		.has_folios = true,
+		.nr_entries = nr_folios,
+	};
 
-	return migrate_longterm_unpinnable_folios(&movable_folio_list,
-						  nr_folios, folios);
+	return check_and_migrate_movable_pages_or_folios(&pofs);
 }
 
-/*
- * This routine just converts all the pages in the @pages array to folios and
- * calls check_and_migrate_movable_folios() to do the heavy lifting.
- *
- * Please see the check_and_migrate_movable_folios() documentation for details.
- */
+/* See check_and_migrate_movable_folios(). */
 static long check_and_migrate_movable_pages(unsigned long nr_pages,
 					    struct page **pages)
 {
-	struct folio **folios;
-	long i, ret;
+	struct pages_or_folios pofs = {
+		.pages = pages,
+		.nr_entries = nr_pages,
+	};
 
-	folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
-	if (!folios)
-		return -ENOMEM;
-
-	for (i = 0; i < nr_pages; i++)
-		folios[i] = page_folio(pages[i]);
-
-	ret = check_and_migrate_movable_folios(nr_pages, folios);
-
-	kfree(folios);
-	return ret;
+	return check_and_migrate_movable_pages_or_folios(&pofs);
 }
 #else
 static long check_and_migrate_movable_pages(unsigned long nr_pages,
--
2.47.0


--
Cheers,

David / dhildenb