On Sat, Oct 29, 2022 at 11:36 AM Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote: > > Anyway, I think the best documentation for "this is what I meant" is > simply the patch. Does this affect your PoC on your setup? Here's a slightly cleaned up set with preliminary commit messages, and an explanation for why some of the 'struct page' declarations were moved around a bit in case you wondered about that part of the change in the full patch. The end result should be the same, so if you already looked at the previous unified patch, never mind. But this one tries to make for a better patch series. Still not tested in any way, shape, or form. I decided I wanted to send this one before booting into this and possibly blowing up ;^) Linus
From 8caca6a93ebe3b0e4adabfb1b8d13e86d41fd329 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Date: Sat, 29 Oct 2022 10:42:25 -0700 Subject: [PATCH 1/2] mm: zap_page_range: narrow down 'page' variable scope We're using the same 'struct page *page' variable for three very distinct cases. That works and the compiler does the right thing, but I'm about to add some page-related attributes that only affects one of them, so let's make the whole "these are really different uses" explicit. Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index f88c351aecd4..d52f5a68c561 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1423,7 +1423,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; - struct page *page; if (pte_none(ptent)) continue; @@ -1432,7 +1431,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, break; if (pte_present(ptent)) { - page = vm_normal_page(vma, addr, ptent); + struct page *page = vm_normal_page(vma, addr, ptent); if (unlikely(!should_zap_page(details, page))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -1467,7 +1466,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); if (unlikely(!should_zap_page(details, page))) continue; /* @@ -1489,7 +1488,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); if (!should_zap_page(details, page)) continue; rss[mm_counter(page)]--; -- 
2.37.1.289.g45aa1e5c72.dirty
From 86d1a3807c013abca72086278d9308e398e7b41d Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Date: Sat, 29 Oct 2022 11:45:07 -0700 Subject: [PATCH 2/2] mm: make sure to flush TLB before marking page dirty When we remove a page table entry, we are very careful to only free the page after we have flushed the TLB, because other CPUs could still be using the page through stale TLB entries until after the flush. However, we mark the underlying page dirty immediately, and then remove the rmap entry for the page, which means that (a) another CPU could come in and clean it, never seeing our mapping of the page (b) yet another CPU could continue to use the stale and dirty TLB entry and continue to write to said page resulting in a page that has been dirtied, but then marked clean again, all while another CPU might have dirtied it some more. End result: possibly lost dirty data. This commit uses the same old TLB gather array that we use to delay the freeing of the page to also keep the dirty state of the page table entry, so that the 'set_page_dirty()' from the page table can be done after the TLB flush, closing the race. 
Reported-by: Nadav Amit <nadav.amit@xxxxxxxxx> Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> --- include/asm-generic/tlb.h | 28 +++++++++++++++++++++++----- mm/memory.c | 10 +++++----- mm/mmu_gather.c | 36 ++++++++++++++++++++++++++++++++---- 3 files changed, 60 insertions(+), 14 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 492dce43236e..a95085f6dd47 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -238,11 +238,29 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table); */ #define MMU_GATHER_BUNDLE 8 +/* Fake type for an encoded page pointer with the dirty bit in the low bit */ +struct encoded_page; + +static inline struct encoded_page *encode_page(struct page *page, bool dirty) +{ + return (struct encoded_page *)(dirty | (unsigned long)page); +} + +static inline bool encoded_page_dirty(struct encoded_page *page) +{ + return 1 & (unsigned long)page; +} + +static inline struct page *encoded_page_ptr(struct encoded_page *page) +{ + return (struct page *)(~1ul & (unsigned long)page); +} + struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; - struct page *pages[]; + struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ @@ -257,7 +275,7 @@ struct mmu_gather_batch { #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - int page_size); + int page_size, bool dirty); #endif /* @@ -431,13 +449,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, page_size)) + if (__tlb_remove_page_size(tlb, page, page_size, false)) tlb_flush_mmu(tlb); } -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page 
*page, bool dirty) { - return __tlb_remove_page_size(tlb, page, PAGE_SIZE); + return __tlb_remove_page_size(tlb, page, PAGE_SIZE, dirty); } /* tlb_remove_page diff --git a/mm/memory.c b/mm/memory.c index d52f5a68c561..8ab4c0d7e99e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1432,6 +1432,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_present(ptent)) { struct page *page = vm_normal_page(vma, addr, ptent); + int dirty; + if (unlikely(!should_zap_page(details, page))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -1442,11 +1444,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(!page)) continue; + dirty = 0; if (!PageAnon(page)) { - if (pte_dirty(ptent)) { - force_flush = 1; - set_page_dirty(page); - } + dirty = pte_dirty(ptent); if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); @@ -1455,7 +1455,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - if (unlikely(__tlb_remove_page(tlb, page))) { + if (unlikely(__tlb_remove_page(tlb, page, dirty))) { force_flush = 1; addr += PAGE_SIZE; break; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index add4244e5790..fa79e054413a 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -43,12 +43,40 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } +/* + * We get an 'encoded page' array, which has page pointers with + * the dirty bit in the low bit of the array. + * + * The TLB has been flushed, now we need to move the dirty bit into + * the 'struct page', clean the array in-place, and then free the + * pages and their swap cache. 
+ */ +static void clean_and_free_pages_and_swap_cache(struct encoded_page **pages, unsigned int nr) +{ + for (unsigned int i = 0; i < nr; i++) { + struct encoded_page *encoded = pages[i]; + if (encoded_page_dirty(encoded)) { + struct page *page = encoded_page_ptr(encoded); + /* Clean the dirty pointer in-place */ + pages[i] = encode_page(page, 0); + set_page_dirty(page); + } + } + + /* + * Now all entries have been un-encoded, and changed to plain + * page pointers, so we can cast the 'encoded_page' array to + * a plain page array and free them + */ + free_pages_and_swap_cache((struct page **)pages, nr); +} + static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct page **pages = batch->pages; + struct encoded_page **pages = batch->encoded_pages; do { /* @@ -56,7 +84,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) */ unsigned int nr = min(512U, batch->nr); - free_pages_and_swap_cache(pages, nr); + clean_and_free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; @@ -77,7 +105,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size, bool dirty) { struct mmu_gather_batch *batch; @@ -92,7 +120,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ * Add the page and check if we are full. If so * force a flush. */ - batch->pages[batch->nr++] = page; + batch->encoded_pages[batch->nr++] = encode_page(page, dirty); if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; -- 2.37.1.289.g45aa1e5c72.dirty