On Sat, Oct 29, 2022 at 11:36 AM Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote: > > Anyway, I think the best documentation for "this is what I meant" is > simply the patch. Does this affect your PoC on your setup? Here's a slightly cleaned up set with preliminary commit messages, and an explanation for why some of the 'struct page' declarations were moved around a bit in case you wondered about that part of the change in the full patch. The end result should be the same, so if you already looked at the previous unified patch, never mind. But this one tries to make for a better patch series. Still not tested in any way, shape, or form. I decided I wanted to send this one before booting into this and possibly blowing up ;^) Linus
From 8caca6a93ebe3b0e4adabfb1b8d13e86d41fd329 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Date: Sat, 29 Oct 2022 10:42:25 -0700 Subject: [PATCH 1/2] mm: zap_page_range: narrow down 'page' variable scope We're using the same 'struct page *page' variable for three very distinct cases. That works and the compiler does the right thing, but I'm about to add some page-related attributes that only affects one of them, so let's make the whole "these are really different uses" explicit. Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index f88c351aecd4..d52f5a68c561 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1423,7 +1423,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; - struct page *page; if (pte_none(ptent)) continue; @@ -1432,7 +1431,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, break; if (pte_present(ptent)) { - page = vm_normal_page(vma, addr, ptent); + struct page *page = vm_normal_page(vma, addr, ptent); if (unlikely(!should_zap_page(details, page))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -1467,7 +1466,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); if (unlikely(!should_zap_page(details, page))) continue; /* @@ -1489,7 +1488,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); if (!should_zap_page(details, page)) continue; rss[mm_counter(page)]--; -- 
2.37.1.289.g45aa1e5c72.dirty
From 86d1a3807c013abca72086278d9308e398e7b41d Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Date: Sat, 29 Oct 2022 11:45:07 -0700 Subject: [PATCH 2/2] mm: make sure to flush TLB before marking page dirty When we remove a page table entry, we are very careful to only free the page after we have flushed the TLB, because other CPUs could still be using the page through stale TLB entries until after the flush. However, we mark the underlying page dirty immediately, and then remove the rmap entry for the page, which means that (a) another CPU could come in and clean it, never seeing our mapping of the page (b) yet another CPU could continue to use the stale and dirty TLB entry and continue to write to said page resulting in a page that has been dirtied, but then marked clean again, all while another CPU might have dirtied it some more. End result: possibly lost dirty data. This commit uses the same old TLB gather array that we use to delay the freeing of the page to also keep the dirty state of the page table entry, so that the 'set_page_dirty()' from the page table can be done after the TLB flush, closing the race. 
Reported-by: Nadav Amit <nadav.amit@xxxxxxxxx> Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> --- include/asm-generic/tlb.h | 28 +++++++++++++++++++++++----- mm/memory.c | 10 +++++----- mm/mmu_gather.c | 36 ++++++++++++++++++++++++++++++++---- 3 files changed, 60 insertions(+), 14 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 492dce43236e..a95085f6dd47 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -238,11 +238,29 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table); */ #define MMU_GATHER_BUNDLE 8 +/* Fake type for an encoded page pointer with the dirty bit in the low bit */ +struct encoded_page; + +static inline struct encoded_page *encode_page(struct page *page, bool dirty) +{ + return (struct encoded_page *)(dirty | (unsigned long)page); +} + +static inline bool encoded_page_dirty(struct encoded_page *page) +{ + return 1 & (unsigned long)page; +} + +static inline struct page *encoded_page_ptr(struct encoded_page *page) +{ + return (struct page *)(~1ul & (unsigned long)page); +} + struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; - struct page *pages[]; + struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ @@ -257,7 +275,7 @@ struct mmu_gather_batch { #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - int page_size); + int page_size, bool dirty); #endif /* @@ -431,13 +449,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, page_size)) + if (__tlb_remove_page_size(tlb, page, page_size, false)) tlb_flush_mmu(tlb); } -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page 
*page, bool dirty) { - return __tlb_remove_page_size(tlb, page, PAGE_SIZE); + return __tlb_remove_page_size(tlb, page, PAGE_SIZE, dirty); } /* tlb_remove_page diff --git a/mm/memory.c b/mm/memory.c index d52f5a68c561..8ab4c0d7e99e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1432,6 +1432,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_present(ptent)) { struct page *page = vm_normal_page(vma, addr, ptent); + int dirty; + if (unlikely(!should_zap_page(details, page))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -1442,11 +1444,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(!page)) continue; + dirty = 0; if (!PageAnon(page)) { - if (pte_dirty(ptent)) { - force_flush = 1; - set_page_dirty(page); - } + dirty = pte_dirty(ptent); if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); @@ -1455,7 +1455,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - if (unlikely(__tlb_remove_page(tlb, page))) { + if (unlikely(__tlb_remove_page(tlb, page, dirty))) { force_flush = 1; addr += PAGE_SIZE; break; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index add4244e5790..fa79e054413a 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -43,12 +43,40 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } +/* + * We get an 'encoded page' array, which has page pointers with + * the dirty bit in the low bit of the array. + * + * The TLB has been flushed, now we need to move the dirty bit into + * the 'struct page', clean the array in-place, and then free the + * pages and their swap cache. 
+ */ +static void clean_and_free_pages_and_swap_cache(struct encoded_page **pages, unsigned int nr) +{ + for (unsigned int i = 0; i < nr; i++) { + struct encoded_page *encoded = pages[i]; + if (encoded_page_dirty(encoded)) { + struct page *page = encoded_page_ptr(encoded); + /* Clean the dirty pointer in-place */ + pages[i] = encode_page(page, 0); + set_page_dirty(page); + } + } + + /* + * Now all entries have been un-encoded, and changed to plain + * page pointers, so we can cast the 'encoded_page' array to + * a plain page array and free them + */ + free_pages_and_swap_cache((struct page **)pages, nr); +} + static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct page **pages = batch->pages; + struct encoded_page **pages = batch->encoded_pages; do { /* @@ -56,7 +84,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) */ unsigned int nr = min(512U, batch->nr); - free_pages_and_swap_cache(pages, nr); + clean_and_free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; @@ -77,7 +105,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size, bool dirty) { struct mmu_gather_batch *batch; @@ -92,7 +120,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ * Add the page and check if we are full. If so * force a flush. */ - batch->pages[batch->nr++] = page; + batch->encoded_pages[batch->nr++] = encode_page(page, dirty); if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; -- 2.37.1.289.g45aa1e5c72.dirty