Re: [PATCH v7 2/7] mm: swap: free_swap_and_cache_nr() as batched free_swap_and_cache()

David Hildenbrand <david@xxxxxxxxxx> · Tue, 9 Apr 2024 11:24:53 +0200

On 09.04.24 11:22, Barry Song wrote:
On Tue, Apr 9, 2024 at 8:51 PM Barry Song <21cnbao@xxxxxxxxx> wrote:

On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@xxxxxxx> wrote:

Now that we no longer have a convenient flag in the cluster to determine
if a folio is large, free_swap_and_cache() will take a reference and
lock a large folio much more often, which could lead to contention and
(e.g.) failure to split large folios, etc.

Let's solve that problem by batch freeing swap and cache with a new
function, free_swap_and_cache_nr(), to free a contiguous range of swap
entries together. This allows us to first drop a reference to each swap
slot before we try to release the cache folio. This means we only try to
release the folio once, only taking the reference and lock once - much
better than the previous 512 times for the 2M THP case.

Contiguous swap entries are gathered in zap_pte_range() and
madvise_free_pte_range() in a similar way to how present ptes are
already gathered in zap_pte_range().

While we are at it, let's simplify by converting the return type of both
functions to void. The return value was used only by zap_pte_range() to
print a bad pte, and was ignored by everyone else, so the extra
reporting wasn't exactly guaranteed. We will still get the warning with
most of the information from get_swap_device(). With the batch version,
we wouldn't know which pte was bad anyway so could print the wrong one.

Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
---
  include/linux/pgtable.h | 29 ++++++++++++
  include/linux/swap.h    | 12 +++--
  mm/internal.h           | 63 ++++++++++++++++++++++++++
  mm/madvise.c            | 12 +++--
  mm/memory.c             | 13 +++---
  mm/swapfile.c           | 97 +++++++++++++++++++++++++++++++++--------
  6 files changed, 195 insertions(+), 31 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a3fc8150b047..75096025fe52 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -708,6 +708,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
  }
  #endif

+#ifndef clear_not_present_full_ptes
+/**
+ * clear_not_present_full_ptes - Clear multiple not present PTEs which are
+ *                              consecutive in the pgtable.
+ * @mm: Address space the ptes represent.
+ * @addr: Address of the first pte.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over pte_clear_not_present_full().
+ *
+ * Context: The caller holds the page table lock.  The PTEs are all not present.
+ * The PTEs are all in the same PMD.
+ */
+static inline void clear_not_present_full_ptes(struct mm_struct *mm,
+               unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+       for (;;) {
+               pte_clear_not_present_full(mm, addr, ptep, full);
+               if (--nr == 0)
+                       break;
+               ptep++;
+               addr += PAGE_SIZE;
+       }
+}
+#endif
+
  #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
  extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
                               unsigned long address,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f6f78198f000..5737236dc3ce 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -471,7 +471,7 @@ extern int swap_duplicate(swp_entry_t);
  extern int swapcache_prepare(swp_entry_t);
  extern void swap_free(swp_entry_t);
  extern void swapcache_free_entries(swp_entry_t *entries, int n);
-extern int free_swap_and_cache(swp_entry_t);
+extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
  int swap_type_of(dev_t device, sector_t offset);
  int find_first_swap(dev_t *device);
  extern unsigned int count_swap_pages(int, int);
@@ -520,8 +520,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
  #define free_pages_and_swap_cache(pages, nr) \
         release_pages((pages), (nr));

-/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
-#define free_swap_and_cache(e) is_pfn_swap_entry(e)
+static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+{
+}

  static inline void free_swap_cache(struct folio *folio)
  {
@@ -589,6 +590,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
  }
  #endif /* CONFIG_SWAP */

+static inline void free_swap_and_cache(swp_entry_t entry)
+{
+       free_swap_and_cache_nr(entry, 1);
+}
+
  #ifdef CONFIG_MEMCG
  static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
  {
diff --git a/mm/internal.h b/mm/internal.h
index 3bdc8693b54f..de68705624b0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,8 @@
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
  #include <linux/tracepoint-defs.h>

  struct folio_batch;
@@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,

         return min(ptep - start_ptep, max_nr);
  }
+
+/**
+ * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
+ * @pte: The initial pte state; is_swap_pte(pte) must be true.
+ *
+ * Increments the swap offset, while maintaining all other fields, including
+ * swap type, and any swp pte bits. The resulting pte is returned.
+ */
+static inline pte_t pte_next_swp_offset(pte_t pte)
+{
+       swp_entry_t entry = pte_to_swp_entry(pte);
+       pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
+                                                  swp_offset(entry) + 1));
+
+       if (pte_swp_soft_dirty(pte))
+               new = pte_swp_mksoft_dirty(new);
+       if (pte_swp_exclusive(pte))
+               new = pte_swp_mkexclusive(new);
+       if (pte_swp_uffd_wp(pte))
+               new = pte_swp_mkuffd_wp(new);

I don't quite understand this. If this page table entry is exclusive,
will its subsequent page table entry also be exclusive without
question?
In try_to_unmap_one(), exclusive is per-subpage, not per-folio:

                 anon_exclusive = folio_test_anon(folio) &&
                                  PageAnonExclusive(subpage);

The same question also applies to dirty, wp, etc.

Sorry for the noise. You are right. Based on your new version, I think I should
entirely drop:

[PATCH v2 3/5] mm: swap_pte_batch: add an output argument to reture if
all swap entries are exclusive

Yes. If we ever want to ignore some bits, we should likely add flags to 
change the behavior, like for folio_pte_batch().

For swapin, you really want the exclusive bits to match, though.
Softdirty and uffd-wp as well, at least initially, for simplicity.

--
Cheers,

David / dhildenb