[Chapter Two] THP shattering: the reverse of collapsing

In contrast to split, shatter migrates the occupied pages of a
partially mapped THP to a set of base folios. IOW, unlike split, which
is done in place, shatter is the exact opposite of collapse.
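
For reference, here is a rough sketch of what the new path in
split_huge_page_to_list() does for a folio that cannot be split in
place (pseudocode only: locking, error handling and the page cache /
swap cache bookkeeping are elided, and folio_can_split() is assumed to
be provided by an earlier patch in this series):

        if (!folio_can_split(folio)) {
                prep_to_unmap(folio);      /* allocate the per-subpage dst array */
                unmap_folio(folio);        /* migration entries record whether each
                                              subpage was mapped and/or mlocked */
                folio_ref_freeze(folio, 1 + extra_pins);
                prep_dst_pages(folio);     /* discard clean anon subpages, copy the
                                              rest into newly allocated base pages */
                __split_huge_page(page, list, end);
                                           /* publish the copies; the source THP
                                              stays compound */
                reset_src_folio(folio);
        }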

The advantage of shattering is that it keeps the original THP intact.
The cost of copying during the migration is not a side effect but by
design, since splitting is considered discouraged behavior. In retail
terms, a returned purchase incurs a restocking fee, and the original
goods can be resold.

THPs from ZONE_NOMERGE can only be shattered, since they cannot be
split or merged. THPs from ZONE_NOSPLIT can be shattered or split (the
latter requires [1]), if they are above the minimum order.

[1] https://lore.kernel.org/20240226205534.1603748-1-zi.yan@xxxxxxxx/
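
Note for callers: when a folio is shattered rather than split,
split_huge_page_to_list() returns 1 instead of 0, so
split_folio()/split_huge_page() now have three outcomes, which the
madvise, shmem, truncate and userfaultfd hunks below switch over. An
illustrative caller-side pattern (not lifted verbatim from any hunk):

        err = split_folio(folio);
        if (err < 0)
                return err;     /* neither split nor shattered */
        if (!err)
                ...             /* split in place: former tail pages can be
                                   operated on directly */
        else
                ...             /* shattered: contents migrated to new base
                                   folios, so re-walk the range, as the
                                   madvise_*_pte_range() hunks do via
                                   "goto restart" */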

Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
---
 include/linux/memcontrol.h    |   5 +
 include/linux/mm_inline.h     |  24 +++
 include/linux/mm_types.h      |   8 +-
 include/linux/vm_event_item.h |   3 +
 mm/huge_memory.c              | 303 ++++++++++++++++++++++++++++------
 mm/internal.h                 |  38 +++++
 mm/madvise.c                  |  11 +-
 mm/memcontrol.c               |  47 ++++++
 mm/memory-failure.c           |   2 +-
 mm/migrate.c                  |  44 +++--
 mm/page_alloc.c               |   4 +
 mm/rmap.c                     |   4 +
 mm/shmem.c                    |   4 +-
 mm/truncate.c                 |   6 +-
 mm/userfaultfd.c              |   2 +-
 mm/vmscan.c                   |   9 +
 mm/vmstat.c                   |   3 +
 17 files changed, 443 insertions(+), 74 deletions(-)
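
A note on the bookkeeping used throughout the diff: for a folio that
cannot be split, prep_to_unmap() attaches an array of unsigned long to
the folio (the new _dst_pp/_dst_ul union in struct folio), one slot
per subpage. Each slot packs the destination page pointer with
low-order usage bits, which relies on struct page pointers being at
least 8-byte aligned. Roughly, matching the helpers added to
mm/internal.h:

        src->_dst_ul[i] |= SRC_PAGE_MAPPED;             /* set_src_usage()  */
        src->_dst_ul[i] |= (unsigned long)dst;          /* prep_dst_pages() */

        usage = src->_dst_ul[i] & SRC_PAGE_USAGE_MASK;  /* src_page_usage() */
        dst = (void *)(src->_dst_ul[i] & ~SRC_PAGE_USAGE_MASK);
                                                        /* folio_dst_page() */

For an anon folio not in the swap cache, subpages that were unmapped
or found to contain only zeros get no destination page at all:
__split_huge_page() skips them, remove_migration_pte() clears any
migration entries pointing at them, and THP_SHATTER_PAGE_DISCARDED
counts them.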

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 20ff87f8e001..435cf114c6e2 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1163,6 +1163,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 }
 
 void split_page_memcg(struct page *head, unsigned int nr);
+void folio_copy_memcg(struct folio *folio);
 
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
@@ -1624,6 +1625,10 @@ static inline void split_page_memcg(struct page *head, unsigned int nr)
 {
 }
 
+static inline void folio_copy_memcg(struct folio *folio)
+{
+}
+
 static inline
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 					    gfp_t gfp_mask,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f4fe593c1400..aa96d6ed0223 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -218,6 +218,25 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
 	VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
 }
 
+static inline bool lru_gen_add_dst(struct lruvec *lruvec, struct folio *dst)
+{
+	int gen = folio_lru_gen(dst);
+	int type = folio_is_file_lru(dst);
+	int zone = folio_zonenum(dst);
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+	if (gen < 0)
+		return false;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	VM_WARN_ON_ONCE_FOLIO(folio_lruvec(dst) != lruvec, dst);
+
+	list_add_tail(&dst->lru, &lrugen->folios[gen][type][zone]);
+	lru_gen_update_size(lruvec, dst, -1, gen);
+
+	return true;
+}
+
 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
 	unsigned long seq;
@@ -303,6 +322,11 @@ static inline bool lru_gen_in_fault(void)
 	return false;
 }
 
+static inline bool lru_gen_add_dst(struct lruvec *lruvec, struct folio *dst)
+{
+	return false;
+}
+
 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
 	return false;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..f483b273e80e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -323,14 +323,19 @@ struct folio {
 		struct {
 			unsigned long _flags_1;
 			unsigned long _head_1;
-			unsigned long _folio_avail;
 	/* public: */
 			atomic_t _entire_mapcount;
 			atomic_t _nr_pages_mapped;
 			atomic_t _pincount;
 #ifdef CONFIG_64BIT
+			unsigned int __padding;
 			unsigned int _folio_nr_pages;
 #endif
+			union {
+				unsigned long _private_1;
+				unsigned long *_dst_ul;
+				struct page **_dst_pp;
+			};
 	/* private: the union with struct page is transitional */
 		};
 		struct page __page_1;
@@ -382,6 +387,7 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid);
 			offsetof(struct page, pg) + sizeof(struct page))
 FOLIO_MATCH(flags, _flags_1);
 FOLIO_MATCH(compound_head, _head_1);
+FOLIO_MATCH(private, _private_1);
 #undef FOLIO_MATCH
 #define FOLIO_MATCH(pg, fl)						\
 	static_assert(offsetof(struct folio, fl) ==			\
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9a54d15d5ec3..027851c795bc 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -105,6 +105,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		THP_SPLIT_PAGE_FAILED,
 		THP_DEFERRED_SPLIT_PAGE,
 		THP_SPLIT_PMD,
+		THP_SHATTER_PAGE,
+		THP_SHATTER_PAGE_FAILED,
+		THP_SHATTER_PAGE_DISCARDED,
 		THP_SCAN_EXCEED_NONE_PTE,
 		THP_SCAN_EXCEED_SWAP_PTE,
 		THP_SCAN_EXCEED_SHARED_PTE,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b57faa0a1e83..62d2254bc51c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2586,6 +2586,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 				entry = pte_swp_mksoft_dirty(entry);
 			if (uffd_wp)
 				entry = pte_swp_mkuffd_wp(entry);
+			if (vma->vm_flags & VM_LOCKED)
+				set_src_usage(page + i, SRC_PAGE_MLOCKED);
+			else
+				set_src_usage(page + i, SRC_PAGE_MAPPED);
 		} else {
 			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
 			if (write)
@@ -2732,6 +2736,178 @@ static void remap_page(struct folio *folio, unsigned long nr)
 	}
 }
 
+static int prep_to_unmap(struct folio *src)
+{
+	int nr_pages = folio_nr_pages(src);
+
+	if (folio_can_split(src))
+		return 0;
+
+	WARN_ON_ONCE(src->_dst_pp);
+
+	src->_dst_pp = kcalloc(nr_pages, sizeof(struct page *), GFP_ATOMIC);
+
+	return src->_dst_pp ? 0 : -ENOMEM;
+}
+
+static bool try_to_discard(struct folio *src, int i)
+{
+	int usage;
+	void *addr;
+	struct page *page = folio_page(src, i);
+
+	if (!folio_test_anon(src))
+		return false;
+
+	if (folio_test_swapcache(src))
+		return false;
+
+	usage = src_page_usage(page);
+	if (usage & SRC_PAGE_MLOCKED)
+		return false;
+
+	if (!(usage & SRC_PAGE_MAPPED))
+		return true;
+
+	addr = kmap_local_page(page);
+	if (!memchr_inv(addr, 0, PAGE_SIZE))
+		set_src_usage(page, SRC_PAGE_CLEAN);
+	kunmap_local(addr);
+
+	return can_discard_src(page);
+}
+
+static int prep_dst_pages(struct folio *src)
+{
+	int i;
+	int nr_pages = folio_nr_pages(src);
+
+	if (folio_can_split(src))
+		return 0;
+
+	if (WARN_ON_ONCE(!src->_dst_pp))
+		return -ENOMEM;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *dst = NULL;
+
+		if (try_to_discard(src, i)) {
+			count_vm_event(THP_SHATTER_PAGE_DISCARDED);
+			continue;
+		}
+
+		do {
+			int nid = folio_nid(src);
+			gfp_t gfp = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+				    GFP_NOWAIT | __GFP_THISNODE;
+
+			if (dst)
+				__free_page(dst);
+
+			dst = alloc_pages_node(nid, gfp, 0);
+			if (!dst)
+				return -ENOMEM;
+		} while (!page_ref_freeze(dst, 1));
+
+		copy_highpage(dst, folio_page(src, i));
+		src->_dst_ul[i] |= (unsigned long)dst;
+
+		cond_resched();
+	}
+
+	return 0;
+}
+
+static void free_dst_pages(struct folio *src)
+{
+	int i;
+	int nr_pages = folio_nr_pages(src);
+
+	if (folio_can_split(src))
+		return;
+
+	if (!src->_dst_pp)
+		return;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *dst = folio_dst_page(src, i);
+
+		if (!dst)
+			continue;
+
+		page_ref_unfreeze(dst, 1);
+		__free_page(dst);
+	}
+
+	kfree(src->_dst_pp);
+	src->_dst_pp = NULL;
+}
+
+static void reset_src_folio(struct folio *src)
+{
+	if (folio_can_split(src))
+		return;
+
+	if (WARN_ON_ONCE(!src->_dst_pp))
+		return;
+
+	if (!folio_mapping_flags(src))
+		src->mapping = NULL;
+
+	if (folio_test_anon(src) && folio_test_swapcache(src)) {
+		folio_clear_swapcache(src);
+		src->swap.val = 0;
+	}
+
+	kfree(src->_dst_pp);
+	src->_dst_pp = NULL;
+}
+
+static void copy_page_owner(struct folio *src)
+{
+	int i;
+	int nr_pages = folio_nr_pages(src);
+
+	if (folio_can_split(src))
+		return;
+
+	if (WARN_ON_ONCE(!src->_dst_pp))
+		return;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *dst = folio_dst_page(src, i);
+
+		if (dst)
+			folio_copy_owner(src, page_folio(dst));
+	}
+}
+
+static bool lru_add_dst(struct lruvec *lruvec, struct folio *src, struct folio *dst)
+{
+	if (folio_can_split(src))
+		return false;
+
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_lru(src), src);
+	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(dst), dst);
+	VM_WARN_ON_ONCE_FOLIO(folio_lruvec(dst) != folio_lruvec(src), dst);
+
+	if (!lru_gen_add_dst(lruvec, dst)) {
+		enum lru_list lru = folio_lru_list(dst);
+		int zone = folio_zonenum(dst);
+		int delta = folio_nr_pages(dst);
+
+		if (folio_test_unevictable(dst))
+			dst->mlock_count = 0;
+		else
+			list_add_tail(&dst->lru, &src->lru);
+		update_lru_size(lruvec, lru, zone, delta);
+	}
+
+	folio_set_lru(dst);
+
+	return true;
+}
+
 static void lru_add_page_tail(struct page *head, struct page *tail,
 		struct lruvec *lruvec, struct list_head *list)
 {
@@ -2745,7 +2921,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
 		VM_WARN_ON(PageLRU(head));
 		get_page(tail);
 		list_add_tail(&tail->lru, list);
-	} else {
+	} else if (!lru_add_dst(lruvec, page_folio(head), page_folio(tail))) {
 		/* head is still on lru (and we have it frozen) */
 		VM_WARN_ON(!PageLRU(head));
 		if (PageUnevictable(tail))
@@ -2760,7 +2936,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
 		struct lruvec *lruvec, struct list_head *list)
 {
 	struct page *head = &folio->page;
-	struct page *page_tail = head + tail;
+	struct page *page_tail = folio_dst_page(folio, tail);
 	/*
 	 * Careful: new_folio is not a "real" folio before we cleared PageTail.
 	 * Don't pass it around before clear_compound_head().
@@ -2801,8 +2977,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
 			 LRU_GEN_MASK | LRU_REFS_MASK));
 
 	/* ->mapping in first and second tail page is replaced by other uses */
-	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-			page_tail);
+	VM_BUG_ON_PAGE(folio_can_split(folio) && tail > 2 &&
+		       page_tail->mapping != TAIL_MAPPING, page_tail);
 	page_tail->mapping = head->mapping;
 	page_tail->index = head->index + tail;
 
@@ -2857,9 +3033,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	unsigned long offset = 0;
 	unsigned int nr = thp_nr_pages(head);
 	int i, nr_dropped = 0;
+	bool can_split = folio_can_split(folio);
 
 	/* complete memcg works before add pages to LRU */
-	split_page_memcg(head, nr);
+	if (can_split)
+		split_page_memcg(head, nr);
+	else
+		folio_copy_memcg(folio);
 
 	if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
 		offset = swp_offset(folio->swap);
@@ -2872,46 +3052,53 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
 	ClearPageHasHWPoisoned(head);
 
-	for (i = nr - 1; i >= 1; i--) {
+	for (i = nr - 1; i >= can_split; i--) {
+		struct page *dst = folio_dst_page(folio, i);
+
+		if (!dst)
+			continue;
+
 		__split_huge_page_tail(folio, i, lruvec, list);
 		/* Some pages can be beyond EOF: drop them from page cache */
-		if (head[i].index >= end) {
-			struct folio *tail = page_folio(head + i);
+		if (dst->index >= end) {
+			struct folio *tail = page_folio(dst);
 
-			if (shmem_mapping(head->mapping))
+			if (shmem_mapping(tail->mapping))
 				nr_dropped++;
 			else if (folio_test_clear_dirty(tail))
 				folio_account_cleaned(tail,
-					inode_to_wb(folio->mapping->host));
+					inode_to_wb(tail->mapping->host));
 			__filemap_remove_folio(tail, NULL);
 			folio_put(tail);
-		} else if (!PageAnon(page)) {
-			__xa_store(&head->mapping->i_pages, head[i].index,
-					head + i, 0);
+		} else if (!PageAnon(dst)) {
+			__xa_store(&dst->mapping->i_pages, dst->index, dst, 0);
 		} else if (swap_cache) {
-			__xa_store(&swap_cache->i_pages, offset + i,
-					head + i, 0);
+			__xa_store(&swap_cache->i_pages, offset + i, dst, 0);
 		}
 	}
 
-	ClearPageCompound(head);
+	if (can_split)
+		ClearPageCompound(head);
 	unlock_page_lruvec(lruvec);
 	/* Caller disabled irqs, so they are still disabled here */
 
-	split_page_owner(head, nr);
+	if (can_split)
+		split_page_owner(head, nr);
+	else
+		copy_page_owner(folio);
 
 	/* See comment in __split_huge_page_tail() */
 	if (PageAnon(head)) {
 		/* Additional pin to swap cache */
 		if (PageSwapCache(head)) {
-			page_ref_add(head, 2);
+			page_ref_add(head, 2 - !can_split);
 			xa_unlock(&swap_cache->i_pages);
 		} else {
 			page_ref_inc(head);
 		}
 	} else {
 		/* Additional pin to page cache */
-		page_ref_add(head, 2);
+		page_ref_add(head, 2 - !can_split);
 		xa_unlock(&head->mapping->i_pages);
 	}
 	local_irq_enable();
@@ -2924,8 +3111,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 		split_swap_cluster(folio->swap);
 
 	for (i = 0; i < nr; i++) {
-		struct page *subpage = head + i;
-		if (subpage == page)
+		struct page *subpage = folio_dst_page(folio, i);
+
+		if (!subpage || subpage == page)
 			continue;
 		unlock_page(subpage);
 
@@ -2945,9 +3133,6 @@ static bool can_split_folio(struct folio *folio, int *pextra_pins)
 {
 	int extra_pins;
 
-	if (!folio_can_split(folio))
-		return false;
-
 	/* Additional pins from page cache */
 	if (folio_test_anon(folio))
 		extra_pins = folio_test_swapcache(folio) ?
@@ -3067,8 +3252,21 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		goto out_unlock;
 	}
 
+	ret = prep_to_unmap(folio);
+	if (ret)
+		goto out_unlock;
+
 	unmap_folio(folio);
 
+	if (!folio_ref_freeze(folio, 1 + extra_pins)) {
+		ret = -EAGAIN;
+		goto fail;
+	}
+
+	ret = prep_dst_pages(folio);
+	if (ret)
+		goto fail;
+
 	/* block interrupt reentry in xa_lock and spinlock */
 	local_irq_disable();
 	if (mapping) {
@@ -3078,44 +3276,41 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		 */
 		xas_lock(&xas);
 		xas_reset(&xas);
-		if (xas_load(&xas) != folio)
+		if (xas_load(&xas) != folio) {
+			xas_unlock(&xas);
+			local_irq_enable();
+			ret = -EAGAIN;
 			goto fail;
+		}
 	}
 
 	/* Prevent deferred_split_scan() touching ->_refcount */
 	spin_lock(&ds_queue->split_queue_lock);
-	if (folio_ref_freeze(folio, 1 + extra_pins)) {
-		if (!list_empty(&folio->_deferred_list)) {
-			ds_queue->split_queue_len--;
-			list_del(&folio->_deferred_list);
-		}
-		spin_unlock(&ds_queue->split_queue_lock);
-		if (mapping) {
-			int nr = folio_nr_pages(folio);
+	if (!list_empty(&folio->_deferred_list)) {
+		ds_queue->split_queue_len--;
+		list_del_init(&folio->_deferred_list);
+	}
+	spin_unlock(&ds_queue->split_queue_lock);
+	if (mapping) {
+		int nr = folio_nr_pages(folio);
 
-			xas_split(&xas, folio, folio_order(folio));
-			if (folio_test_pmd_mappable(folio)) {
-				if (folio_test_swapbacked(folio)) {
-					__lruvec_stat_mod_folio(folio,
-							NR_SHMEM_THPS, -nr);
-				} else {
-					__lruvec_stat_mod_folio(folio,
-							NR_FILE_THPS, -nr);
-					filemap_nr_thps_dec(mapping);
-				}
+		xas_split(&xas, folio, folio_order(folio));
+		if (folio_test_pmd_mappable(folio)) {
+			if (folio_test_swapbacked(folio)) {
+				__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
+			} else {
+				__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
+				filemap_nr_thps_dec(mapping);
 			}
 		}
+	}
 
-		__split_huge_page(page, list, end);
-		ret = 0;
-	} else {
-		spin_unlock(&ds_queue->split_queue_lock);
+	__split_huge_page(page, list, end);
+	reset_src_folio(folio);
 fail:
-		if (mapping)
-			xas_unlock(&xas);
-		local_irq_enable();
+	if (ret) {
+		free_dst_pages(folio);
 		remap_page(folio, folio_nr_pages(folio));
-		ret = -EAGAIN;
 	}
 
 out_unlock:
@@ -3127,6 +3322,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		i_mmap_unlock_read(mapping);
 out:
 	xas_destroy(&xas);
+
+	if (!folio_can_split(folio)) {
+		count_vm_event(!ret ? THP_SHATTER_PAGE : THP_SHATTER_PAGE_FAILED);
+		return ret ? : 1;
+	}
+
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
 	return ret;
 }
diff --git a/mm/internal.h b/mm/internal.h
index f309a010d50f..ac1d27468899 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1266,4 +1266,42 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
 }
 #endif /* CONFIG_SHRINKER_DEBUG */
 
+#define SRC_PAGE_MAPPED		BIT(0)
+#define SRC_PAGE_MLOCKED	BIT(1)
+#define SRC_PAGE_CLEAN		BIT(2)
+#define SRC_PAGE_USAGE_MASK	(BIT(3) - 1)
+
+static inline unsigned long src_page_usage(struct page *page)
+{
+	struct folio *src = page_folio(page);
+	int i = folio_page_idx(src, page);
+
+	if (folio_can_split(src) || !src->_dst_ul)
+		return 0;
+
+	return src->_dst_ul[i] & SRC_PAGE_USAGE_MASK;
+}
+
+static inline bool can_discard_src(struct page *page)
+{
+	return src_page_usage(page) & SRC_PAGE_CLEAN;
+}
+
+static inline void set_src_usage(struct page *page, unsigned long usage)
+{
+	struct folio *src = page_folio(page);
+	int i = folio_page_idx(src, page);
+
+	if (!folio_can_split(src) && src->_dst_ul)
+		src->_dst_ul[i] |= usage;
+}
+
+static inline struct page *folio_dst_page(struct folio *src, int i)
+{
+	if (folio_can_split(src) || !src->_dst_ul)
+		return folio_page(src, i);
+
+	return (void *)(src->_dst_ul[i] & ~SRC_PAGE_USAGE_MASK);
+}
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index cfa5e7288261..0f82e132cd52 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -381,7 +381,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			err = split_folio(folio);
 			folio_unlock(folio);
 			folio_put(folio);
-			if (!err)
+			if (err >= 0)
 				goto regular_folio;
 			return 0;
 		}
@@ -466,8 +466,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			err = split_folio(folio);
 			folio_unlock(folio);
 			folio_put(folio);
-			if (err)
+			if (err < 0)
 				break;
+			if (err)
+				goto restart;
 			start_pte = pte =
 				pte_offset_map_lock(mm, pmd, addr, &ptl);
 			if (!start_pte)
@@ -635,6 +637,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			return 0;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
+restart:
 	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	if (!start_pte)
 		return 0;
@@ -688,8 +691,10 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			err = split_folio(folio);
 			folio_unlock(folio);
 			folio_put(folio);
-			if (err)
+			if (err < 0)
 				break;
+			if (err)
+				goto restart;
 			start_pte = pte =
 				pte_offset_map_lock(mm, pmd, addr, &ptl);
 			if (!start_pte)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 61932c9215e7..71b4d1e610db 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3624,6 +3624,53 @@ void split_page_memcg(struct page *head, unsigned int nr)
 		css_get_many(&memcg->css, nr - 1);
 }
 
+void folio_copy_memcg(struct folio *src)
+{
+	int i;
+	unsigned long flags;
+	int delta = 0;
+	int nr_pages = folio_nr_pages(src);
+	struct mem_cgroup *memcg = folio_memcg(src);
+
+	if (folio_can_split(src))
+		return;
+
+	if (WARN_ON_ONCE(!src->_dst_pp))
+		return;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	if (WARN_ON_ONCE(!memcg))
+		return;
+
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_large(src), src);
+	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(src), src);
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *dst = folio_dst_page(src, i);
+
+		if (!dst)
+			continue;
+
+		commit_charge(page_folio(dst), memcg);
+		delta++;
+	}
+
+	if (!mem_cgroup_is_root(memcg)) {
+		page_counter_charge(&memcg->memory, delta);
+		if (do_memsw_account())
+			page_counter_charge(&memcg->memsw, delta);
+	}
+
+	css_get_many(&memcg->css, delta);
+
+	local_irq_save(flags);
+	mem_cgroup_charge_statistics(memcg, delta);
+	memcg_check_events(memcg, folio_nid(src));
+	local_irq_restore(flags);
+}
+
 #ifdef CONFIG_SWAP
 /**
  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9349948f1abf..b9d2f821ba63 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2289,7 +2289,7 @@ int memory_failure(unsigned long pfn, int flags)
 		 * page is a valid handlable page.
 		 */
 		SetPageHasHWPoisoned(hpage);
-		if (try_to_split_thp_page(p) < 0) {
+		if (try_to_split_thp_page(p)) {
 			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
 			goto unlock_mutex;
 		}
diff --git a/mm/migrate.c b/mm/migrate.c
index f615c0c22046..610be0029efd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -180,36 +180,52 @@ void putback_movable_pages(struct list_head *l)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static bool remove_migration_pte(struct folio *folio,
-		struct vm_area_struct *vma, unsigned long addr, void *old)
+static bool remove_migration_pte(struct folio *dst,
+		struct vm_area_struct *vma, unsigned long addr, void *arg)
 {
-	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+	struct folio *src = arg;
+	DEFINE_FOLIO_VMA_WALK(pvmw, src, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
 
 	while (page_vma_mapped_walk(&pvmw)) {
 		rmap_t rmap_flags = RMAP_NONE;
 		pte_t old_pte;
 		pte_t pte;
 		swp_entry_t entry;
-		struct page *new;
+		struct page *page;
+		struct folio *folio;
 		unsigned long idx = 0;
 
 		/* pgoff is invalid for ksm pages, but they are never large */
-		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+		if (folio_test_large(dst) && !folio_test_hugetlb(dst))
 			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
-		new = folio_page(folio, idx);
+		page = folio_page(dst, idx);
+
+		if (src == dst) {
+			if (can_discard_src(page)) {
+				VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(src), src);
+
+				pte_clear_not_present_full(pvmw.vma->vm_mm, pvmw.address,
+							   pvmw.pte, false);
+				dec_mm_counter(pvmw.vma->vm_mm, MM_ANONPAGES);
+				continue;
+			}
+			page = folio_dst_page(src, idx);
+		}
+
+		folio = page_folio(page);
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 		/* PMD-mapped THP migration entry */
 		if (!pvmw.pte) {
 			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
 					!folio_test_pmd_mappable(folio), folio);
-			remove_migration_pmd(&pvmw, new);
+			remove_migration_pmd(&pvmw, page);
 			continue;
 		}
 #endif
 
 		folio_get(folio);
-		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
+		pte = mk_pte(page, READ_ONCE(vma->vm_page_prot));
 		old_pte = ptep_get(pvmw.pte);
 		if (pte_swp_soft_dirty(old_pte))
 			pte = pte_mksoft_dirty(pte);
@@ -227,13 +243,13 @@ static bool remove_migration_pte(struct folio *folio,
 		if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
 			rmap_flags |= RMAP_EXCLUSIVE;
 
-		if (unlikely(is_device_private_page(new))) {
+		if (unlikely(is_device_private_page(page))) {
 			if (pte_write(pte))
 				entry = make_writable_device_private_entry(
-							page_to_pfn(new));
+							page_to_pfn(page));
 			else
 				entry = make_readable_device_private_entry(
-							page_to_pfn(new));
+							page_to_pfn(page));
 			pte = swp_entry_to_pte(entry);
 			if (pte_swp_soft_dirty(old_pte))
 				pte = pte_swp_mksoft_dirty(pte);
@@ -259,17 +275,17 @@ static bool remove_migration_pte(struct folio *folio,
 #endif
 		{
 			if (folio_test_anon(folio))
-				folio_add_anon_rmap_pte(folio, new, vma,
+				folio_add_anon_rmap_pte(folio, page, vma,
 							pvmw.address, rmap_flags);
 			else
-				folio_add_file_rmap_pte(folio, new, vma);
+				folio_add_file_rmap_pte(folio, page, vma);
 			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
 		}
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
 
 		trace_remove_migration_pte(pvmw.address, pte_val(pte),
-					   compound_order(new));
+					   compound_order(page));
 
 		/* No need to invalidate - it was non-present before */
 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a4da8f8691c..dd843fb04f78 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1012,6 +1012,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			bad_page(page, "nonzero pincount");
 			goto out;
 		}
+		if (unlikely(folio->_private_1)) {
+			bad_page(page, "nonzero _private_1");
+			goto out;
+		}
 		break;
 	case 2:
 		/*
diff --git a/mm/rmap.c b/mm/rmap.c
index f5d43edad529..0ddb28c52961 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2260,6 +2260,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 						hsz);
 			else
 				set_pte_at(mm, address, pvmw.pte, swp_pte);
+			if (vma->vm_flags & VM_LOCKED)
+				set_src_usage(subpage, SRC_PAGE_MLOCKED);
+			else
+				set_src_usage(subpage, SRC_PAGE_MAPPED);
 			trace_set_migration_pte(address, pte_val(swp_pte),
 						compound_order(&folio->page));
 			/*
diff --git a/mm/shmem.c b/mm/shmem.c
index d7c84ff62186..8fa8056d3724 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -696,7 +696,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 		folio_put(folio);
 
 		/* If split failed move the inode on the list back to shrinklist */
-		if (ret)
+		if (ret < 0)
 			goto move_back;
 
 		split++;
@@ -1450,7 +1450,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	if (folio_test_large(folio)) {
 		/* Ensure the subpages are still dirty */
 		folio_test_set_dirty(folio);
-		if (split_huge_page(page) < 0)
+		if (split_huge_page(page))
 			goto redirty;
 		folio = page_folio(page);
 		folio_clear_dirty(folio);
diff --git a/mm/truncate.c b/mm/truncate.c
index 725b150e47ac..df0680cfe6a2 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -209,6 +209,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
  */
 bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
 {
+	int err;
 	loff_t pos = folio_pos(folio);
 	unsigned int offset, length;
 
@@ -239,8 +240,11 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
 		folio_invalidate(folio, offset, length);
 	if (!folio_test_large(folio))
 		return true;
-	if (split_folio(folio) == 0)
+	err = split_folio(folio);
+	if (!err)
 		return true;
+	if (err > 0)
+		return false;
 	if (folio_test_dirty(folio))
 		return false;
 	truncate_inode_folio(folio->mapping, folio);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7cf7d4384259..cf490b101cac 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1094,7 +1094,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
 			pte_unmap(&orig_dst_pte);
 			src_pte = dst_pte = NULL;
 			err = split_folio(src_folio);
-			if (err)
+			if (err < 0)
 				goto out;
 			/* have to reacquire the folio after it got split */
 			folio_unlock(src_folio);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ae061ec4866a..d6c31421a3b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1223,6 +1223,15 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 				goto keep_locked;
 		}
 
+		if (folio_ref_count(folio) == 1) {
+			folio_unlock(folio);
+			if (folio_put_testzero(folio))
+				goto free_it;
+
+			nr_reclaimed += nr_pages;
+			continue;
+		}
+
 		/*
 		 * If the folio was split above, the tail pages will make
 		 * their own pass through this function and be accounted
diff --git a/mm/vmstat.c b/mm/vmstat.c
index adbd032e6a0f..ff2114452334 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1364,6 +1364,9 @@ const char * const vmstat_text[] = {
 	"thp_split_page_failed",
 	"thp_deferred_split_page",
 	"thp_split_pmd",
+	"thp_shatter_page",
+	"thp_shatter_page_failed",
+	"thp_shatter_page_discarded",
 	"thp_scan_exceed_none_pte",
 	"thp_scan_exceed_swap_pte",
 	"thp_scan_exceed_share_pte",
-- 
2.44.0.rc1.240.g4c46232300-goog




