trace_dir="/sys/kernel/tracing"
echo 1 > $trace_dir/options/stacktrace
echo 1 > $trace_dir/events/page_pin_owner/enable
echo "contig_range" > $trace_dir/events/page_pin_owner/report_page_pinners/filter
===
example:
If you are interested in compaction failures, use "compaction"
as the filter instead of "contig_range".
For the other filter strings, refer to migrate_reason_names
in mm/debug.c.
===
..
run workload
..
cat $trace_dir/trace
..
bash-7442 [007] ...1. 288.504690: report_page_pinners: pfn=0x91430 flags=uptodate|swapbacked count=2 mapcount=0 mapping=ffff88f5480a2709 mt=5 reason=contig_range err=-11
bash-7442 [007] ...1. 288.504691: <stack trace>
=> trace_event_raw_event_report_page_pinners
=> __report_page_pinners
=> migrate_pages
=> alloc_contig_range
=> cma_alloc.cold
=> cma_alloc_write
=> simple_attr_write
=> debugfs_attr_write
=> full_proxy_write
=> vfs_write
=> ksys_write
=> do_syscall_64
=> entry_SYSCALL_64_after_hwframe
..
..
bash-7442 [007] ...1. 288.506426: page_pin_owner_put: pfn=0x91430 flags=uptodate|swapbacked count=2 mapcount=0 mapping=ffff88f5480a2709 mt=5
bash-7442 [007] ...1. 288.506427: <stack trace>
=> trace_event_raw_event_page_pin_owner_put
=> __page_pin_owner_put
=> put_page
=> putback_movable_pages
=> alloc_contig_range
=> cma_alloc.cold
=> cma_alloc_write
=> simple_attr_write
=> debugfs_attr_write
=> full_proxy_write
=> vfs_write
=> ksys_write
=> do_syscall_64
=> entry_SYSCALL_64_after_hwframe
..
cc1-31104 [001] d..2. 288.507632: page_pin_owner_put: pfn=0x91430 flags=uptodate|swapbacked count=1 mapcount=0 mapping=ffff88f5480a2709 mt=4
cc1-31104 [001] d..2. 288.507636: <stack trace>
=> trace_event_raw_event_page_pin_owner_put
=> __page_pin_owner_put
=> release_pages
=> tlb_flush_mmu
=> unmap_page_range
=> unmap_vmas
=> exit_mmap
=> mmput
=> do_exit
=> do_group_exit
=> __x64_sys_exit_group
=> do_syscall_64
=> entry_SYSCALL_64_after_hwframe
..
make-31157 [007] d..3. 288.508333: page_pin_owner_put: pfn=0x91430 flags=uptodate|lru|swapbacked count=0 mapcount=0 mapping=ffff88f5480a2709 mt=5
make-31157 [007] d..3. 288.508335: <stack trace>
=> trace_event_raw_event_page_pin_owner_put
=> __page_pin_owner_put
=> release_pages
=> __pagevec_lru_add
=> lru_add_drain_cpu
=> lru_add_drain
=> exit_mmap
=> mmput
=> begin_new_exec
=> load_elf_binary
=> bprm_execve
=> do_execveat_common.isra.0
=> __x64_sys_execve
=> do_syscall_64
=> entry_SYSCALL_64_after_hwframe
Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
* from v1 - https://lore.kernel.org/lkml/20211206184730.858850-1-minchan@xxxxxxxxxx/
* PagePinOwner naming suggestion - jhubbard@
* Description/Kconfig suggestion - jhubbard@
* General migrate_page supports - huww98@
include/linux/mm.h | 7 ++-
include/linux/page_ext.h | 3 +
include/linux/page_pin_owner.h | 48 ++++++++++++++
include/trace/events/page_pin_owner.h | 81 ++++++++++++++++++++++++
mm/Kconfig.debug | 15 +++++
mm/Makefile | 1 +
mm/migrate.c | 5 +-
mm/page_alloc.c | 3 +
mm/page_ext.c | 4 ++
mm/page_pin_owner.c | 91 +++++++++++++++++++++++++++
10 files changed, 256 insertions(+), 2 deletions(-)
create mode 100644 include/linux/page_pin_owner.h
create mode 100644 include/trace/events/page_pin_owner.h
create mode 100644 mm/page_pin_owner.c
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 82be88c1fdbb..5c437faf129c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -26,6 +26,7 @@
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
+#include <linux/page_pin_owner.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
@@ -714,8 +715,12 @@ static inline unsigned int folio_order(struct folio *folio)
*/
static inline int put_page_testzero(struct page *page)
{
+ int ret;
+
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
- return page_ref_dec_and_test(page);
+ ret = page_ref_dec_and_test(page);
+ page_pin_owner_put(page); /* NOTE(review): runs after the reference is dropped; confirm @page cannot be freed or reused underneath this call when ret == 0 */
+ return ret;
}
static inline int folio_put_testzero(struct folio *folio)
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index fabb2e1e087f..3236efd5ab07 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -23,6 +23,9 @@ enum page_ext_flags {
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
#endif
+#if defined(CONFIG_PAGE_PIN_OWNER)
+ PAGE_EXT_PIN_OWNER,
+#endif
};
/*
diff --git a/include/linux/page_pin_owner.h b/include/linux/page_pin_owner.h
new file mode 100644
index 000000000000..e68adcdd6799
--- /dev/null
+++ b/include/linux/page_pin_owner.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_PAGE_PIN_OWNER_H
+#define __LINUX_PAGE_PIN_OWNER_H
+
+#include <linux/jump_label.h>
+
+#ifdef CONFIG_PAGE_PIN_OWNER
+extern struct static_key_false page_pin_owner_inited;
+extern struct page_ext_operations page_pin_owner_ops;
+
+void __report_page_pinners(struct page *page, int reason, int err);
+void __page_pin_owner_put(struct page *page);
+void __reset_page_pin_owner(struct page *page, unsigned int order);
+/*
+ * Hand @page and @order to __reset_page_pin_owner(), but only when the
+ * page_pin_owner_inited static key has been enabled.
+ */
+static inline void reset_page_pin_owner(struct page *page, unsigned int order)
+{
+ if (static_branch_unlikely(&page_pin_owner_inited))
+ __reset_page_pin_owner(page, order);
+}
+
+/*
+ * Forward @page, @reason and @err to __report_page_pinners() once the
+ * page_pin_owner_inited static key is enabled; otherwise do nothing.
+ */
+static inline void report_page_pinners(struct page *page, int reason, int err)
+{
+	if (static_branch_unlikely(&page_pin_owner_inited))
+		__report_page_pinners(page, reason, err);
+}
+
+/*
+ * Forward @page to __page_pin_owner_put() once the page_pin_owner_inited
+ * static key is enabled; otherwise do nothing.
+ */
+static inline void page_pin_owner_put(struct page *page)
+{
+	if (static_branch_unlikely(&page_pin_owner_inited))
+		__page_pin_owner_put(page);
+}
+
+#else /* !CONFIG_PAGE_PIN_OWNER: empty stubs so callers need no #ifdef */
+static inline void reset_page_pin_owner(struct page *page, unsigned int order)
+{
+}
+static inline void report_page_pinners(struct page *page, int reason, int err)
+{
+}
+static inline void page_pin_owner_put(struct page *page)
+{
+}
+#endif /* CONFIG_PAGE_PIN_OWNER */
+#endif /*__LINUX_PAGE_PIN_OWNER_H */
diff --git a/include/trace/events/page_pin_owner.h b/include/trace/events/page_pin_owner.h
new file mode 100644
index 000000000000..0096297312f5
--- /dev/null
+++ b/include/trace/events/page_pin_owner.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM page_pin_owner
+
+#if !defined(_TRACE_PAGE_PIN_OWNER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_PIN_OWNER_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
+/*
+ * page_pin_owner_put - snapshot of a page's state.
+ *
+ * Records the pfn, page flags, reference count, map count, mapping pointer
+ * and pageblock migratetype of @page at the time the event fires.  When
+ * printed, only the low NR_PAGEFLAGS bits of the flags word are decoded.
+ */
+TRACE_EVENT(page_pin_owner_put,
+
+ TP_PROTO(struct page *page),
+
+ TP_ARGS(page),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(unsigned long, flags)
+ __field(int, count)
+ __field(int, mapcount)
+ __field(void *, mapping)
+ __field(int, mt)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = page_to_pfn(page);
+ __entry->flags = page->flags;
+ __entry->count = page_ref_count(page);
+ __entry->mapcount = page_mapcount(page);
+ __entry->mapping = page->mapping;
+ __entry->mt = get_pageblock_migratetype(page);
+ ),
+
+ TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d",
+ __entry->pfn,
+ show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)),
+ __entry->count,
+ __entry->mapcount, __entry->mapping, __entry->mt)
+);
+/*
+ * report_page_pinners - snapshot of a page's state plus a reason and error.
+ *
+ * Records the same page fields as page_pin_owner_put, plus @reason and @err.
+ * @reason is stored as a bare pointer (the string is not copied into the
+ * event) and is dereferenced by the "%s" in TP_printk, so it must still
+ * point to a valid string when the trace is read.
+ */
+TRACE_EVENT(report_page_pinners,
+
+ TP_PROTO(struct page *page, const char *reason, int err),
+
+ TP_ARGS(page, reason, err),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(unsigned long, flags)
+ __field(int, count)
+ __field(int, mapcount)
+ __field(void *, mapping)
+ __field(int, mt)
+ __field(const char *, reason)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = page_to_pfn(page);
+ __entry->flags = page->flags;
+ __entry->count = page_ref_count(page);
+ __entry->mapcount = page_mapcount(page);
+ __entry->mapping = page->mapping;
+ __entry->mt = get_pageblock_migratetype(page);
+ __entry->reason = reason;
+ __entry->err = err;
+ ),
+
+ TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d reason=%s err=%d",
+ __entry->pfn,
+ show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)),
+ __entry->count, __entry->mapcount, __entry->mapping,
+ __entry->mt, __entry->reason, __entry->err)
+);
+
+#endif /* _TRACE_PAGE_PIN_OWNER_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5bd5bb097252..ed2d5e6450b7 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -86,6 +86,21 @@ config PAGE_TABLE_CHECK_ENFORCED
If unsure say "n".
+config PAGE_PIN_OWNER
+ bool "Track page pinner"
+ depends on DEBUG_KERNEL && TRACEPOINTS && STACKTRACE_SUPPORT
+ select PAGE_EXTENSION
+ select STACKTRACE
+ help
+ This keeps track of which call chain is the pinner of a page, which
+ may help to find the cause of contiguous page allocation failures
+ (memory-hotplug, compaction, CMA and so on). Even if you include
+ this feature in your build, it is disabled by default. You should
+ pass "page_pin_owner=on" as a boot parameter in order to enable it.
+ Eats a fair amount of memory if enabled.
+
+ If unsure, say N.
+
config PAGE_POISONING
bool "Poison pages after freeing"
help
diff --git a/mm/Makefile b/mm/Makefile
index 588d3113f3b0..ca89f4fa38ce 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
+obj-$(CONFIG_PAGE_PIN_OWNER) += page_pin_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
diff --git a/mm/migrate.c b/mm/migrate.c
index c7da064b4781..aa8a2c21da8e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1105,6 +1105,8 @@ static int unmap_and_move(new_page_t get_new_page,
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
+ else
+ report_page_pinners(page, reason, rc);
out:
if (rc != -EAGAIN) {
@@ -1273,7 +1275,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (rc == MIGRATEPAGE_SUCCESS) {
move_hugetlb_state(hpage, new_hpage, reason);
put_new_page = NULL;
- }
+ } else
+ report_page_pinners(hpage, reason, rc);
out_unlock:
unlock_page(hpage);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b5d62e1c8d81..98125b1b6e7e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -64,6 +64,7 @@
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
+#include <linux/page_pin_owner.h>
#include <linux/page_table_check.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
@@ -1310,6 +1311,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
+ reset_page_pin_owner(page, order);
page_table_check_free(page, order);
return false;
}
@@ -1350,6 +1352,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
+ reset_page_pin_owner(page, order);
page_table_check_free(page, order);
if (!PageHighMem(page)) {
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2e66d934d63f..3c0df97b9b8b 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -7,6 +7,7 @@
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
+#include <linux/page_pin_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
@@ -79,6 +80,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_TABLE_CHECK
&page_table_check_ops,
#endif
+#ifdef CONFIG_PAGE_PIN_OWNER
+ &page_pin_owner_ops,
+#endif
};
unsigned long page_ext_size = sizeof(struct page_ext);
diff --git a/mm/page_pin_owner.c b/mm/page_pin_owner.c
new file mode 100644
index 000000000000..736617df093c
--- /dev/null
+++ b/mm/page_pin_owner.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/page_pin_owner.h>
+#include <linux/migrate.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_pin_owner.h>
+/*
+ * page_pin_owner_enabled holds the value parsed from the "page_pin_owner"
+ * early parameter.  page_pin_owner_inited is the static key, initially
+ * false, that init_page_pin_owner() enables; it is exported.
+ */
+static bool page_pin_owner_enabled;
+DEFINE_STATIC_KEY_FALSE(page_pin_owner_inited);
+EXPORT_SYMBOL(page_pin_owner_inited);
+/*
+ * Handler for the "page_pin_owner" early parameter: parse @buf as a boolean
+ * into page_pin_owner_enabled and return kstrtobool()'s result.
+ */
+static int __init early_page_pin_owner_param(char *buf)
+{
+ return kstrtobool(buf, &page_pin_owner_enabled);
+}
+early_param("page_pin_owner", early_page_pin_owner_param);
+/* ->need hook of page_pin_owner_ops: true when page_pin_owner_enabled is set. */
+static bool need_page_pin_owner(void)
+{
+ return page_pin_owner_enabled;
+}
+
+/*
+ * ->init hook of page_pin_owner_ops: enable the page_pin_owner_inited
+ * static key when page_pin_owner_enabled is set.
+ */
+static void init_page_pin_owner(void)
+{
+	if (page_pin_owner_enabled)
+		static_branch_enable(&page_pin_owner_inited);
+}
+/* page_ext operations for page_pin_owner; only ->need and ->init are set. */
+struct page_ext_operations page_pin_owner_ops = {
+ .need = need_page_pin_owner,
+ .init = init_page_pin_owner,
+};
+
+/*
+ * Clear PAGE_EXT_PIN_OWNER on each of the (1 << @order) pages starting at
+ * @page.  Nothing is done when @page has no page_ext.
+ */
+void __reset_page_pin_owner(struct page *page, unsigned int order)
+{
+	struct page_ext *page_ext;
+	unsigned int i;
+
+	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext))
+		return;
+
+	/*
+	 * Walk the whole block instead of stopping at the first page that
+	 * is not flagged: a flagged page may sit behind an unflagged one,
+	 * and leaving its bit set would make a later user of that page
+	 * show up in page_pin_owner_put events.
+	 */
+	for (i = 0; i < (1U << order); i++) {
+		/* Test first so unflagged pages do not take an atomic write. */
+		if (test_bit(PAGE_EXT_PIN_OWNER, &page_ext->flags))
+			clear_bit(PAGE_EXT_PIN_OWNER, &page_ext->flags);
+		page_ext = page_ext_next(page_ext);
+	}
+}
+
+/*
+ * Set PAGE_EXT_PIN_OWNER on @page and emit the report_page_pinners trace
+ * event.  @reason indexes migrate_reason_names[]; @err is passed through to
+ * the event unchanged.  Nothing is done when @page has no page_ext.
+ */
+void __report_page_pinners(struct page *page, int reason, int err)
+{
+	struct page_ext *ext = lookup_page_ext(page);
+	const char *why;
+
+	if (unlikely(!ext))
+		return;
+
+	test_and_set_bit(PAGE_EXT_PIN_OWNER, &ext->flags);
+	why = migrate_reason_names[reason];
+	trace_report_page_pinners(page, why, err);
+}
+
+/*
+ * Emit the page_pin_owner_put trace event for @page, but only when the page
+ * has a page_ext with PAGE_EXT_PIN_OWNER set.
+ */
+void __page_pin_owner_put(struct page *page)
+{
+	struct page_ext *ext;
+
+	ext = lookup_page_ext(page);
+	if (likely(ext) && test_bit(PAGE_EXT_PIN_OWNER, &ext->flags))
+		trace_page_pin_owner_put(page);
+}
+EXPORT_SYMBOL(__page_pin_owner_put);
+
+/*
+ * Late initcall: log whether the page_pin_owner_inited static key is
+ * enabled.  Always returns 0.
+ */
+static int __init page_pin_owner_init(void)
+{
+	if (static_branch_unlikely(&page_pin_owner_inited))
+		pr_info("page_pin_owner is enabled\n");
+	else
+		pr_info("page_pin_owner is disabled\n");
+
+	return 0;
+}
+late_initcall(page_pin_owner_init);