This patch introduces a mechanism to trigger memory reclaim through a per-node sysfs interface, inspired by compaction's equivalent; i.e.: echo 1G > /sys/devices/system/node/nodeX/reclaim It is based on the discussions from David's thread[1] as well as the current upstreaming of the memcg[2] interface (which has nice explanations for the benefits of userspace reclaim overall). In both cases the conclusion was that either way of inducing proactive reclaim should be kept simple (KISS) and can be extended later. So this patch does not allow the user much fine tuning beyond the size of the reclaim, such as anon/file selection or the semantics of demotion. [1] https://lore.kernel.org/all/5df21376-7dd1-bf81-8414-32a73cea45dd@xxxxxxxxxx/ [2] https://lore.kernel.org/all/20220408045743.1432968-1-yosryahmed@xxxxxxxxxx/ Signed-off-by: Davidlohr Bueso <dave@xxxxxxxxxxxx> --- Documentation/ABI/stable/sysfs-devices-node | 10 ++++ drivers/base/node.c | 2 + include/linux/swap.h | 16 ++++++ mm/vmscan.c | 59 +++++++++++++++++++++ 4 files changed, 87 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index 8db67aa472f1..3c935e1334f7 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -182,3 +182,13 @@ Date: November 2021 Contact: Jarkko Sakkinen <jarkko@xxxxxxxxxx> Description: The total amount of SGX physical memory in bytes. + +What: /sys/devices/system/node/nodeX/reclaim +Date: April 2022 +Contact: Davidlohr Bueso <dave@xxxxxxxxxxxx> +Description: + Write a number of bytes to induce memory reclaim in this node. + This file accepts a single key, the number of bytes to reclaim. + On success, at least the specified amount of memory has been + reclaimed; the write fails with -EAGAIN if fewer bytes are + reclaimed than the specified amount.
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6cdf25fd26c3..d80c478e2a6e 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -670,6 +670,7 @@ static int register_node(struct node *node, int num)
 	hugetlb_register_node(node);
 
 	compaction_register_node(node);
+	reclaim_register_node(node);
 	return 0;
 }
 
@@ -685,6 +686,7 @@ void unregister_node(struct node *node)
 	hugetlb_unregister_node(node);		/* no-op, if memoryless node */
 	node_remove_accesses(node);
 	node_remove_caches(node);
+	reclaim_unregister_node(node);
 	device_unregister(&node->dev);
 }
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 27093b477c5f..cca43ae6d770 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -398,6 +398,22 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 long remove_mapping(struct address_space *mapping, struct folio *folio);
 
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+extern int reclaim_register_node(struct node *node);
+extern void reclaim_unregister_node(struct node *node);
+
+#else
+
+static inline int reclaim_register_node(struct node *node)
+{
+	return 0;
+}
+
+static inline void reclaim_unregister_node(struct node *node)
+{
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+
 extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1735c302831c..3539f8a0f0ea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4819,3 +4819,90 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+/*
+ * Write handler for the per-node "reclaim" sysfs attribute.
+ *
+ * Parses the requested size from @buf and repeatedly invokes
+ * __node_reclaim() on the node identified by @dev until at least that much
+ * has been reclaimed.  Each pass is bracketed by PGDAT_RECLAIM_LOCKED; if
+ * the bit is already set the write fails immediately with -EAGAIN.  A pass
+ * that reclaims nothing consumes one of MAX_RECLAIM_RETRIES retries, and
+ * the budget is not replenished by later progress.
+ *
+ * Returns @count on success, the parser's error for malformed input, or
+ * -EAGAIN if less than the requested amount was reclaimed.
+ *
+ * NOTE(review): the loop never checks signal_pending(), so a large request
+ * cannot be interrupted while passes keep making progress -- confirm that
+ * this is intended.
+ */
+static ssize_t reclaim_store(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	int err, nid = dev->id;
+	gfp_t gfp_mask = GFP_KERNEL;
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+	struct scan_control sc = {
+		.gfp_mask = current_gfp_context(gfp_mask),
+		.reclaim_idx = gfp_zone(gfp_mask),
+		.priority = NODE_RECLAIM_PRIORITY,
+		.may_writepage = !laptop_mode,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	/*
+	 * Trim surrounding whitespace (e.g. the newline appended by echo).
+	 * NOTE(review): this casts away const and edits the buffer in place,
+	 * and page_counter_memparse() may only be built with
+	 * CONFIG_PAGE_COUNTER -- confirm both are acceptable here.
+	 */
+	buf = strstrip((char *)buf);
+	err = page_counter_memparse(buf, "", &nr_to_reclaim);
+	if (err)
+		return err;
+
+	/* Floor the scan control target at SWAP_CLUSTER_MAX. */
+	sc.nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX);
+
+	while (nr_reclaimed < nr_to_reclaim) {
+		unsigned long reclaimed;
+
+		if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+			return -EAGAIN;
+
+		/* does cond_resched() */
+		reclaimed = __node_reclaim(pgdat, gfp_mask,
+					   nr_to_reclaim - nr_reclaimed, &sc);
+
+		clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+
+		/* Give up once the retry budget is spent on empty passes. */
+		if (!reclaimed && !nr_retries--)
+			break;
+
+		nr_reclaimed += reclaimed;
+	}
+
+	return nr_reclaimed < nr_to_reclaim ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+
+/* Create the "reclaim" attribute file on @node's device. */
+int reclaim_register_node(struct node *node)
+{
+	return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+/* Remove the "reclaim" attribute file from @node's device. */
+void reclaim_unregister_node(struct node *node)
+{
+	device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif
-- 
2.26.2