On Fri, 2022-04-15 at 22:39 -0700, Davidlohr Bueso wrote: > This patch introduces a mechanism to trigger memory reclaim > as a per-node sysfs interface, inspired by compaction's > equivalent; ie: > > echo 1G > /sys/devices/system/node/nodeX/reclaim > I think it will be more flexible to specify a node mask as a parameter along with the amount of memory with the memory.reclaim memcg interface proposed by Yosry. Doing it node by node is more cumbersome. It is just a special case of reclaiming from the root cgroup for a specific node. Wei Gu, Ying and I have had some discussions on this: https://lore.kernel.org/all/df6110a09cacc80ee1cbe905a71273a5f3953e16.camel@xxxxxxxxxxxxxxx/ Tim > It is based on the discussions from David's thread[1] as > well as the current upstreaming of the memcg[2] interface > (which has nice explanations for the benefits of userspace > reclaim overall). In both cases conclusions were that either > way of inducing proactive reclaim should be KISS, and can be > later extended. So this patch does not allow the user much > fine tuning beyond the size of the reclaim, such as anon/file > or whether or semantics of demotion. > > [1] https://lore.kernel.org/all/5df21376-7dd1-bf81-8414-32a73cea45dd@xxxxxxxxxx/ > [2] https://lore.kernel.org/all/20220408045743.1432968-1-yosryahmed@xxxxxxxxxx/ > > Signed-off-by: Davidlohr Bueso <dave@xxxxxxxxxxxx> > --- > Documentation/ABI/stable/sysfs-devices-node | 10 ++++ > drivers/base/node.c | 2 + > include/linux/swap.h | 16 ++++++ > mm/vmscan.c | 59 +++++++++++++++++++++ > 4 files changed, 87 insertions(+) > > diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node > index 8db67aa472f1..3c935e1334f7 100644 > --- a/Documentation/ABI/stable/sysfs-devices-node > +++ b/Documentation/ABI/stable/sysfs-devices-node > @@ -182,3 +182,13 @@ Date: November 2021 > Contact: Jarkko Sakkinen <jarkko@xxxxxxxxxx> > Description: > The total amount of SGX physical memory in bytes. 
> + > +What: /sys/devices/system/node/nodeX/reclaim > +Date: April 2022 > +Contact: Davidlohr Bueso <dave@xxxxxxxxxxxx> > +Description: > + Write the amount of bytes to induce memory reclaim in this node. > + This file accepts a single key, the number of bytes to reclaim. > + When it completes successfully, the specified amount or more memory > + will have been reclaimed, and -EAGAIN if less bytes are reclaimed > + than the specified amount. > diff --git a/drivers/base/node.c b/drivers/base/node.c > index 6cdf25fd26c3..d80c478e2a6e 100644 > --- a/drivers/base/node.c > +++ b/drivers/base/node.c > @@ -670,6 +670,7 @@ static int register_node(struct node *node, int num) > > hugetlb_register_node(node); > compaction_register_node(node); > + reclaim_register_node(node); > return 0; > } > > @@ -685,6 +686,7 @@ void unregister_node(struct node *node) > hugetlb_unregister_node(node); /* no-op, if memoryless node */ > node_remove_accesses(node); > node_remove_caches(node); > + reclaim_unregister_node(node); > device_unregister(&node->dev); > } > > diff --git a/include/linux/swap.h b/include/linux/swap.h > index 27093b477c5f..cca43ae6d770 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -398,6 +398,22 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages); > extern int vm_swappiness; > long remove_mapping(struct address_space *mapping, struct folio *folio); > > +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) > +extern int reclaim_register_node(struct node *node); > +extern void reclaim_unregister_node(struct node *node); > + > +#else > + > +static inline int reclaim_register_node(struct node *node) > +{ > + return 0; > +} > + > +static inline void reclaim_unregister_node(struct node *node) > +{ > +} > +#endif /* CONFIG_SYSFS && CONFIG_NUMA */ > + > extern unsigned long reclaim_pages(struct list_head *page_list); > #ifdef CONFIG_NUMA > extern int node_reclaim_mode; > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 1735c302831c..3539f8a0f0ea 
100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -4819,3 +4819,62 @@ void check_move_unevictable_pages(struct pagevec *pvec) > } > } > EXPORT_SYMBOL_GPL(check_move_unevictable_pages); > + > +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) > +static ssize_t reclaim_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + int err, nid = dev->id; > + gfp_t gfp_mask = GFP_KERNEL; > + struct pglist_data *pgdat = NODE_DATA(nid); > + unsigned long nr_to_reclaim, nr_reclaimed = 0; > + unsigned int nr_retries = MAX_RECLAIM_RETRIES; > + struct scan_control sc = { > + .gfp_mask = current_gfp_context(gfp_mask), > + .reclaim_idx = gfp_zone(gfp_mask), > + .priority = NODE_RECLAIM_PRIORITY, > + .may_writepage = !laptop_mode, > + .may_unmap = 1, > + .may_swap = 1, > + }; > + > + buf = strstrip((char *)buf); > + err = page_counter_memparse(buf, "", &nr_to_reclaim); > + if (err) > + return err; > + > + sc.nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX); > + > + while (nr_reclaimed < nr_to_reclaim) { > + unsigned long reclaimed; > + > + if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) > + return -EAGAIN; > + > + /* does cond_resched() */ > + reclaimed = __node_reclaim(pgdat, gfp_mask, > + nr_to_reclaim - nr_reclaimed, &sc); > + > + clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); > + > + if (!reclaimed && !nr_retries--) > + break; > + > + nr_reclaimed += reclaimed; > + } > + > + return nr_reclaimed < nr_to_reclaim ? -EAGAIN : count; > +} > + > +static DEVICE_ATTR_WO(reclaim); > +int reclaim_register_node(struct node *node) > +{ > + return device_create_file(&node->dev, &dev_attr_reclaim); > +} > + > +void reclaim_unregister_node(struct node *node) > +{ > + return device_remove_file(&node->dev, &dev_attr_reclaim); > +} > +#endif