On Tue, Oct 21, 2014 at 02:22:39PM -0400, Johannes Weiner wrote:
> On Tue, Oct 21, 2014 at 05:15:50PM +0400, Vladimir Davydov wrote:
> > mem_cgroup_reclaimable() checks whether a cgroup has reclaimable pages
> > on *any* NUMA node. However, the only place where it's called is
> > mem_cgroup_soft_reclaim(), which tries to reclaim memory from a
> > *specific* zone. So the way it's used is incorrect - it will return
> > true even if the cgroup doesn't have pages on the zone we're scanning.
> >
> > I think we can get rid of this check completely, because
> > mem_cgroup_shrink_node_zone(), which is called by
> > mem_cgroup_soft_reclaim() if mem_cgroup_reclaimable() returns true, is
> > equivalent to shrink_lruvec(), which exits almost immediately if the
> > lruvec passed to it is empty. So there's no need to optimize anything
> > here. Besides, we don't have such a check in the general scan path
> > (shrink_zone) either.
> >
> > Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
>
> Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx>
>
> How about this on top?
>
> ---
>
> From 27bd24b00433d9f6c8d60ba2b13dbff158b06c13 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <hannes@xxxxxxxxxxx>
> Date: Tue, 21 Oct 2014 09:53:54 -0400
> Subject: [patch] mm: memcontrol: do not filter reclaimable nodes in NUMA
>  round-robin
>
> The round-robin node reclaim currently tries to include only nodes
> that have memory of the memcg in question, which is quite elaborate.
>
> Just use plain round-robin over the nodes that are allowed by the
> task's cpuset, which are the most likely to contain that memcg's
> memory. But even if zones without memcg memory are encountered,
> direct reclaim will skip over them without too much hassle.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

Totally agree.

Acked-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
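To make the "reclaim just skips over empty nodes" argument concrete, here is a
throwaway userspace toy model of the early exit. Everything in it (the toy_*
names, the list layout, the page counts) is made up for illustration; it is
only a loose sketch of the behaviour of shrink_lruvec(), not the real function:

#include <stdio.h>

enum toy_lru { TOY_INACTIVE_ANON, TOY_ACTIVE_ANON,
	       TOY_INACTIVE_FILE, TOY_ACTIVE_FILE, TOY_NR_LISTS };

struct toy_lruvec {
	unsigned long nr_pages[TOY_NR_LISTS];
};

/*
 * Toy stand-in for shrink_lruvec(): the work done is proportional to
 * the number of pages actually sitting on the LRU lists, so an empty
 * lruvec costs little more than the loop overhead itself.
 */
static unsigned long toy_shrink_lruvec(const struct toy_lruvec *lruvec)
{
	unsigned long reclaimed = 0;
	int lru;

	for (lru = 0; lru < TOY_NR_LISTS; lru++) {
		unsigned long nr_to_scan = lruvec->nr_pages[lru];

		if (!nr_to_scan)
			continue;	/* nothing on this list, move on */
		/* isolation/reclaim/putback would happen here */
		reclaimed += nr_to_scan;
	}
	return reclaimed;
}

int main(void)
{
	struct toy_lruvec empty = { { 0, 0, 0, 0 } };
	struct toy_lruvec busy  = { { 0, 0, 128, 32 } };

	printf("empty lruvec: reclaimed %lu\n", toy_shrink_lruvec(&empty));
	printf("busy lruvec:  reclaimed %lu\n", toy_shrink_lruvec(&busy));
	return 0;
}

The point is simply that when every list is empty the per-LRU loop finds
nothing to scan and falls through, so an up-front "is this node reclaimable?"
filter saves next to nothing.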
> ---
>  mm/memcontrol.c | 97 +++------------------------------------------------------
>  1 file changed, 5 insertions(+), 92 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d353d9e1fdca..293db8234179 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -54,6 +54,7 @@
>  #include <linux/page_cgroup.h>
>  #include <linux/cpu.h>
>  #include <linux/oom.h>
> +#include <linux/cpuset.h>
>  #include <linux/lockdep.h>
>  #include <linux/file.h>
>  #include "internal.h"
> @@ -129,12 +130,10 @@ static const char * const mem_cgroup_lru_names[] = {
>  enum mem_cgroup_events_target {
>          MEM_CGROUP_TARGET_THRESH,
>          MEM_CGROUP_TARGET_SOFTLIMIT,
> -        MEM_CGROUP_TARGET_NUMAINFO,
>          MEM_CGROUP_NTARGETS,
>  };
>  #define THRESHOLDS_EVENTS_TARGET 128
>  #define SOFTLIMIT_EVENTS_TARGET 1024
> -#define NUMAINFO_EVENTS_TARGET 1024
>
>  struct mem_cgroup_stat_cpu {
>          long count[MEM_CGROUP_STAT_NSTATS];
> @@ -352,11 +351,6 @@ struct mem_cgroup {
>  #endif
>
>          int last_scanned_node;
> -#if MAX_NUMNODES > 1
> -        nodemask_t scan_nodes;
> -        atomic_t numainfo_events;
> -        atomic_t numainfo_updating;
> -#endif
>
>          /* List of events which userspace want to receive */
>          struct list_head event_list;
> @@ -965,9 +959,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>          case MEM_CGROUP_TARGET_SOFTLIMIT:
>                  next = val + SOFTLIMIT_EVENTS_TARGET;
>                  break;
> -        case MEM_CGROUP_TARGET_NUMAINFO:
> -                next = val + NUMAINFO_EVENTS_TARGET;
> -                break;
>          default:
>                  break;
>          }
> @@ -986,22 +977,10 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
>          /* threshold event is triggered in finer grain than soft limit */
>          if (unlikely(mem_cgroup_event_ratelimit(memcg,
>                                                  MEM_CGROUP_TARGET_THRESH))) {
> -                bool do_softlimit;
> -                bool do_numainfo __maybe_unused;
> -
> -                do_softlimit = mem_cgroup_event_ratelimit(memcg,
> -                                                MEM_CGROUP_TARGET_SOFTLIMIT);
> -#if MAX_NUMNODES > 1
> -                do_numainfo = mem_cgroup_event_ratelimit(memcg,
> -                                                MEM_CGROUP_TARGET_NUMAINFO);
> -#endif
>                  mem_cgroup_threshold(memcg);
> -                if (unlikely(do_softlimit))
> +                if (mem_cgroup_event_ratelimit(memcg,
> +                                               MEM_CGROUP_TARGET_SOFTLIMIT))
>                          mem_cgroup_update_tree(memcg, page);
> -#if MAX_NUMNODES > 1
> -                if (unlikely(do_numainfo))
> -                        atomic_inc(&memcg->numainfo_events);
> -#endif
>          }
>  }
>
> @@ -1708,61 +1687,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>                           NULL, "Memory cgroup out of memory");
>  }
>
> -/**
> - * test_mem_cgroup_node_reclaimable
> - * @memcg: the target memcg
> - * @nid: the node ID to be checked.
> - * @noswap : specify true here if the user wants flle only information.
> - *
> - * This function returns whether the specified memcg contains any
> - * reclaimable pages on a node. Returns true if there are any reclaimable
> - * pages in the node.
> - */
> -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
> -                int nid, bool noswap)
> -{
> -        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
> -                return true;
> -        if (noswap || !total_swap_pages)
> -                return false;
> -        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
> -                return true;
> -        return false;
> -
> -}
>  #if MAX_NUMNODES > 1
> -
> -/*
> - * Always updating the nodemask is not very good - even if we have an empty
> - * list or the wrong list here, we can start from some node and traverse all
> - * nodes based on the zonelist. So update the list loosely once per 10 secs.
> - *
> - */
> -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
> -{
> -        int nid;
> -        /*
> -         * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
> -         * pagein/pageout changes since the last update.
> -         */
> -        if (!atomic_read(&memcg->numainfo_events))
> -                return;
> -        if (atomic_inc_return(&memcg->numainfo_updating) > 1)
> -                return;
> -
> -        /* make a nodemask where this memcg uses memory from */
> -        memcg->scan_nodes = node_states[N_MEMORY];
> -
> -        for_each_node_mask(nid, node_states[N_MEMORY]) {
> -
> -                if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
> -                        node_clear(nid, memcg->scan_nodes);
> -        }
> -
> -        atomic_set(&memcg->numainfo_events, 0);
> -        atomic_set(&memcg->numainfo_updating, 0);
> -}
> -
>  /*
>   * Selecting a node where we start reclaim from. Because what we need is just
>   * reducing usage counter, start from anywhere is O,K. Considering
> @@ -1779,21 +1704,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
>  {
>          int node;
>
> -        mem_cgroup_may_update_nodemask(memcg);
> -        node = memcg->last_scanned_node;
> -
> -        node = next_node(node, memcg->scan_nodes);
> +        node = next_node(memcg->last_scanned_node, cpuset_current_mems_allowed);
>          if (node == MAX_NUMNODES)
> -                node = first_node(memcg->scan_nodes);
> -        /*
> -         * We call this when we hit limit, not when pages are added to LRU.
> -         * No LRU may hold pages because all pages are UNEVICTABLE or
> -         * memcg is too small and all pages are not on LRU. In that case,
> -         * we use curret node.
> -         */
> -        if (unlikely(node == MAX_NUMNODES))
> -                node = numa_node_id();
> -
> +                node = first_node(cpuset_current_mems_allowed);
>          memcg->last_scanned_node = node;
>          return node;
>  }
> --
> 2.1.2
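Just to spell out the wrap-around behaviour of the new
mem_cgroup_select_victim_node() for anyone reading along, here is a small
userspace sketch. The toy_* helpers are made-up stand-ins for the kernel's
nodemask API (next_node()/first_node()), and a plain unsigned int plays the
role of cpuset_current_mems_allowed; it only models the selection logic, not
the real code:

#include <stdio.h>

#define TOY_MAX_NUMNODES 8

/* Stand-in for next_node(): lowest allowed node greater than n, or TOY_MAX_NUMNODES. */
static int toy_next_node(int n, unsigned int allowed)
{
	int nid;

	for (nid = n + 1; nid < TOY_MAX_NUMNODES; nid++)
		if (allowed & (1u << nid))
			return nid;
	return TOY_MAX_NUMNODES;
}

/* Stand-in for first_node(): lowest allowed node. */
static int toy_first_node(unsigned int allowed)
{
	return toy_next_node(-1, allowed);
}

/* The post-patch selection logic: plain round-robin with wrap-around. */
static int toy_select_victim_node(int *last_scanned, unsigned int allowed)
{
	int node = toy_next_node(*last_scanned, allowed);

	if (node == TOY_MAX_NUMNODES)
		node = toy_first_node(allowed);
	*last_scanned = node;
	return node;
}

int main(void)
{
	unsigned int allowed = (1u << 1) | (1u << 3) | (1u << 5); /* cpuset allows nodes 1, 3, 5 */
	int last = TOY_MAX_NUMNODES; /* "nothing scanned yet" */
	int i;

	for (i = 0; i < 6; i++)
		printf("reclaim pass %d -> node %d\n",
		       i, toy_select_victim_node(&last, allowed));
	return 0; /* cycles through nodes 1, 3, 5, 1, 3, 5 */
}

So each call simply advances through the cpuset-allowed nodes and wraps back
to the first one, with no reclaimability bookkeeping at all, which is exactly
what the changelog promises.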