[Cc linux-api. Please always Cc this list when proposing a new user-visible API. Keeping the rest of the email intact for reference.] On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote: > Memory cgroup has no background memory reclaimer. Reclaim after exceeding > the high limit blocks the task because it runs synchronously in task work. > > This implements manual kswapd-style memory reclaim initiated by userspace. > It reclaims both physical memory and cgroup pages. It works in the context > of the task that calls the madvise syscall, thus CPU time is accounted > correctly. > > Interface: > > ret = madvise(ptr, size, MADV_STOCKPILE) > > Returns: > 0 - ok, free memory >= size > -EINVAL - not supported > -ENOMEM - not enough memory/cgroup limit > -EINTR - interrupted by pending signal > -EAGAIN - cannot reclaim enough memory > > Argument 'size' is interpreted as the size of required free memory. > The implementation triggers direct reclaim while the amount of free memory > is lower than that size. Argument 'ptr' could point to a vma for specifying > NUMA allocation policy; right now it should be NULL. > > Usage scenario: an independent thread or standalone daemon estimates the > rate of allocations and calls MADV_STOCKPILE in a loop to prepare free > pages. Thus the fast path avoids allocation latency induced by direct reclaim. > > We are using this embedded into a memory allocator based on MADV_FREE. 
> > > Demonstration in memory cgroup with limit 1G: > > touch zero > truncate -s 5G zero > > Without stockpile: > > perf stat -e vmscan:* md5sum zero > > Performance counter stats for 'md5sum zero': > > 0 vmscan:mm_vmscan_kswapd_sleep > 0 vmscan:mm_vmscan_kswapd_wake > 0 vmscan:mm_vmscan_wakeup_kswapd > 0 vmscan:mm_vmscan_direct_reclaim_begin > 10147 vmscan:mm_vmscan_memcg_reclaim_begin > 0 vmscan:mm_vmscan_memcg_softlimit_reclaim_begin > 0 vmscan:mm_vmscan_direct_reclaim_end > 10147 vmscan:mm_vmscan_memcg_reclaim_end > 0 vmscan:mm_vmscan_memcg_softlimit_reclaim_end > 99910 vmscan:mm_shrink_slab_start > 99910 vmscan:mm_shrink_slab_end > 39654 vmscan:mm_vmscan_lru_isolate > 0 vmscan:mm_vmscan_writepage > 39652 vmscan:mm_vmscan_lru_shrink_inactive > 2 vmscan:mm_vmscan_lru_shrink_active > 19982 vmscan:mm_vmscan_inactive_list_is_low > > 10.886832585 seconds time elapsed > > 8.928366000 seconds user > 1.935212000 seconds sys > > With stockpile: > > stockpile 100 10 & # up to 100M every 10ms > perf stat -e vmscan:* md5sum zero > > Performance counter stats for 'md5sum zero': > > 0 vmscan:mm_vmscan_kswapd_sleep > 0 vmscan:mm_vmscan_kswapd_wake > 0 vmscan:mm_vmscan_wakeup_kswapd > 0 vmscan:mm_vmscan_direct_reclaim_begin > 0 vmscan:mm_vmscan_memcg_reclaim_begin > 0 vmscan:mm_vmscan_memcg_softlimit_reclaim_begin > 0 vmscan:mm_vmscan_direct_reclaim_end > 0 vmscan:mm_vmscan_memcg_reclaim_end > 0 vmscan:mm_vmscan_memcg_softlimit_reclaim_end > 0 vmscan:mm_shrink_slab_start > 0 vmscan:mm_shrink_slab_end > 0 vmscan:mm_vmscan_lru_isolate > 0 vmscan:mm_vmscan_writepage > 0 vmscan:mm_vmscan_lru_shrink_inactive > 0 vmscan:mm_vmscan_lru_shrink_active > 0 vmscan:mm_vmscan_inactive_list_is_low > > 10.469776675 seconds time elapsed > > 8.976261000 seconds user > 1.491378000 seconds sys > > Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx> > --- > include/linux/memcontrol.h | 6 +++++ > include/uapi/asm-generic/mman-common.h | 2 ++ > mm/madvise.c | 39 
++++++++++++++++++++++++++++++ > mm/memcontrol.c | 41 ++++++++++++++++++++++++++++++++ > tools/vm/Makefile | 2 +- > tools/vm/stockpile.c | 30 +++++++++++++++++++++++ > 6 files changed, 119 insertions(+), 1 deletion(-) > create mode 100644 tools/vm/stockpile.c > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index bc74d6a4407c..25325f18ad55 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -517,6 +517,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, > } > > void mem_cgroup_handle_over_high(void); > +int mem_cgroup_stockpile(unsigned long goal_pages); > > unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); > > @@ -968,6 +969,11 @@ static inline void mem_cgroup_handle_over_high(void) > { > } > > +static inline int mem_cgroup_stockpile(unsigned long goal_page) > +{ > + return 0; > +} > + > static inline void mem_cgroup_enter_user_fault(void) > { > } > diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h > index abd238d0f7a4..675145864fee 100644 > --- a/include/uapi/asm-generic/mman-common.h > +++ b/include/uapi/asm-generic/mman-common.h > @@ -64,6 +64,8 @@ > #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ > #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ > > +#define MADV_STOCKPILE 20 /* stockpile free pages */ > + > /* compatibility flags */ > #define MAP_FILE 0 > > diff --git a/mm/madvise.c b/mm/madvise.c > index 628022e674a7..f908b08ecc9f 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -686,6 +686,41 @@ static int madvise_inject_error(int behavior, > } > #endif > > +static long madvise_stockpile(unsigned long start, size_t len) > +{ > + unsigned long goal_pages, progress; > + struct zonelist *zonelist; > + int ret; > + > + if (start) > + return -EINVAL; > + > + goal_pages = len >> PAGE_SHIFT; > + > + if (goal_pages > totalram_pages() - totalreserve_pages) > + return -ENOMEM; > + > + ret = 
mem_cgroup_stockpile(goal_pages); > + if (ret) > + return ret; > + > + /* TODO: use vma mempolicy */ > + zonelist = node_zonelist(numa_node_id(), GFP_HIGHUSER); > + > + while (global_zone_page_state(NR_FREE_PAGES) < > + goal_pages + totalreserve_pages) { > + > + if (signal_pending(current)) > + return -EINTR; > + > + progress = try_to_free_pages(zonelist, 0, GFP_HIGHUSER, NULL); > + if (!progress) > + return -EAGAIN; > + } > + > + return 0; > +} > + > static long > madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, > unsigned long start, unsigned long end, int behavior) > @@ -728,6 +763,7 @@ madvise_behavior_valid(int behavior) > case MADV_DODUMP: > case MADV_WIPEONFORK: > case MADV_KEEPONFORK: > + case MADV_STOCKPILE: > #ifdef CONFIG_MEMORY_FAILURE > case MADV_SOFT_OFFLINE: > case MADV_HWPOISON: > @@ -834,6 +870,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) > return madvise_inject_error(behavior, start, start + len_in); > #endif > > + if (behavior == MADV_STOCKPILE) > + return madvise_stockpile(start, len); > + > write = madvise_need_mmap_write(behavior); > if (write) { > if (down_write_killable(&current->mm->mmap_sem)) > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index e50a2db5b4ff..dc23dc6bbeb3 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -2276,6 +2276,47 @@ void mem_cgroup_handle_over_high(void) > current->memcg_nr_pages_over_high = 0; > } > > +int mem_cgroup_stockpile(unsigned long goal_pages) > +{ > + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; > + unsigned long limit, nr_free, progress; > + struct mem_cgroup *memcg, *pos; > + int ret = 0; > + > + pos = memcg = get_mem_cgroup_from_mm(current->mm); > + > +retry: > + if (signal_pending(current)) { > + ret = -EINTR; > + goto out; > + } > + > + limit = min(pos->memory.max, pos->high); > + if (goal_pages > limit) { > + ret = -ENOMEM; > + goto out; > + } > + > + nr_free = limit - page_counter_read(&pos->memory); > + if ((long)nr_free < 
(long)goal_pages) { > + progress = try_to_free_mem_cgroup_pages(pos, > + goal_pages - nr_free, GFP_HIGHUSER, true); > + if (progress || nr_retries--) > + goto retry; > + ret = -EAGAIN; > + goto out; > + } > + > + nr_retries = MEM_CGROUP_RECLAIM_RETRIES; > + pos = parent_mem_cgroup(pos); > + if (pos) > + goto retry; > + > +out: > + css_put(&memcg->css); > + return ret; > +} > + > static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, > unsigned int nr_pages) > { > diff --git a/tools/vm/Makefile b/tools/vm/Makefile > index 20f6cf04377f..e5b5bc0d9421 100644 > --- a/tools/vm/Makefile > +++ b/tools/vm/Makefile > @@ -1,7 +1,7 @@ > # SPDX-License-Identifier: GPL-2.0 > # Makefile for vm tools > # > -TARGETS=page-types slabinfo page_owner_sort > +TARGETS=page-types slabinfo page_owner_sort stockpile > > LIB_DIR = ../lib/api > LIBS = $(LIB_DIR)/libapi.a > diff --git a/tools/vm/stockpile.c b/tools/vm/stockpile.c > new file mode 100644 > index 000000000000..245e24f293ec > --- /dev/null > +++ b/tools/vm/stockpile.c > @@ -0,0 +1,30 @@ > +// SPDX-License-Identifier: GPL-2.0 > +#include <sys/mman.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <err.h> > +#include <errno.h> > + > +#ifndef MADV_STOCKPILE > +# define MADV_STOCKPILE 20 > +#endif > + > +int main(int argc, char **argv) > +{ > + int interval; > + size_t size; > + int ret; > + > + if (argc != 3) > + errx(1, "usage: %s <size_mb> <interval_ms>", argv[0]); > + > + size = atol(argv[1]) << 20; > + interval = atoi(argv[2]) * 1000; > + > + while (1) { > + ret = madvise(NULL, size, MADV_STOCKPILE); > + if (ret && errno != EAGAIN) > + err(2, "madvise(NULL, %zu, MADV_STOCKPILE)", size); > + usleep(interval); > + } > +} > -- Michal Hocko SUSE Labs