The brk(), mmap(), mlock(), mlockall() and mprotect() syscalls are
modified to check the memcg_over_limit flag and return ENOMEM when the
flag is set and the memory control action is PR_MEMACT_ENOMEM. If the
action is PR_MEMACT_SLOWDOWN instead, an artificial 20ms delay is added
to slow down these memory allocation syscalls.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
 include/linux/sched.h | 16 ++++++++++++++++
 kernel/fork.c         |  1 +
 mm/memcontrol.c       | 25 +++++++++++++++++++++++--
 mm/mlock.c            |  6 ++++++
 mm/mmap.c             | 12 ++++++++++++
 mm/mprotect.c         |  3 +++
 6 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c79d606d27ab..9ec1bd072334 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1477,6 +1477,22 @@ static inline char task_state_to_char(struct task_struct *tsk)
         return task_index_to_char(task_state_index(tsk));
 }
 
+#ifdef CONFIG_MEMCG
+extern bool mem_cgroup_check_over_limit(void);
+
+static inline bool mem_over_memcg_limit(void)
+{
+        if (READ_ONCE(current->memcg_over_limit))
+                return mem_cgroup_check_over_limit();
+        return false;
+}
+#else
+static inline bool mem_over_memcg_limit(void)
+{
+        return false;
+}
+#endif
+
 /**
  * is_global_init - check if a task structure is init. Since init
  * is free to have sub-threads we need to check tgid.
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d32190861bd..61f9a9e5f857 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -940,6 +940,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 #ifdef CONFIG_MEMCG
         tsk->active_memcg = NULL;
+        tsk->memcg_over_limit = false;
 #endif
         return tsk;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1106dac024ac..5cad7bb26d13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2646,7 +2646,9 @@ static bool __mem_cgroup_over_high_action(struct mem_cgroup *memcg, u8 action)
         if (!mm)
                 return true;    /* No more check is needed */
 
-        current->memcg_over_limit = false;
+        if (READ_ONCE(current->memcg_over_limit))
+                WRITE_ONCE(current->memcg_over_limit, false);
+
         if ((action == PR_MEMACT_SIGNAL) && !signal)
                 goto out;
 
@@ -2660,7 +2662,11 @@ static bool __mem_cgroup_over_high_action(struct mem_cgroup *memcg, u8 action)
                 WRITE_ONCE(current->memcg_over_limit, true);
                 break;
         case PR_MEMACT_SLOWDOWN:
-                /* Slow down by yielding the cpu */
+                /*
+                 * Slow down by yielding the cpu & adding delay to
+                 * memory allocation syscalls.
+                 */
+                WRITE_ONCE(current->memcg_over_limit, true);
                 set_tsk_need_resched(current);
                 set_preempt_need_resched();
                 break;
@@ -2694,6 +2700,21 @@ static inline bool mem_cgroup_over_high_action(struct mem_cgroup *memcg)
         return __mem_cgroup_over_high_action(memcg, action);
 }
 
+/*
+ * Called from memory allocation syscalls.
+ * Return true if ENOMEM should be returned, false otherwise.
+ */
+bool mem_cgroup_check_over_limit(void)
+{
+        u8 action = READ_ONCE(current->memcg_over_high_action);
+
+        if (action == PR_MEMACT_ENOMEM)
+                return true;
+        if (action == PR_MEMACT_SLOWDOWN)
+                msleep(20);     /* Artificial delay of 20ms */
+        return false;
+}
+
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                       unsigned int nr_pages)
 {
diff --git a/mm/mlock.c b/mm/mlock.c
index 93ca2bf30b4f..130d4b3fa0f5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -678,6 +678,9 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
         if (!can_do_mlock())
                 return -EPERM;
 
+        if (mem_over_memcg_limit())
+                return -ENOMEM;
+
         len = PAGE_ALIGN(len + (offset_in_page(start)));
         start &= PAGE_MASK;
 
@@ -807,6 +810,9 @@ SYSCALL_DEFINE1(mlockall, int, flags)
         if (!can_do_mlock())
                 return -EPERM;
 
+        if (mem_over_memcg_limit())
+                return -ENOMEM;
+
         lock_limit = rlimit(RLIMIT_MEMLOCK);
         lock_limit >>= PAGE_SHIFT;
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 40248d84ad5f..873ccf2560a6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -198,6 +198,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
         bool downgraded = false;
         LIST_HEAD(uf);
 
+        /* Too much memory used? */
+        if (mem_over_memcg_limit())
+                return -ENOMEM;
+
         if (mmap_write_lock_killable(mm))
                 return -EINTR;
 
@@ -1407,6 +1411,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
         if (mm->map_count > sysctl_max_map_count)
                 return -ENOMEM;
 
+        /* Too much memory used? */
+        if (mem_over_memcg_limit())
+                return -ENOMEM;
+
         /* Obtain the address to map to. we verify (or select) it and ensure
          * that it represents a valid section of the address space.
          */
@@ -1557,6 +1565,10 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
         struct file *file = NULL;
         unsigned long retval;
 
+        /* Too much memory used? */
+        if (mem_over_memcg_limit())
+                return -ENOMEM;
+
         if (!(flags & MAP_ANONYMOUS)) {
                 audit_mmap_fd(fd, flags);
                 file = fget(fd);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ce8b8a5eacbb..b2c0f50bb0a0 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -519,6 +519,9 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
         const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
                                 (prot & PROT_READ);
 
+        if (mem_over_memcg_limit())
+                return -ENOMEM;
+
         start = untagged_addr(start);
 
         prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
-- 
2.18.1
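
For reviewers who want to observe the userspace-visible effect, below is
a minimal test sketch. It is NOT part of the patch: it assumes the
per-task action has already been armed through the prctl() interface
introduced earlier in this series, and the PR_SET_MEMCONTROL option
number and PR_MEMACT_SLOWDOWN value used here are placeholders that may
not match the final ABI. Once the task's memcg exceeds its memory.high
limit, mmap() should either fail with ENOMEM (under PR_MEMACT_ENOMEM)
or take roughly 20ms longer (under PR_MEMACT_SLOWDOWN):

/*
 * memact-test.c: illustrative sketch only, NOT part of this patch.
 * PR_SET_MEMCONTROL and PR_MEMACT_SLOWDOWN below are assumed values;
 * see the prctl patch in this series for the real ABI.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMCONTROL
#define PR_SET_MEMCONTROL   60          /* assumed option number */
#endif
#define PR_MEMACT_SLOWDOWN  3           /* assumed action value */

int main(void)
{
        struct timespec t0, t1;
        long ms;
        void *p;

        /* Arm the slowdown action for this task (assumed ABI). */
        if (prctl(PR_SET_MEMCONTROL, PR_MEMACT_SLOWDOWN, 0, 0, 0))
                perror("prctl");

        clock_gettime(CLOCK_MONOTONIC, &t0);
        p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        clock_gettime(CLOCK_MONOTONIC, &t1);

        if (p == MAP_FAILED) {
                /* Expected with PR_MEMACT_ENOMEM once over the limit. */
                fprintf(stderr, "mmap: %s\n", strerror(errno));
                return 1;
        }

        ms = (t1.tv_sec - t0.tv_sec) * 1000 +
             (t1.tv_nsec - t0.tv_nsec) / 1000000;
        /* ~20ms of extra latency here means the slowdown path was hit. */
        printf("mmap() latency: %ld ms\n", ms);

        munmap(p, 1 << 20);
        return 0;
}

Note that mem_over_memcg_limit() is checked at syscall entry, before any
real work is done, so under PR_MEMACT_SLOWDOWN even a trivial mapping
pays the full 20ms penalty for as long as the task stays flagged as
over its limit.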