* David Rientjes <rientjes@xxxxxxxxxx> [2010-12-21 23:27:25]:

> Completely disabling the oom killer for a memcg is problematic if
> userspace is unable to address the condition itself, usually because
> userspace is unresponsive.  This scenario creates a memcg livelock:
> tasks are continuously trying to allocate memory and nothing is getting
> killed, so memory freeing is impossible since reclaim has failed, and
> all work stalls with no remedy in sight.
>
> This patch adds an oom killer delay so that a memcg may be configured to
> wait at least a pre-defined number of milliseconds before calling the
> oom killer.  If the oom condition persists for this number of
> milliseconds, the oom killer will be called the next time the memory
> controller attempts to charge a page (and memory.oom_control is set to
> 0).  This allows userspace to have a short period of time to respond to
> the condition before timing out and deferring to the kernel to kill a
> task.
>
> Admins may set the oom killer timeout using the new interface:
>
> 	# echo 60000 > memory.oom_delay
>
> This will defer oom killing to the kernel only after 60 seconds has
> elapsed.  When setting memory.oom_delay, all pending timeouts are
> restarted.

I think Paul mentioned this problem and its solution (I believe it is
already in use at Google) some time back.  Is the x milliseconds a
per-oom-kill-decision timer, or is it global?
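To make the intended workflow concrete (as I read the changelog), a
management daemon would pair the existing oom_control knob with the new
one, something like the sketch below.  The cgroup mount point and memcg
path are assumptions of mine; the file names and values come from the
patch:

/*
 * Illustrative sketch only: disable the kernel oom killer for a memcg
 * but arm the proposed 60s fallback.  /dev/cgroup/jobs is a made-up
 * mount point, not something the patch defines.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int memcg_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* "1" disables the kernel oom killer (memory.oom_control) */
	memcg_write("/dev/cgroup/jobs/memory.oom_control", "1");
	/* ...but defer to it anyway if the oom lasts 60000ms */
	memcg_write("/dev/cgroup/jobs/memory.oom_delay", "60000");
	return 0;
}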
> Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx>
> ---
>  Documentation/cgroups/memory.txt |   15 ++++++++++
>  mm/memcontrol.c                  |   56 +++++++++++++++++++++++++++++++++----
>  2 files changed, 65 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
> --- a/Documentation/cgroups/memory.txt
> +++ b/Documentation/cgroups/memory.txt
> @@ -68,6 +68,7 @@ Brief summary of control files.
>  				 (See sysctl's vm.swappiness)
>   memory.move_charge_at_immigrate # set/show controls of moving charges
>   memory.oom_control		 # set/show oom controls.
> + memory.oom_delay		 # set/show millisecs to wait before oom kill
>
>  1. History
>
> @@ -640,6 +641,20 @@ At reading, current status of OOM is shown.
>  	under_oom	 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
>  			 be stopped.)
>
> +It is also possible to configure an oom killer timeout to prevent the
> +possibility that the memcg will livelock looking for memory if userspace
> +has disabled the oom killer with oom_control but cannot act to fix the
> +condition itself (usually because userspace has become unresponsive).
> +
> +To set an oom killer timeout for a memcg, write the number of milliseconds
> +to wait before killing a task to memory.oom_delay:
> +
> +	# echo 60000 > memory.oom_delay	# wait 60 seconds, then oom kill
> +
> +This timeout is reset the next time the memcg successfully charges memory
> +to a task.
> +
> +
>  11. TODO
>
>  1. Add support for accounting huge pages (as a separate controller)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -233,12 +233,16 @@ struct mem_cgroup {
>  	 * Should the accounting and control be hierarchical, per subtree?
>  	 */
>  	bool use_hierarchy;
> +	/* oom_delay has expired and still out of memory? */
> +	bool oom_kill_delay_expired;
>  	atomic_t	oom_lock;
>  	atomic_t	refcnt;
>
>  	unsigned int	swappiness;
>  	/* OOM-Killer disable */
>  	int		oom_kill_disable;
> +	/* min number of ticks to wait before calling oom killer */
> +	int		oom_kill_delay;
>
>  	/* set when res.limit == memsw.limit */
>  	bool		memsw_is_minimum;
> @@ -1524,6 +1528,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
>
>  static void memcg_oom_recover(struct mem_cgroup *mem)
>  {
> +	mem->oom_kill_delay_expired = false;
>  	if (mem && atomic_read(&mem->oom_lock))
>  		memcg_wakeup_oom(mem);
>  }
> @@ -1531,17 +1536,18 @@ static void memcg_oom_recover(struct mem_cgroup *mem)
>  /*
>   * try to call OOM killer. returns false if we should exit memory-reclaim loop.
>   */
> -bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
> +static bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
>  {
>  	struct oom_wait_info owait;
> -	bool locked, need_to_kill;
> +	bool locked;
> +	bool need_to_kill = true;
> +	bool need_to_delay = false;
>
>  	owait.mem = mem;
>  	owait.wait.flags = 0;
>  	owait.wait.func = memcg_oom_wake_function;
>  	owait.wait.private = current;
>  	INIT_LIST_HEAD(&owait.wait.task_list);
> -	need_to_kill = true;
>  	/* At first, try to OOM lock hierarchy under mem.*/
>  	mutex_lock(&memcg_oom_mutex);
>  	locked = mem_cgroup_oom_lock(mem);
> @@ -1553,26 +1559,34 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
>  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
>  	if (!locked || mem->oom_kill_disable)
>  		need_to_kill = false;
> -	if (locked)
> +	if (locked) {
>  		mem_cgroup_oom_notify(mem);
> +		if (mem->oom_kill_delay && !mem->oom_kill_delay_expired) {
> +			need_to_kill = false;
> +			need_to_delay = true;
> +		}
> +	}
>  	mutex_unlock(&memcg_oom_mutex);
>
>  	if (need_to_kill) {
>  		finish_wait(&memcg_oom_waitq, &owait.wait);
>  		mem_cgroup_out_of_memory(mem, mask);
>  	} else {
> -		schedule();
> +		schedule_timeout(need_to_delay ? mem->oom_kill_delay :
> +					 MAX_SCHEDULE_TIMEOUT);
>  		finish_wait(&memcg_oom_waitq, &owait.wait);
>  	}
>  	mutex_lock(&memcg_oom_mutex);
>  	mem_cgroup_oom_unlock(mem);
>  	memcg_wakeup_oom(mem);
> +	mem->oom_kill_delay_expired = need_to_delay;
>  	mutex_unlock(&memcg_oom_mutex);
>
>  	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
>  		return false;
>  	/* Give chance to dying process */
> -	schedule_timeout(1);
> +	if (!need_to_delay)
> +		schedule_timeout(1);
>  	return true;
>  }

I think we need additional statistics for tracking oom kills due to timer
expiry.
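Something along the lines of the fragment below is what I have in mind;
the counter name, its placement, and the read handler are hypothetical,
not part of the patch:

/*
 * Hypothetical sketch of such a statistic: count oom kills that only
 * happened because oom_delay ran out.  All names here are invented
 * for illustration.
 */
struct mem_cgroup {
	/* ... existing fields ... */
	/* number of oom kills forced by oom_delay expiry */
	atomic_long_t	oom_delay_kill_count;
};

/*
 * In mem_cgroup_handle_oom(), immediately before the call to
 * mem_cgroup_out_of_memory(): a kill that happens while
 * oom_kill_delay_expired is set is one the timer forced.
 */
if (mem->oom_kill_delay && mem->oom_kill_delay_expired)
	atomic_long_inc(&mem->oom_delay_kill_count);

/* Exported read-only, mirroring mem_cgroup_oom_delay_read() below: */
static u64 mem_cgroup_oom_delay_kill_read(struct cgroup *cgrp,
					  struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	return atomic_long_read(&memcg->oom_delay_kill_count);
}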
> @@ -2007,6 +2021,7 @@ again:
>  	refill_stock(mem, csize - PAGE_SIZE);
>  	css_put(&mem->css);
>  done:
> +	mem->oom_kill_delay_expired = false;
>  	*memcg = mem;
>  	return 0;
>  nomem:
> @@ -4053,6 +4068,29 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
>  	return 0;
>  }
>
> +static u64 mem_cgroup_oom_delay_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> +	return jiffies_to_msecs(memcg->oom_kill_delay);
> +}
> +
> +static int mem_cgroup_oom_delay_write(struct cgroup *cgrp, struct cftype *cft,
> +				      u64 val)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> +	/* Sanity check -- don't wait longer than an hour */
> +	if (val > (60 * 60 * 1000))
> +		return -EINVAL;

Why do this and not document it?  These sorts of things get extremely
confusing.  I would prefer not to have an upper bound without a reasonable
use case or very good documentation explaining why we need one.

> +
> +	cgroup_lock();
> +	memcg->oom_kill_delay = msecs_to_jiffies(val);
> +	memcg_oom_recover(memcg);
> +	cgroup_unlock();
> +	return 0;
> +}
> +
>  static struct cftype mem_cgroup_files[] = {
>  	{
>  		.name = "usage_in_bytes",
> @@ -4116,6 +4154,11 @@ static struct cftype mem_cgroup_files[] = {
>  		.unregister_event = mem_cgroup_oom_unregister_event,
>  		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
>  	},
> +	{
> +		.name = "oom_delay",
> +		.read_u64 = mem_cgroup_oom_delay_read,
> +		.write_u64 = mem_cgroup_oom_delay_write,
> +	},
>  };
>
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
> @@ -4357,6 +4400,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>  		parent = mem_cgroup_from_cont(cont->parent);
>  		mem->use_hierarchy = parent->use_hierarchy;
>  		mem->oom_kill_disable = parent->oom_kill_disable;
> +		mem->oom_kill_delay = parent->oom_kill_delay;
>  	}
>
>  	if (parent && parent->use_hierarchy) {

-- 
Three Cheers,
Balbir