On Thu, Mar 11, 2010 at 9:57 AM, KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote: > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> > > Considering containers or other resource management software in userland, > event notification of OOM in memcg should be implemented. > Now that memcg has a "threshold" notifier which uses eventfd, we can make > use of it for oom notification. > > This patch adds oom notification eventfd callback for memcg. The usage > is very similar to threshold notifier, but the control file is > memory.oom_control and no arguments other than eventfd are required. > > % cgroup_event_notifier /cgroup/A/memory.oom_control dummy > (About cgroup_event_notifier, see Documentation/cgroup/) > > TODO: > - add a knob to disable oom-kill under a memcg. > - add read/write function to oom_control > > Changelog: 20100309 > - split from threshold functions. use list rather than array. > - moved all to inside of mutex. > Changelog: 20100304 > - renewed implementation. > > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> Looks great! Two remarks below. Reviewed-by: Kirill A. Shutemov <kirill@xxxxxxxxxxxxx> > --- > Documentation/cgroups/memory.txt | 20 +++++++ > mm/memcontrol.c | 105 ++++++++++++++++++++++++++++++++++++--- > 2 files changed, 116 insertions(+), 9 deletions(-) > > Index: mmotm-2.6.34-Mar9/mm/memcontrol.c > =================================================================== > --- mmotm-2.6.34-Mar9.orig/mm/memcontrol.c > +++ mmotm-2.6.34-Mar9/mm/memcontrol.c > @@ -149,6 +149,7 @@ struct mem_cgroup_threshold { > u64 threshold; > }; > > +/* For threshold */ > struct mem_cgroup_threshold_ary { > /* An array index points to threshold just below usage. 
*/ > atomic_t current_threshold; > @@ -157,8 +158,14 @@ struct mem_cgroup_threshold_ary { > /* Array of thresholds */ > struct mem_cgroup_threshold entries[0]; > }; > +/* for OOM */ > +struct mem_cgroup_eventfd_list { > + struct list_head list; > + struct eventfd_ctx *eventfd; > +}; > > static void mem_cgroup_threshold(struct mem_cgroup *mem); > +static void mem_cgroup_oom_notify(struct mem_cgroup *mem); > > /* > * The memory controller data structure. The memory controller controls both > @@ -220,6 +227,9 @@ struct mem_cgroup { > /* thresholds for mem+swap usage. RCU-protected */ > struct mem_cgroup_threshold_ary *memsw_thresholds; > > + /* For oom notifier event fd */ > + struct list_head oom_notify; > + > /* > * Should we move charges of a task when a task is moved into this > * mem_cgroup ? And what type of charges should we move ? > @@ -282,9 +292,12 @@ enum charge_type { > /* for encoding cft->private value on file */ > #define _MEM (0) > #define _MEMSWAP (1) > +#define _OOM_TYPE (2) > #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) > #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) > #define MEMFILE_ATTR(val) ((val) & 0xffff) > +/* Used for OOM nofiier */ > +#define OOM_CONTROL (0) > > /* > * Reclaim flags for mem_cgroup_hierarchical_reclaim > @@ -1351,6 +1364,8 @@ bool mem_cgroup_handle_oom(struct mem_cg > */ > if (!locked) > prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); > + else > + mem_cgroup_oom_notify(mem); > mutex_unlock(&memcg_oom_mutex); > > if (locked) > @@ -3398,8 +3413,22 @@ static int compare_thresholds(const void > return _a->threshold - _b->threshold; > } > > -static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, > - struct eventfd_ctx *eventfd, const char *args) > +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) > +{ > + struct mem_cgroup_eventfd_list *ev; > + > + list_for_each_entry(ev, &mem->oom_notify, list) > + eventfd_signal(ev->eventfd, 1); > + return 0; > +} > + > 
+static void mem_cgroup_oom_notify(struct mem_cgroup *mem) > +{ > + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); > +} > + > +static int mem_cgroup_usage_register_event(struct cgroup *cgrp, > + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) > { > struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); > struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; > @@ -3483,8 +3512,8 @@ unlock: > return ret; > } > > -static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, > - struct eventfd_ctx *eventfd) > +static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, > + struct cftype *cft, struct eventfd_ctx *eventfd) > { > struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); > struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; > @@ -3568,13 +3597,66 @@ unlock: > return ret; > } > > +static int mem_cgroup_oom_register_event(struct cgroup *cgrp, > + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); > + struct mem_cgroup_eventfd_list *event; > + int type = MEMFILE_TYPE(cft->private); > + int ret = -ENOMEM; > + > + BUG_ON(type != _OOM_TYPE); > + > + mutex_lock(&memcg_oom_mutex); > + > + /* Allocate memory for new array of thresholds */ Irrelevant comment? > + event = kmalloc(sizeof(*event), GFP_KERNEL); > + if (!event) > + goto unlock; > + /* Add new threshold */ Ditto. > + event->eventfd = eventfd; > + list_add(&event->list, &memcg->oom_notify); > + > + /* already in OOM ? 
*/ > + if (atomic_read(&memcg->oom_lock)) > + eventfd_signal(eventfd, 1); > + ret = 0; > +unlock: > + mutex_unlock(&memcg_oom_mutex); > + > + return ret; > +} > + > +static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, > + struct cftype *cft, struct eventfd_ctx *eventfd) > +{ > + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); > + struct mem_cgroup_eventfd_list *ev, *tmp; > + int type = MEMFILE_TYPE(cft->private); > + > + BUG_ON(type != _OOM_TYPE); > + > + mutex_lock(&memcg_oom_mutex); > + > + list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { > + if (ev->eventfd == eventfd) { > + list_del(&ev->list); > + kfree(ev); > + } > + } > + > + mutex_unlock(&memcg_oom_mutex); > + > + return 0; > +} > + > static struct cftype mem_cgroup_files[] = { > { > .name = "usage_in_bytes", > .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), > .read_u64 = mem_cgroup_read, > - .register_event = mem_cgroup_register_event, > - .unregister_event = mem_cgroup_unregister_event, > + .register_event = mem_cgroup_usage_register_event, > + .unregister_event = mem_cgroup_usage_unregister_event, > }, > { > .name = "max_usage_in_bytes", > @@ -3623,6 +3705,12 @@ static struct cftype mem_cgroup_files[] > .read_u64 = mem_cgroup_move_charge_read, > .write_u64 = mem_cgroup_move_charge_write, > }, > + { > + .name = "oom_control", > + .register_event = mem_cgroup_oom_register_event, > + .unregister_event = mem_cgroup_oom_unregister_event, > + .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), > + }, > }; > > #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP > @@ -3631,8 +3719,8 @@ static struct cftype memsw_cgroup_files[ > .name = "memsw.usage_in_bytes", > .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), > .read_u64 = mem_cgroup_read, > - .register_event = mem_cgroup_register_event, > - .unregister_event = mem_cgroup_unregister_event, > + .register_event = mem_cgroup_usage_register_event, > + .unregister_event = mem_cgroup_usage_unregister_event, > }, > { > .name = "memsw.max_usage_in_bytes", > 
@@ -3876,6 +3964,7 @@ mem_cgroup_create(struct cgroup_subsys * > } > mem->last_scanned_child = 0; > spin_lock_init(&mem->reclaim_param_lock); > + INIT_LIST_HEAD(&mem->oom_notify); > > if (parent) > mem->swappiness = get_swappiness(parent); > Index: mmotm-2.6.34-Mar9/Documentation/cgroups/memory.txt > =================================================================== > --- mmotm-2.6.34-Mar9.orig/Documentation/cgroups/memory.txt > +++ mmotm-2.6.34-Mar9/Documentation/cgroups/memory.txt > @@ -184,6 +184,9 @@ limits on the root cgroup. > > Note2: When panic_on_oom is set to "2", the whole system will panic. > > +When an oom event notifier is registered, an event will be delivered. > +(See oom_control section) > + > 2. Locking > > The memory controller uses the following hierarchy > @@ -488,7 +491,22 @@ threshold in any direction. > > It's applicable for root and non-root cgroup. > > -10. TODO > +10. OOM Control > + > +The memory controller implements an oom notifier using the cgroup notification > +API (See cgroups.txt). It allows registering multiple oom notification > +deliveries and getting a notification when oom happens. > + > +To register a notifier, an application needs to: > + - create an eventfd using eventfd(2) > + - open memory.oom_control file > + - write string like "<event_fd> <memory.oom_control>" to cgroup.event_control > + > +The application will be notified through eventfd when oom happens. > +OOM notification doesn't work for root cgroup. > + > + > +11. TODO > > 1. Add support for accounting huge pages (as a separate controller) > 2. Make per-cgroup scanner reclaim not-shared pages first > > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href