On Mon, 16 Dec 2013 16:17:05 +0400 Vladimir Davydov <vdavydov@xxxxxxxxxxxxx> wrote: > From: Glauber Costa <glommer@xxxxxxxxxx> > > During the past weeks, it became clear to us that the shrinker interface > we have right now works very well for some particular types of users, > but not that well for others. The latter are usually people interested > in one-shot notifications, that were forced to adapt themselves to the > count+scan behavior of shrinkers. To do so, they had no choice than to > greatly abuse the shrinker interface producing little monsters all over. > > During LSF/MM, one of the proposals that popped out during our session > was to reuse Anton Voronstsov's vmpressure for this. They are designed > for userspace consumption, but also provide a well-stablished, > cgroup-aware entry point for notifications. I have the exact problem described above for a project I'm working on and this solution seems to solve it well. However, I had a few issues while trying to use this interface. I'll comment on them below, but please take this more as advice seeking than patch review. > This patch extends that to also support in-kernel users. Events that > should be generated for in-kernel consumption will be marked as such, > and for those, we will call a registered function instead of triggering > an eventfd notification. > > Please note that due to my lack of understanding of each shrinker user, > I will stay away from converting the actual users, you are all welcome > to do so. > > Signed-off-by: Glauber Costa <glommer@xxxxxxxxxx> > Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx> > Acked-by: Anton Vorontsov <anton@xxxxxxxxxx> > Acked-by: Pekka Enberg <penberg@xxxxxxxxxx> > Reviewed-by: Greg Thelen <gthelen@xxxxxxxxxx> > Cc: Dave Chinner <dchinner@xxxxxxxxxx> > Cc: John Stultz <john.stultz@xxxxxxxxxx> > Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > Cc: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx> > Cc: Michal Hocko <mhocko@xxxxxxx> > Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> > Cc: Johannes Weiner <hannes@xxxxxxxxxxx> > --- > include/linux/vmpressure.h | 5 +++++ > mm/vmpressure.c | 53 +++++++++++++++++++++++++++++++++++++++++--- > 2 files changed, 55 insertions(+), 3 deletions(-) > > diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h > index 3f3788d..9102e53 100644 > --- a/include/linux/vmpressure.h > +++ b/include/linux/vmpressure.h > @@ -19,6 +19,9 @@ struct vmpressure { > /* Have to grab the lock on events traversal or modifications. */ > struct mutex events_lock; > > + /* False if only kernel users want to be notified, true otherwise. */ > + bool notify_userspace; > + > struct work_struct work; > }; > > @@ -38,6 +41,8 @@ extern int vmpressure_register_event(struct cgroup_subsys_state *css, > struct cftype *cft, > struct eventfd_ctx *eventfd, > const char *args); > +extern int vmpressure_register_kernel_event(struct cgroup_subsys_state *css, > + void (*fn)(void)); > extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, > struct cftype *cft, > struct eventfd_ctx *eventfd); > diff --git a/mm/vmpressure.c b/mm/vmpressure.c > index e0f6283..730e7c1 100644 > --- a/mm/vmpressure.c > +++ b/mm/vmpressure.c > @@ -130,8 +130,12 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, > } > > struct vmpressure_event { > - struct eventfd_ctx *efd; > + union { > + struct eventfd_ctx *efd; > + void (*fn)(void); How does the callback access its private data? > + }; > enum vmpressure_levels level; > + bool kernel_event; > struct list_head node; > }; > > @@ -147,12 +151,15 @@ static bool vmpressure_event(struct vmpressure *vmpr, > mutex_lock(&vmpr->events_lock); > > list_for_each_entry(ev, &vmpr->events, node) { > - if (level >= ev->level) { > + if (ev->kernel_event) { > + ev->fn(); I think it would be interesting to pass 'level' to the callback (I'll probably use it myself), but we could wait for a in-tree user before adding it. > + } else if (vmpr->notify_userspace && level >= ev->level) { > eventfd_signal(ev->efd, 1); > signalled = true; > } > } > > + vmpr->notify_userspace = false; > mutex_unlock(&vmpr->events_lock); > > return signalled; > @@ -222,7 +229,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, > * we account it too. > */ > if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) > - return; > + goto schedule; > > /* > * If we got here with no pages scanned, then that is an indicator > @@ -239,8 +246,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, > vmpr->scanned += scanned; > vmpr->reclaimed += reclaimed; > scanned = vmpr->scanned; > + /* > + * If we didn't reach this point, only kernel events will be triggered. > + * It is the job of the worker thread to clean this up once the > + * notifications are all delivered. > + */ > + vmpr->notify_userspace = true; > spin_unlock(&vmpr->sr_lock); > > +schedule: > if (scanned < vmpressure_win) > return; > schedule_work(&vmpr->work); > @@ -324,6 +338,39 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, > } > > /** > + * vmpressure_register_kernel_event() - Register kernel-side notification > + * @css: css that is interested in vmpressure notifications > + * @fn: function to be called when pressure happens > + * > + * This function register in-kernel users interested in receiving notifications > + * about pressure conditions. Pressure notifications will be triggered at the > + * same time as userspace notifications (with no particular ordering relative > + * to it). > + * > + * Pressure notifications are a alternative method to shrinkers and will serve > + * well users that are interested in a one-shot notification, with a > + * well-defined cgroup aware interface. > + */ > +int vmpressure_register_kernel_event(struct cgroup_subsys_state *css, > + void (*fn)(void)) > +{ > + struct vmpressure *vmpr = css_to_vmpressure(css); This doesn't allow for css=NULL. What's the recommended way for a today's shrinker (which is not related to cgroups) to register with this API? Also, you don't seem to provide a way to de-register from the event. I hacked a patch to be able to use this, seems to work but it's a ugly hack: --- include/linux/vmpressure.h | 3 ++- mm/vmpressure.c | 13 +++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 9102e53..de416b6 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -42,7 +42,8 @@ extern int vmpressure_register_event(struct cgroup_subsys_state *css, struct eventfd_ctx *eventfd, const char *args); extern int vmpressure_register_kernel_event(struct cgroup_subsys_state *css, - void (*fn)(void)); + void (*fn)(void *data, int level), + void *data); extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 730e7c1..4ed0e85 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -132,9 +132,10 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, struct vmpressure_event { union { struct eventfd_ctx *efd; - void (*fn)(void); + void (*fn)(void *data, int level); }; enum vmpressure_levels level; + void *data; bool kernel_event; struct list_head node; }; @@ -152,7 +153,7 @@ static bool vmpressure_event(struct vmpressure *vmpr, list_for_each_entry(ev, &vmpr->events, node) { if (ev->kernel_event) { - ev->fn(); + ev->fn(ev->data, level); } else if (vmpr->notify_userspace && level >= ev->level) { eventfd_signal(ev->efd, 1); signalled = true; @@ -352,21 +353,25 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, * well-defined cgroup aware interface. */ int vmpressure_register_kernel_event(struct cgroup_subsys_state *css, - void (*fn)(void)) + void (*fn)(void *data, int level), void *data) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr; struct vmpressure_event *ev; + vmpr = css ? css_to_vmpressure(css) : memcg_to_vmpressure(NULL); + ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->kernel_event = true; + ev->data = data; ev->fn = fn; mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); + return 0; } -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>