On Fri, 3 Jun 2011 09:12:16 -0700 Greg Thelen <gthelen@xxxxxxxxxx> wrote: > Introduce memcg routines to assist in per-memcg dirty page management: > > - mem_cgroup_balance_dirty_pages() walks a memcg hierarchy comparing > dirty memory usage against memcg foreground and background thresholds. > If an over-background-threshold memcg is found, then per-memcg > background writeback is queued. Per-memcg writeback differs from > classic, non-memcg, per bdi writeback by setting the new > writeback_control.for_cgroup bit. > > If an over-foreground-threshold memcg is found, then foreground > writeout occurs. When performing foreground writeout, first consider > inodes exclusive to the memcg. If unable to make enough progress, > then consider inodes shared between memcg. Such cross-memcg inode > sharing likely to be rare in situations that use per-cgroup memory > isolation. The approach tries to handle the common (non-shared) > case well without punishing well behaved (non-sharing) cgroups. > As a last resort writeback shared inodes. > > This routine is used by balance_dirty_pages() in a later change. > > - mem_cgroup_hierarchical_dirty_info() returns the dirty memory usage > and limits of the memcg closest to (or over) its dirty limit. This > will be used by throttle_vm_writeout() in a latter change. > > Signed-off-by: Greg Thelen <gthelen@xxxxxxxxxx> > --- > Changelog since v7: > - Add more detail to commit description. > > - Declare the new writeback_control for_cgroup bit in this change, the > first patch that uses the new field is first used. In -v7 the field > was declared in a separate patch. 
> > include/linux/memcontrol.h | 18 +++++ > include/linux/writeback.h | 1 + > include/trace/events/memcontrol.h | 83 ++++++++++++++++++++ > mm/memcontrol.c | 150 +++++++++++++++++++++++++++++++++++++ > 4 files changed, 252 insertions(+), 0 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 3d72e09..0d0363e 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -167,6 +167,11 @@ bool should_writeback_mem_cgroup_inode(struct inode *inode, > struct writeback_control *wbc); > bool mem_cgroups_over_bground_dirty_thresh(void); > void mem_cgroup_writeback_done(void); > +bool mem_cgroup_hierarchical_dirty_info(unsigned long sys_available_mem, > + struct mem_cgroup *mem, > + struct dirty_info *info); > +void mem_cgroup_balance_dirty_pages(struct address_space *mapping, > + unsigned long write_chunk); > > unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > gfp_t gfp_mask, > @@ -383,6 +388,19 @@ static inline void mem_cgroup_writeback_done(void) > { > } > > +static inline void mem_cgroup_balance_dirty_pages(struct address_space *mapping, > + unsigned long write_chunk) > +{ > +} > + > +static inline bool > +mem_cgroup_hierarchical_dirty_info(unsigned long sys_available_mem, > + struct mem_cgroup *mem, > + struct dirty_info *info) > +{ > + return false; > +} > + > static inline > unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > gfp_t gfp_mask, > diff --git a/include/linux/writeback.h b/include/linux/writeback.h > index 66ec339..4f5c0d2 100644 > --- a/include/linux/writeback.h > +++ b/include/linux/writeback.h > @@ -47,6 +47,7 @@ struct writeback_control { > unsigned for_reclaim:1; /* Invoked from the page allocator */ > unsigned range_cyclic:1; /* range_start is cyclic */ > unsigned more_io:1; /* more io to be dispatched */ > + unsigned for_cgroup:1; /* enable cgroup writeback */ > unsigned shared_inodes:1; /* write inodes spanning cgroups */ > }; > > diff 
--git a/include/trace/events/memcontrol.h b/include/trace/events/memcontrol.h > index 326a66b..b42dae1 100644 > --- a/include/trace/events/memcontrol.h > +++ b/include/trace/events/memcontrol.h > @@ -109,6 +109,89 @@ TRACE_EVENT(mem_cgroups_over_bground_dirty_thresh, > __entry->first_id) > ) > > +DECLARE_EVENT_CLASS(mem_cgroup_consider_writeback, > + TP_PROTO(unsigned short css_id, > + struct backing_dev_info *bdi, > + unsigned long nr_reclaimable, > + unsigned long thresh, > + bool over_limit), > + > + TP_ARGS(css_id, bdi, nr_reclaimable, thresh, over_limit), > + > + TP_STRUCT__entry( > + __field(unsigned short, css_id) > + __field(struct backing_dev_info *, bdi) > + __field(unsigned long, nr_reclaimable) > + __field(unsigned long, thresh) > + __field(bool, over_limit) > + ), > + > + TP_fast_assign( > + __entry->css_id = css_id; > + __entry->bdi = bdi; > + __entry->nr_reclaimable = nr_reclaimable; > + __entry->thresh = thresh; > + __entry->over_limit = over_limit; > + ), > + > + TP_printk("css_id=%d bdi=%p nr_reclaimable=%ld thresh=%ld " > + "over_limit=%d", __entry->css_id, __entry->bdi, > + __entry->nr_reclaimable, __entry->thresh, __entry->over_limit) > +) > + > +#define DEFINE_MEM_CGROUP_CONSIDER_WRITEBACK_EVENT(name) \ > +DEFINE_EVENT(mem_cgroup_consider_writeback, name, \ > + TP_PROTO(unsigned short id, \ > + struct backing_dev_info *bdi, \ > + unsigned long nr_reclaimable, \ > + unsigned long thresh, \ > + bool over_limit), \ > + TP_ARGS(id, bdi, nr_reclaimable, thresh, over_limit) \ > +) > + > +DEFINE_MEM_CGROUP_CONSIDER_WRITEBACK_EVENT(mem_cgroup_consider_bg_writeback); > +DEFINE_MEM_CGROUP_CONSIDER_WRITEBACK_EVENT(mem_cgroup_consider_fg_writeback); > + > +TRACE_EVENT(mem_cgroup_fg_writeback, > + TP_PROTO(unsigned long write_chunk, > + struct writeback_control *wbc), > + > + TP_ARGS(write_chunk, wbc), > + > + TP_STRUCT__entry( > + __field(unsigned long, write_chunk) > + __field(long, wbc_to_write) > + __field(bool, shared_inodes) > + ), > + > + 
TP_fast_assign( > + __entry->write_chunk = write_chunk; > + __entry->wbc_to_write = wbc->nr_to_write; > + __entry->shared_inodes = wbc->shared_inodes; > + ), > + > + TP_printk("write_chunk=%ld nr_to_write=%ld shared_inodes=%d", > + __entry->write_chunk, > + __entry->wbc_to_write, > + __entry->shared_inodes) > +) > + > +TRACE_EVENT(mem_cgroup_enable_shared_writeback, > + TP_PROTO(unsigned short css_id), > + > + TP_ARGS(css_id), > + > + TP_STRUCT__entry( > + __field(unsigned short, css_id) > + ), > + > + TP_fast_assign( > + __entry->css_id = css_id; > + ), > + > + TP_printk("enabling shared writeback for memcg %d", __entry->css_id) > +) > + > #endif /* _TRACE_MEMCONTROL_H */ > > /* This part must be outside protection */ > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index a5b1794..17cb888 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -1622,6 +1622,156 @@ void mem_cgroup_writeback_done(void) > } > } > > +/* > + * This routine must be called by processes which are generating dirty pages. > + * It considers the dirty pages usage and thresholds of the current cgroup and > + * (depending if hierarchical accounting is enabled) ancestral memcg. If any of > + * the considered memcg are over their background dirty limit, then background > + * writeback is queued. If any are over the foreground dirty limit then > + * throttle the dirtying task while writing dirty data. The per-memcg dirty > + * limits check by this routine are distinct from either the per-system, > + * per-bdi, or per-task limits considered by balance_dirty_pages(). 
> + */ > +void mem_cgroup_balance_dirty_pages(struct address_space *mapping, > + unsigned long write_chunk) > +{ > + struct backing_dev_info *bdi = mapping->backing_dev_info; > + struct mem_cgroup *mem; > + struct mem_cgroup *ref_mem; > + struct dirty_info info; > + unsigned long nr_reclaimable; > + unsigned long sys_available_mem; > + unsigned long pause = 1; > + unsigned short id; > + bool over; > + bool shared_inodes; > + > + if (mem_cgroup_disabled()) > + return; > + > + sys_available_mem = determine_dirtyable_memory(); > + > + /* reference the memcg so it is not deleted during this routine */ > + rcu_read_lock(); > + mem = mem_cgroup_from_task(current); > + if (mem && mem_cgroup_is_root(mem)) > + mem = NULL; > + if (mem) > + css_get(&mem->css); > + rcu_read_unlock(); > + ref_mem = mem; > + > + /* balance entire ancestry of current's mem. */ > + for (; mem_cgroup_has_dirty_limit(mem); mem = parent_mem_cgroup(mem)) { > + id = css_id(&mem->css); > + Hmm, this sounds natural... but don't we need to restart checking from ref_mem's dirty_ratio once we find that an ancestor is over its dirty_ratio and we have slept? Even if the parent's dirty ratio returns to a clean state, the children's may not. So I think we need some "restart loop" jump after io_schedule_timeout(). Thanks, -Kame > + /* > + * keep throttling and writing inode data so long as mem is over > + * its dirty limit. > + */ > + for (shared_inodes = false; ; ) { > + struct writeback_control wbc = { > + .sync_mode = WB_SYNC_NONE, > + .older_than_this = NULL, > + .range_cyclic = 1, > + .for_cgroup = 1, > + .nr_to_write = write_chunk, > + .shared_inodes = shared_inodes, > + }; > + > + /* > + * if mem is under dirty limit, then break from > + * throttling loop.
> + */ > + mem_cgroup_dirty_info(sys_available_mem, mem, &info); > + nr_reclaimable = dirty_info_reclaimable(&info); > + over = nr_reclaimable > info.dirty_thresh; > + trace_mem_cgroup_consider_fg_writeback( > + id, bdi, nr_reclaimable, info.dirty_thresh, > + over); > + if (!over) > + break; > + > + mem_cgroup_mark_over_bg_thresh(mem); > + writeback_inodes_wb(&bdi->wb, &wbc); > + trace_mem_cgroup_fg_writeback(write_chunk, &wbc); > + /* if no progress, then consider shared inodes */ > + if ((wbc.nr_to_write == write_chunk) && > + !shared_inodes) { > + trace_mem_cgroup_enable_shared_writeback(id); > + shared_inodes = true; > + } > + > + /* > + * Sleep up to 100ms to throttle writer and wait for > + * queued background I/O to complete. > + */ > + __set_current_state(TASK_UNINTERRUPTIBLE); > + io_schedule_timeout(pause); > + pause <<= 1; > + if (pause > HZ / 10) > + pause = HZ / 10; Hmm, is this exponential back-off "pause" taken from mm/page-writeback.c? I'd be happy if we could share the code. (But it's OK to keep it separate if sharing makes things messy.) > + } > + > + /* if mem is over background limit, then queue bg writeback */ > + over = nr_reclaimable >= info.background_thresh; > + trace_mem_cgroup_consider_bg_writeback( > + id, bdi, nr_reclaimable, info.background_thresh, > + over); > + if (over) > + mem_cgroup_queue_bg_writeback(mem, bdi); > + } > + > + if (ref_mem) > + css_put(&ref_mem->css); > +} > + > +/* > + * Return the dirty thresholds and usage for the mem (within the ancestral chain > + * of @mem) closest to its dirty limit or the first memcg over its limit. > + * > + * The check is not stable because the usage and limits can change asynchronous > + * to this routine.
> + */ > +bool mem_cgroup_hierarchical_dirty_info(unsigned long sys_available_mem, > + struct mem_cgroup *mem, > + struct dirty_info *info) > +{ > + unsigned long usage; > + struct dirty_info uninitialized_var(cur_info); > + > + if (mem_cgroup_disabled()) > + return false; > + > + info->nr_writeback = ULONG_MAX; /* invalid initial value */ > + > + /* walk up hierarchy enabled parents */ > + for (; mem_cgroup_has_dirty_limit(mem); mem = parent_mem_cgroup(mem)) { > + mem_cgroup_dirty_info(sys_available_mem, mem, &cur_info); > + usage = dirty_info_reclaimable(&cur_info) + > + cur_info.nr_writeback; > + > + /* if over limit, stop searching */ > + if (usage >= cur_info.dirty_thresh) { > + *info = cur_info; > + break; > + } > + > + /* > + * Save dirty usage of mem closest to its limit if either: > + * - mem is the first mem considered > + * - mem dirty margin is smaller than last recorded one > + */ > + if ((info->nr_writeback == ULONG_MAX) || > + (cur_info.dirty_thresh - usage) < > + (info->dirty_thresh - > + (dirty_info_reclaimable(info) + info->nr_writeback))) > + *info = cur_info; > + } > + > + return info->nr_writeback != ULONG_MAX; > +} > + > static void mem_cgroup_start_move(struct mem_cgroup *mem) > { > int cpu; > -- > 1.7.3.1 > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html