On Mon 26-08-19 09:06:56, Tejun Heo wrote: > There's an inherent mismatch between memcg and writeback. The former > trackes ownership per-page while the latter per-inode. This was a > deliberate design decision because honoring per-page ownership in the > writeback path is complicated, may lead to higher CPU and IO overheads > and deemed unnecessary given that write-sharing an inode across > different cgroups isn't a common use-case. > > Combined with inode majority-writer ownership switching, this works > well enough in most cases but there are some pathological cases. For > example, let's say there are two cgroups A and B which keep writing to > different but confined parts of the same inode. B owns the inode and > A's memory is limited far below B's. A's dirty ratio can rise enough > to trigger balance_dirty_pages() sleeps but B's can be low enough to > avoid triggering background writeback. A will be slowed down without > a way to make writeback of the dirty pages happen. > > This patch implements foreign dirty recording and foreign mechanism so > that when a memcg encounters a condition as above it can trigger > flushes on bdi_writebacks which can clean its pages. Please see the > comment on top of mem_cgroup_track_foreign_dirty_slowpath() for > details. > > A reproducer follows. > > write-range.c:: > > #include <stdio.h> > #include <stdlib.h> > #include <unistd.h> > #include <fcntl.h> > #include <sys/types.h> > > static const char *usage = "write-range FILE START SIZE\n"; > > int main(int argc, char **argv) > { > int fd; > unsigned long start, size, end, pos; > char *endp; > char buf[4096]; > > if (argc < 4) { > fprintf(stderr, usage); > return 1; > } > > fd = open(argv[1], O_WRONLY); > if (fd < 0) { > perror("open"); > return 1; > } > > start = strtoul(argv[2], &endp, 0); > if (*endp != '\0') { > fprintf(stderr, usage); > return 1; > } > > size = strtoul(argv[3], &endp, 0); > if (*endp != '\0') { > fprintf(stderr, usage); > return 1; > } > > end = start + size; > > while (1) { > for (pos = start; pos < end; ) { > long bread, bwritten = 0; > > if (lseek(fd, pos, SEEK_SET) < 0) { > perror("lseek"); > return 1; > } > > bread = read(0, buf, sizeof(buf) < end - pos ? > sizeof(buf) : end - pos); > if (bread < 0) { > perror("read"); > return 1; > } > if (bread == 0) > return 0; > > while (bwritten < bread) { > long this; > > this = write(fd, buf + bwritten, > bread - bwritten); > if (this < 0) { > perror("write"); > return 1; > } > > bwritten += this; > pos += bwritten; > } > } > } > } > > repro.sh:: > > #!/bin/bash > > set -e > set -x > > sysctl -w vm.dirty_expire_centisecs=300000 > sysctl -w vm.dirty_writeback_centisecs=300000 > sysctl -w vm.dirtytime_expire_seconds=300000 > echo 3 > /proc/sys/vm/drop_caches > > TEST=/sys/fs/cgroup/test > A=$TEST/A > B=$TEST/B > > mkdir -p $A $B > echo "+memory +io" > $TEST/cgroup.subtree_control > echo $((1<<30)) > $A/memory.high > echo $((32<<30)) > $B/memory.high > > rm -f testfile > touch testfile > fallocate -l 4G testfile > > echo "Starting B" > > (echo $BASHPID > $B/cgroup.procs > pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) & > > echo "Waiting 10s to ensure B claims the testfile inode" > sleep 5 > sync > sleep 5 > sync > echo "Starting A" > > (echo $BASHPID > $A/cgroup.procs > pv < /dev/urandom | ./write-range testfile 0 $((2<<30))) > > v2: Added comments explaining why the specific intervals are being used. > > v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort > flushing while avoding possible livelocks. > > v4: Use get_jiffies_64() and time_before/after64() instead of raw > jiffies_64 and arthimetic comparisons as suggested by Jan. > > Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> The patch looks good to me. You can add: Reviewed-by: Jan Kara <jack@xxxxxxx> Honza > --- > include/linux/backing-dev-defs.h | 1 + > include/linux/memcontrol.h | 39 +++++++++ > mm/memcontrol.c | 134 +++++++++++++++++++++++++++++++ > mm/page-writeback.c | 4 + > 4 files changed, 178 insertions(+) > > diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h > index 1075f2552cfc..4fc87dee005a 100644 > --- a/include/linux/backing-dev-defs.h > +++ b/include/linux/backing-dev-defs.h > @@ -63,6 +63,7 @@ enum wb_reason { > * so it has a mismatch name. > */ > WB_REASON_FORKER_THREAD, > + WB_REASON_FOREIGN_FLUSH, > > WB_REASON_MAX, > }; > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 2cd4359cb38c..ad8f1a397ae4 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -183,6 +183,23 @@ struct memcg_padding { > #define MEMCG_PADDING(name) > #endif > > +/* > + * Remember four most recent foreign writebacks with dirty pages in this > + * cgroup. Inode sharing is expected to be uncommon and, even if we miss > + * one in a given round, we're likely to catch it later if it keeps > + * foreign-dirtying, so a fairly low count should be enough. > + * > + * See mem_cgroup_track_foreign_dirty_slowpath() for details. > + */ > +#define MEMCG_CGWB_FRN_CNT 4 > + > +struct memcg_cgwb_frn { > + u64 bdi_id; /* bdi->id of the foreign inode */ > + int memcg_id; /* memcg->css.id of foreign inode */ > + u64 at; /* jiffies_64 at the time of dirtying */ > + struct wb_completion done; /* tracks in-flight foreign writebacks */ > +}; > + > /* > * The memory controller data structure. The memory controller controls both > * page cache and RSS per cgroup. We would eventually like to provide > @@ -307,6 +324,7 @@ struct mem_cgroup { > #ifdef CONFIG_CGROUP_WRITEBACK > struct list_head cgwb_list; > struct wb_domain cgwb_domain; > + struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; > #endif > > /* List of events which userspace want to receive */ > @@ -1237,6 +1255,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, > unsigned long *pheadroom, unsigned long *pdirty, > unsigned long *pwriteback); > > +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, > + struct bdi_writeback *wb); > + > +static inline void mem_cgroup_track_foreign_dirty(struct page *page, > + struct bdi_writeback *wb) > +{ > + if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) > + mem_cgroup_track_foreign_dirty_slowpath(page, wb); > +} > + > +void mem_cgroup_flush_foreign(struct bdi_writeback *wb); > + > #else /* CONFIG_CGROUP_WRITEBACK */ > > static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) > @@ -1252,6 +1282,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, > { > } > > +static inline void mem_cgroup_track_foreign_dirty(struct page *page, > + struct bdi_writeback *wb) > +{ > +} > + > +static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) > +{ > +} > + > #endif /* CONFIG_CGROUP_WRITEBACK */ > > struct sock; > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 26e2999af608..eb626a290d93 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -87,6 +87,10 @@ int do_swap_account __read_mostly; > #define do_swap_account 0 > #endif > > +#ifdef CONFIG_CGROUP_WRITEBACK > +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); > +#endif > + > /* Whether legacy memory+swap accounting is active */ > static bool do_memsw_account(void) > { > @@ -4238,6 +4242,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, > } > } > > +/* > + * Foreign dirty flushing > + * > + * There's an inherent mismatch between memcg and writeback. The former > + * trackes ownership per-page while the latter per-inode. This was a > + * deliberate design decision because honoring per-page ownership in the > + * writeback path is complicated, may lead to higher CPU and IO overheads > + * and deemed unnecessary given that write-sharing an inode across > + * different cgroups isn't a common use-case. > + * > + * Combined with inode majority-writer ownership switching, this works well > + * enough in most cases but there are some pathological cases. For > + * example, let's say there are two cgroups A and B which keep writing to > + * different but confined parts of the same inode. B owns the inode and > + * A's memory is limited far below B's. A's dirty ratio can rise enough to > + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid > + * triggering background writeback. A will be slowed down without a way to > + * make writeback of the dirty pages happen. > + * > + * Conditions like the above can lead to a cgroup getting repatedly and > + * severely throttled after making some progress after each > + * dirty_expire_interval while the underyling IO device is almost > + * completely idle. > + * > + * Solving this problem completely requires matching the ownership tracking > + * granularities between memcg and writeback in either direction. However, > + * the more egregious behaviors can be avoided by simply remembering the > + * most recent foreign dirtying events and initiating remote flushes on > + * them when local writeback isn't enough to keep the memory clean enough. > + * > + * The following two functions implement such mechanism. When a foreign > + * page - a page whose memcg and writeback ownerships don't match - is > + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning > + * bdi_writeback on the page owning memcg. When balance_dirty_pages() > + * decides that the memcg needs to sleep due to high dirty ratio, it calls > + * mem_cgroup_flush_foreign() which queues writeback on the recorded > + * foreign bdi_writebacks which haven't expired. Both the numbers of > + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are > + * limited to MEMCG_CGWB_FRN_CNT. > + * > + * The mechanism only remembers IDs and doesn't hold any object references. > + * As being wrong occasionally doesn't matter, updates and accesses to the > + * records are lockless and racy. > + */ > +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, > + struct bdi_writeback *wb) > +{ > + struct mem_cgroup *memcg = page->mem_cgroup; > + struct memcg_cgwb_frn *frn; > + u64 now = get_jiffies_64(); > + u64 oldest_at = now; > + int oldest = -1; > + int i; > + > + /* > + * Pick the slot to use. If there is already a slot for @wb, keep > + * using it. If not replace the oldest one which isn't being > + * written out. > + */ > + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { > + frn = &memcg->cgwb_frn[i]; > + if (frn->bdi_id == wb->bdi->id && > + frn->memcg_id == wb->memcg_css->id) > + break; > + if (time_before64(frn->at, oldest_at) && > + atomic_read(&frn->done.cnt) == 1) { > + oldest = i; > + oldest_at = frn->at; > + } > + } > + > + if (i < MEMCG_CGWB_FRN_CNT) { > + /* > + * Re-using an existing one. Update timestamp lazily to > + * avoid making the cacheline hot. We want them to be > + * reasonably up-to-date and significantly shorter than > + * dirty_expire_interval as that's what expires the record. > + * Use the shorter of 1s and dirty_expire_interval / 8. > + */ > + unsigned long update_intv = > + min_t(unsigned long, HZ, > + msecs_to_jiffies(dirty_expire_interval * 10) / 8); > + > + if (time_before64(frn->at, now - update_intv)) > + frn->at = now; > + } else if (oldest >= 0) { > + /* replace the oldest free one */ > + frn = &memcg->cgwb_frn[oldest]; > + frn->bdi_id = wb->bdi->id; > + frn->memcg_id = wb->memcg_css->id; > + frn->at = now; > + } > +} > + > +/* issue foreign writeback flushes for recorded foreign dirtying events */ > +void mem_cgroup_flush_foreign(struct bdi_writeback *wb) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); > + unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); > + u64 now = jiffies_64; > + int i; > + > + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { > + struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; > + > + /* > + * If the record is older than dirty_expire_interval, > + * writeback on it has already started. No need to kick it > + * off again. Also, don't start a new one if there's > + * already one in flight. > + */ > + if (time_after64(frn->at, now - intv) && > + atomic_read(&frn->done.cnt) == 1) { > + frn->at = 0; > + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, > + WB_REASON_FOREIGN_FLUSH, > + &frn->done); > + } > + } > +} > + > #else /* CONFIG_CGROUP_WRITEBACK */ > > static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) > @@ -4760,6 +4885,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) > struct mem_cgroup *memcg; > unsigned int size; > int node; > + int __maybe_unused i; > > size = sizeof(struct mem_cgroup); > size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); > @@ -4803,6 +4929,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) > #endif > #ifdef CONFIG_CGROUP_WRITEBACK > INIT_LIST_HEAD(&memcg->cgwb_list); > + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) > + memcg->cgwb_frn[i].done = > + __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); > #endif > idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); > return memcg; > @@ -4932,7 +5061,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) > static void mem_cgroup_css_free(struct cgroup_subsys_state *css) > { > struct mem_cgroup *memcg = mem_cgroup_from_css(css); > + int __maybe_unused i; > > +#ifdef CONFIG_CGROUP_WRITEBACK > + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) > + wb_wait_for_completion(&memcg->cgwb_frn[i].done); > +#endif > if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) > static_branch_dec(&memcg_sockets_enabled_key); > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index 1804f64ff43c..50055d2e4ea8 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb, > if (unlikely(!writeback_in_progress(wb))) > wb_start_background_writeback(wb); > > + mem_cgroup_flush_foreign(wb); > + > /* > * Calculate global domain's pos_ratio and select the > * global dtc by default. > @@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) > task_io_account_write(PAGE_SIZE); > current->nr_dirtied++; > this_cpu_inc(bdp_ratelimits); > + > + mem_cgroup_track_foreign_dirty(page, wb); > } > } > > -- > 2.17.1 > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR