[PATCH 05/45] writeback: make backing_dev_info host cgroup-specific bdi_writebacks

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



For the planned cgroup writeback support, on each bdi
(backing_dev_info), each cgroup will be served by a separate wb
(bdi_writeback).  This patch updates bdi so that a bdi can host
multiple wbs (bdi_writebacks).

bdi->wb remains unchanged and will keep serving the root cgroup.
cgwb's (cgroup wb's) for non-root cgroups are created on-demand or
looked up during init_cgwb_dirty_page_context() according to the dirty
blkcg of the page being dirtied.  Each cgwb is indexed on
bdi->cgwb_tree by its blkcg id.

Once dirty_context is initialized for a page, the page's wb can be
looked up using page_cgwb_{dirty|wb}() while the page is dirty or
under writeback respectively.  Once created, a cgwb is destroyed iff
either its associated bdi or blkcg is destroyed, meaning that as long
as a page is dirty or under writeback, its associated cgwb is
accessible without further locking.

dirty_context grew a new field ->wb which caches the selected wb and
account_page_dirtied() is updated to use that instead of
unconditionally using bdi->wb.

Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all
pages will keep being associated with bdi->wb.

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
---
 block/blk-cgroup.c               |  11 ++-
 fs/fs-writeback.c                |  19 +++-
 include/linux/backing-dev-defs.h |  17 +++-
 include/linux/backing-dev.h      | 123 +++++++++++++++++++++++++
 include/linux/blk-cgroup.h       |   4 +
 mm/backing-dev.c                 | 189 +++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              |   4 +-
 7 files changed, 361 insertions(+), 6 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9e0fe38..8bebaa9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/delay.h>
@@ -813,6 +814,11 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 	spin_unlock_irq(&blkcg->lock);
 }
 
+static void blkcg_css_released(struct cgroup_subsys_state *css)
+{
+	cgwb_blkcg_released(css);
+}
+
 static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
@@ -841,7 +847,9 @@ done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+	INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
 	return &blkcg->css;
 }
 
@@ -926,6 +934,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
 struct cgroup_subsys blkio_cgrp_subsys = {
 	.css_alloc = blkcg_css_alloc,
 	.css_offline = blkcg_css_offline,
+	.css_released = blkcg_css_released,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
 	.legacy_cftypes = blkcg_files,
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 138a5ea..3b54835 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -117,21 +117,37 @@ out_unlock:
  */
 static void init_cgwb_dirty_page_context(struct dirty_context *dctx)
 {
+	struct backing_dev_info *bdi = dctx->mapping->backing_dev_info;
+	struct cgroup_subsys_state *blkcg_css;
+
 	/* cgroup writeback requires support from both the bdi and filesystem */
 	if (!mapping_cgwb_enabled(dctx->mapping))
 		goto force_root;
 
-	page_blkcg_attach_dirty(dctx->page);
+	/*
+	 * @dctx->page is a candidate for cgroup writeback and about to be
+	 * dirtied.  Attach the dirty blkcg to the page and pre-allocate
+	 * all resources necessary for cgroup writeback.  On failure, fall
+	 * back to the root blkcg.
+	 */
+	blkcg_css = page_blkcg_attach_dirty(dctx->page);
+	dctx->wb = cgwb_lookup_create(bdi, blkcg_css);
+	if (!dctx->wb) {
+		page_blkcg_detach_dirty(dctx->page);
+		goto force_root;
+	}
 	return;
 
 force_root:
 	page_blkcg_force_root_dirty(dctx->page);
+	dctx->wb = &bdi->wb;
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static void init_cgwb_dirty_page_context(struct dirty_context *dctx)
 {
+	dctx->wb = &dctx->mapping->backing_dev_info->wb;
 }
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
@@ -176,6 +192,7 @@ void init_dirty_inode_context(struct dirty_context *dctx, struct inode *inode)
 {
 	memset(dctx, 0, sizeof(*dctx));
 	dctx->inode = inode;
+	dctx->wb = &inode_to_bdi(inode)->wb;
 }
 
 static void __wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index bf20ef1..511066f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -2,6 +2,7 @@
 #define __LINUX_BACKING_DEV_DEFS_H
 
 #include <linux/list.h>
+#include <linux/radix-tree.h>
 #include <linux/spinlock.h>
 #include <linux/percpu_counter.h>
 #include <linux/flex_proportions.h>
@@ -68,6 +69,15 @@ struct bdi_writeback {
 	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct cgroup_subsys_state *blkcg_css; /* the blkcg we belong to */
+	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
+	union {
+		struct list_head shutdown_node;
+		struct rcu_head rcu;
+	};
+#endif
 };
 
 struct backing_dev_info {
@@ -82,8 +92,10 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
-	struct bdi_writeback wb;  /* default writeback info for this bdi */
-
+	struct bdi_writeback wb; /* the root writeback info for this bdi */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct radix_tree_root cgwb_tree; /* radix tree of !root cgroup wbs */
+#endif
 	struct device *dev;
 
 	struct timer_list laptop_mode_wb_timer;
@@ -102,6 +114,7 @@ struct dirty_context {
 	struct page		*page;
 	struct inode		*inode;
 	struct address_space	*mapping;
+	struct bdi_writeback	*wb;
 };
 
 enum {
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 7a20cff..3722796 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
 
 #include <linux/backing-dev-defs.h>
 
@@ -273,6 +274,10 @@ void init_dirty_inode_context(struct dirty_context *dctx, struct inode *inode);
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+void cgwb_blkcg_released(struct cgroup_subsys_state *blkcg_css);
+int __cgwb_create(struct backing_dev_info *bdi,
+		  struct cgroup_subsys_state *blkcg_css);
+
 /**
  * mapping_cgwb_enabled - test whether cgroup writeback is enabled on a mapping
  * @mapping: address_space of interest
@@ -290,6 +295,97 @@ static inline bool mapping_cgwb_enabled(struct address_space *mapping)
 		inode && (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
 }
 
+/**
+ * cgwb_lookup - lookup cgwb for a given blkcg on a bdi
+ * @bdi: target bdi
+ * @blkcg_css: target blkcg
+ *
+ * Look up the cgwb (cgroup bdi_writeback) for @blkcg_css on @bdi.  The
+ * returned cgwb is accessible as long as @bdi and @blkcg_css stay alive.
+ *
+ * Returns the pointer to the found cgwb on success, NULL on failure.
+ */
+static inline struct bdi_writeback *
+cgwb_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *blkcg_css)
+{
+	struct bdi_writeback *cgwb;
+
+	if (blkcg_css == blkcg_root_css)
+		return &bdi->wb;
+
+	/*
+	 * RCU locking protects the radix tree itself.  The looked up cgwb
+	 * is protected by the caller ensuring that @bdi and the blkcg of
+	 * @blkcg_css are alive.
+	 */
+	rcu_read_lock();
+	cgwb = radix_tree_lookup(&bdi->cgwb_tree, blkcg_css->id);
+	rcu_read_unlock();
+	return cgwb;
+}
+
+/**
+ * cgwb_lookup_create - try to lookup cgwb and create one if not found
+ * @bdi: target bdi
+ * @blkcg_css: cgroup_subsys_state of the target blkcg
+ *
+ * Try to look up the cgwb (cgroup bdi_writeback) for the blkcg with
+ * @blkcg_css on @bdi.  If it doesn't exist, try to create one.  This
+ * function can be called under any context without locking as long as @bdi
+ * and @blkcg_css are kept alive.  See cgwb_lookup() for details.
+ *
+ * Returns the pointer to the found cgwb on success, NULL if such cgwb
+ * doesn't exist and creation failed (out of memory or @bdi unregistered).
+ */
+static inline struct bdi_writeback *
+cgwb_lookup_create(struct backing_dev_info *bdi,
+		   struct cgroup_subsys_state *blkcg_css)
+{
+	struct bdi_writeback *wb;
+
+	do {
+		wb = cgwb_lookup(bdi, blkcg_css);
+		if (wb)
+			return wb;
+	} while (!__cgwb_create(bdi, blkcg_css));
+
+	return NULL;
+}
+
+/**
+ * page_cgwb_dirty - lookup the dirty cgwb of a page
+ * @page: target page
+ *
+ * Returns the dirty cgwb (cgroup bdi_writeback) of @page.  The returned
+ * wb is accessible as long as @page is dirty.
+ */
+static inline struct bdi_writeback *page_cgwb_dirty(struct page *page)
+{
+	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+	struct bdi_writeback *wb = cgwb_lookup(bdi, page_blkcg_dirty(page));
+
+	if (WARN_ON_ONCE(!wb))
+		return &bdi->wb;
+	return wb;
+}
+
+/**
+ * page_cgwb_wb - lookup the writeback cgwb of a page
+ * @page: target page
+ *
+ * Returns the writeback cgwb (cgroup bdi_writeback) of @page.  The
+ * returned wb is accessible as long as @page is under writeback.
+ */
+static inline struct bdi_writeback *page_cgwb_wb(struct page *page)
+{
+	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+	struct bdi_writeback *wb = cgwb_lookup(bdi, page_blkcg_wb(page));
+
+	if (WARN_ON_ONCE(!wb))
+		return &bdi->wb;
+	return wb;
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool mapping_cgwb_enabled(struct address_space *mapping)
@@ -297,6 +393,33 @@ static inline bool mapping_cgwb_enabled(struct address_space *mapping)
 	return false;
 }
 
+static inline void cgwb_blkcg_released(struct cgroup_subsys_state *blkcg_css)
+{
+}
+
+static inline struct bdi_writeback *
+cgwb_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *blkcg_css)
+{
+	return &bdi->wb;
+}
+
+static inline struct bdi_writeback *
+cgwb_lookup_create(struct backing_dev_info *bdi,
+		   struct cgroup_subsys_state *blkcg_css)
+{
+	return &bdi->wb;
+}
+
+static inline struct bdi_writeback *page_cgwb_dirty(struct page *page)
+{
+	return &page->mapping->backing_dev_info->wb;
+}
+
+static inline struct bdi_writeback *page_cgwb_wb(struct page *page)
+{
+	return &page->mapping->backing_dev_info->wb;
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 #endif		/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 4dc643f..3033eb1 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -53,6 +53,10 @@ struct blkcg {
 	/* TODO: per-policy storage in blkcg */
 	unsigned int			cfq_weight;	/* belongs to cfq */
 	unsigned int			cfq_leaf_weight;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head		cgwb_list;
+#endif
 };
 
 struct blkg_stat {
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1c9b70e..c6dda82 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -440,6 +440,192 @@ static void wb_exit(struct bdi_writeback *wb)
 	fprop_local_destroy_percpu(&wb->completions);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/*
+ * cgwb_lock protects bdi->cgwb_tree and blkcg->cgwb_list where the former
+ * is also RCU protected.  cgwb_shutdown_mutex synchronizes shutdown
+ * attempts from bdi and blkcg destructions.  For details, see
+ * cgwb_shutdown_prepare/commit().
+ */
+static DEFINE_SPINLOCK(cgwb_lock);
+static DEFINE_MUTEX(cgwb_shutdown_mutex);
+
+int __cgwb_create(struct backing_dev_info *bdi,
+		  struct cgroup_subsys_state *blkcg_css)
+{
+	struct blkcg *blkcg = css_to_blkcg(blkcg_css);
+	struct bdi_writeback *wb;
+	unsigned long flags;
+	int ret;
+
+	wb = kzalloc(sizeof(*wb), GFP_ATOMIC);
+	if (!wb)
+		return -ENOMEM;
+
+	ret = wb_init(wb, bdi, GFP_ATOMIC);
+	if (ret) {
+		kfree(wb);
+		return -ENOMEM;
+	}
+
+	wb->blkcg_css = blkcg_css;
+	set_bit(WB_registered, &wb->state); /* cgwbs are always registered */
+
+	ret = -ENODEV;
+	spin_lock_irqsave(&cgwb_lock, flags);
+	/* the root wb determines the registered state of the whole bdi */
+	if (test_bit(WB_registered, &bdi->wb.state)) {
+		/* we might have raced w/ another instance of this function */
+		ret = radix_tree_insert(&bdi->cgwb_tree, blkcg_css->id, wb);
+		if (!ret)
+			list_add_tail(&wb->blkcg_node, &blkcg->cgwb_list);
+	}
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	if (ret) {
+		/* @wb was never published; undo wb_init() and free it */
+		wb_exit(wb);
+		kfree(wb);
+	}
+	return ret == -EEXIST ? 0 : ret; /* lost race: caller re-looks up */
+}
+
+/**
+ * cgwb_shutdown_prepare - prepare to shutdown a cgwb
+ * @wb: cgwb to be shutdown
+ * @to_shutdown: list to queue @wb on
+ *
+ * This function is called to queue @wb for shutdown on @to_shutdown.  The
+ * bdi_writeback indexes use the cgwb_lock spinlock but wb_shutdown() needs
+ * process context, so this function can be called while holding cgwb_lock
+ * and cgwb_shutdown_mutex to queue cgwbs for shutdown.  Once all target
+ * cgwbs are queued, the caller should release cgwb_lock and invoke
+ * cgwb_shutdown_commit().
+ */
+static void cgwb_shutdown_prepare(struct bdi_writeback *wb,
+				  struct list_head *to_shutdown)
+{
+	lockdep_assert_held(&cgwb_lock);
+	lockdep_assert_held(&cgwb_shutdown_mutex);
+
+	WARN_ON(!test_bit(WB_registered, &wb->state));
+	clear_bit(WB_registered, &wb->state);
+	list_add_tail(&wb->shutdown_node, to_shutdown);
+}
+
+/**
+ * cgwb_shutdown_commit - commit cgwb shutdowns
+ * @to_shutdown: list of cgwbs to shutdown
+ *
+ * This function is called after @to_shutdown is built by calls to
+ * cgwb_shutdown_prepare() and cgwb_lock is released.  It invokes
+ * wb_shutdown() on all cgwbs on the list.  bdi and blkcg may try to
+ * shutdown the same cgwbs and should wait till completion if shutdown is
+ * initiated by the other.  This synchronization is achieved through
+ * cgwb_shutdown_mutex which should have been acquired before the
+ * cgwb_shutdown_prepare() invocations.
+ */
+static void cgwb_shutdown_commit(struct list_head *to_shutdown)
+{
+	struct bdi_writeback *wb;
+
+	lockdep_assert_held(&cgwb_shutdown_mutex);
+
+	list_for_each_entry(wb, to_shutdown, shutdown_node)
+		wb_shutdown(wb);
+}
+
+static void cgwb_exit(struct bdi_writeback *wb)
+{
+	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->blkcg_css->id));
+	list_del(&wb->blkcg_node);
+	wb_exit(wb);
+	kfree_rcu(wb, rcu);
+}
+
+static void cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+	bdi->wb.blkcg_css = blkcg_root_css;
+	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+}
+
+/**
+ * cgwb_bdi_shutdown - @bdi is being shut down, shut down all cgwbs
+ * @bdi: bdi being shut down
+ */
+static void cgwb_bdi_shutdown(struct backing_dev_info *bdi)
+{
+	LIST_HEAD(to_shutdown);
+	struct radix_tree_iter iter;
+	void **slot;
+
+	WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+
+	mutex_lock(&cgwb_shutdown_mutex);
+	spin_lock_irq(&cgwb_lock);
+
+	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+		cgwb_shutdown_prepare(*slot, &to_shutdown);
+
+	spin_unlock_irq(&cgwb_lock);
+	cgwb_shutdown_commit(&to_shutdown);
+	mutex_unlock(&cgwb_shutdown_mutex);
+}
+
+/**
+ * cgwb_bdi_exit - @bdi is being destroyed, exit all its cgwbs
+ * @bdi: bdi being destroyed
+ */
+static void cgwb_bdi_exit(struct backing_dev_info *bdi)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+
+	/* cgwb_exit() edits cgwb_tree and blkcg->cgwb_list; hold cgwb_lock */
+	spin_lock_irq(&cgwb_lock);
+	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) {
+		struct bdi_writeback *wb = *slot;
+
+		WARN_ON(test_bit(WB_registered, &wb->state));
+		cgwb_exit(wb);
+	}
+	spin_unlock_irq(&cgwb_lock);
+}
+
+/**
+ * cgwb_blkcg_released - a blkcg is being destroyed, release all matching cgwbs
+ * @blkcg_css: blkcg being destroyed
+ */
+void cgwb_blkcg_released(struct cgroup_subsys_state *blkcg_css)
+{
+	LIST_HEAD(to_shutdown);
+	struct blkcg *blkcg = css_to_blkcg(blkcg_css);
+	struct bdi_writeback *wb, *next;
+
+	mutex_lock(&cgwb_shutdown_mutex);
+	spin_lock_irq(&cgwb_lock);
+
+	list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+		cgwb_shutdown_prepare(wb, &to_shutdown);
+
+	spin_unlock_irq(&cgwb_lock);
+	cgwb_shutdown_commit(&to_shutdown);
+	mutex_unlock(&cgwb_shutdown_mutex);
+
+	spin_lock_irq(&cgwb_lock);
+	list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+		cgwb_exit(wb);
+	spin_unlock_irq(&cgwb_lock);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_shutdown(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_exit(struct backing_dev_info *bdi) { }
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 int bdi_init(struct backing_dev_info *bdi)
 {
 	int err;
@@ -455,6 +641,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	if (err)
 		return err;
 
+	cgwb_bdi_init(bdi);
 	return 0;
 }
 EXPORT_SYMBOL(bdi_init);
@@ -532,6 +719,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
 			/* make sure nobody finds us on the bdi_list anymore */
 			bdi_remove_from_list(bdi);
 			wb_shutdown(&bdi->wb);
+			cgwb_bdi_shutdown(bdi);
 		}
 
 		bdi_debug_unregister(bdi);
@@ -544,6 +732,7 @@ EXPORT_SYMBOL(bdi_unregister);
 void bdi_destroy(struct backing_dev_info *bdi)
 {
 	bdi_unregister(bdi);
+	cgwb_bdi_exit(bdi);
 	wb_exit(&bdi->wb);
 }
 EXPORT_SYMBOL(bdi_destroy);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 72a0edf..6475504 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2102,8 +2102,8 @@ void account_page_dirtied(struct dirty_context *dctx)
 
 	__inc_zone_page_state(page, NR_FILE_DIRTY);
 	__inc_zone_page_state(page, NR_DIRTIED);
-	__inc_wb_stat(&mapping->backing_dev_info->wb, WB_RECLAIMABLE);
-	__inc_wb_stat(&mapping->backing_dev_info->wb, WB_DIRTIED);
+	__inc_wb_stat(dctx->wb, WB_RECLAIMABLE);
+	__inc_wb_stat(dctx->wb, WB_DIRTIED);
 	task_io_account_write(PAGE_CACHE_SIZE);
 	current->nr_dirtied++;
 	this_cpu_inc(bdp_ratelimits);
-- 
2.1.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>



[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]