Re: [PATCH 2/8] writeback: move dirty inodes from super_block to backing_dev_info

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Sep 02, 2009 at 10:42:41AM +0200, Jens Axboe wrote:
> This is a first step at introducing per-bdi flusher threads. We should
> have no change in behaviour, although sb_has_dirty_inodes() is now
> ridiculously expensive, as there's no easy way to answer that question.
> Not a huge problem, since it'll be deleted in subsequent patches.

After this patch generic_sync_sb_inodes becomes pretty useless.  We
have two callers which each want to call bdi_writeback_all, and one of
them wants to wait, so just split that into a separate helper.

Also move wakeup_flusher_threads into fs-writeback.c, that allows us
to make bdi_writeback_all static.

Btw, I do not think implementing sync_inodes_sb/sync_inodes_sb_wait
is a smart and efficient idea.  Right now we have a n:1 superblock:bdi
relation, so we really should make use of that instead of doing linear
search of all bdis in the system.  If we introduce multiple bdis per
superblock we should add an efficient lookup data structure for them.


Index: linux-2.6/fs/fs-writeback.c
===================================================================
--- linux-2.6.orig/fs/fs-writeback.c	2009-09-03 23:35:30.544832785 -0300
+++ linux-2.6/fs/fs-writeback.c	2009-09-03 23:42:52.044832715 -0300
@@ -836,7 +836,7 @@ int bdi_writeback_task(struct bdi_writeb
  * we are simply called for WB_SYNC_NONE, then writeback will merely be
  * scheduled to run.
  */
-void bdi_writeback_all(struct writeback_control *wbc)
+static void bdi_writeback_all(struct writeback_control *wbc)
 {
 	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
 	struct backing_dev_info *bdi;
@@ -894,6 +894,25 @@ restart:
 	}
 }
 
+/*
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
+{
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.range_cyclic	= 1,
+	};
+
+	if (nr_pages == 0)
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	wbc.nr_to_write = nr_pages;
+	bdi_writeback_all(&wbc);
+}
+
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1012,81 +1031,53 @@ out:
 EXPORT_SYMBOL(__mark_inode_dirty);
 
 /*
- * Write out a superblock's list of dirty inodes.  A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched.  For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * The inodes to be written are parked on bdi->b_io.  They are moved back onto
- * bdi->b_dirty as they are selected for writing.  This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
+ * For a data integrity sync we must wait for all pages under writeback,
+ * because there may have been pages dirtied before our sync call, but which
+ * had writeout started before we write it out.  In which case, the inode
+ * may not be on the dirty list, but we still have to wait for that writeout.
  */
-static void generic_sync_sb_inodes(struct writeback_control *wbc)
+static void wait_sb_inodes(struct writeback_control *wbc)
 {
-	if (wbc->bdi)
-		bdi_start_writeback(wbc);
-	else
-		bdi_writeback_all(wbc);
+	struct inode *inode, *old_inode = NULL;
 
-	if (wbc->sync_mode == WB_SYNC_ALL) {
-		struct inode *inode, *old_inode = NULL;
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
 
-		/*
-		 * We need to be protected against the filesystem going from
-		 * r/o to r/w or vice versa.
-		 */
-		WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+	spin_lock(&inode_lock);
 
-		spin_lock(&inode_lock);
+	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+		struct address_space *mapping;
+
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		mapping = inode->i_mapping;
+		if (mapping->nrpages == 0)
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_lock);
 
 		/*
-		 * Data integrity sync. Must wait for all pages under writeback,
-		 * because there may have been pages dirtied before our sync
-		 * call, but which had writeout started before we write it out.
-		 * In which case, the inode may not be on the dirty list, but
-		 * we still have to wait for that writeout.
+		 * We hold a reference to 'inode' so it couldn't have
+		 * been removed from s_inodes list while we dropped the
+		 * inode_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it
+		 * under inode_lock. So we keep the reference and iput
+		 * it later.
 		 */
-		list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
-			struct address_space *mapping;
-
-			if (inode->i_state &
-					(I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
-				continue;
-			mapping = inode->i_mapping;
-			if (mapping->nrpages == 0)
-				continue;
-			__iget(inode);
-			spin_unlock(&inode_lock);
-			/*
-			 * We hold a reference to 'inode' so it couldn't have
-			 * been removed from s_inodes list while we dropped the
-			 * inode_lock.  We cannot iput the inode now as we can
-			 * be holding the last reference and we cannot iput it
-			 * under inode_lock. So we keep the reference and iput
-			 * it later.
-			 */
-			iput(old_inode);
-			old_inode = inode;
+		iput(old_inode);
+		old_inode = inode;
 
-			filemap_fdatawait(mapping);
+		filemap_fdatawait(mapping);
 
-			cond_resched();
+		cond_resched();
 
-			spin_lock(&inode_lock);
-		}
-		spin_unlock(&inode_lock);
-		iput(old_inode);
+		spin_lock(&inode_lock);
 	}
+	spin_unlock(&inode_lock);
+	iput(old_inode);
 }
 
 /**
@@ -1112,7 +1103,7 @@ long sync_inodes_sb(struct super_block *
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
 	wbc.nr_to_write = nr_to_write;
-	generic_sync_sb_inodes(&wbc);
+	bdi_writeback_all(wbc);
 	return nr_to_write - wbc.nr_to_write;
 }
 EXPORT_SYMBOL(sync_inodes_sb);
@@ -1131,11 +1122,11 @@ long sync_inodes_sb_wait(struct super_bl
 		.sync_mode	= WB_SYNC_ALL,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
+		.nr_to_write	= LLONG_MAX,	/* doesn't actually matter */
 	};
-	long nr_to_write = LLONG_MAX; /* doesn't actually matter */
 
-	wbc.nr_to_write = nr_to_write;
-	generic_sync_sb_inodes(&wbc);
+	bdi_writeback_all(wbc);
+	wait_sb_inodes(&wbc);
 	return nr_to_write - wbc.nr_to_write;
 }
 EXPORT_SYMBOL(sync_inodes_sb_wait);
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2009-09-03 23:42:42.968834087 -0300
+++ linux-2.6/include/linux/backing-dev.h	2009-09-03 23:42:45.124832763 -0300
@@ -99,7 +99,6 @@ int bdi_register_dev(struct backing_dev_
 void bdi_unregister(struct backing_dev_info *bdi);
 void bdi_start_writeback(struct writeback_control *wbc);
 int bdi_writeback_task(struct bdi_writeback *wb);
-void bdi_writeback_all(struct writeback_control *wbc);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
 extern spinlock_t bdi_lock;
Index: linux-2.6/include/linux/writeback.h
===================================================================
--- linux-2.6.orig/include/linux/writeback.h	2009-09-03 23:42:11.956834796 -0300
+++ linux-2.6/include/linux/writeback.h	2009-09-03 23:42:32.708832476 -0300
@@ -85,6 +85,7 @@ long sync_inodes_sb(struct super_block *
 long sync_inodes_sb_wait(struct super_block *);
 void sync_inodes_wbc(struct writeback_control *wbc);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
+void wakeup_flusher_threads(long nr_pages);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
@@ -104,7 +105,6 @@ static inline void inode_sync_wait(struc
 /*
  * mm/page-writeback.c
  */
-void wakeup_flusher_threads(long nr_pages);
 void laptop_io_completion(void);
 void laptop_sync_completion(void);
 void throttle_vm_writeout(gfp_t gfp_mask);
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2009-09-03 23:41:42.724833252 -0300
+++ linux-2.6/mm/page-writeback.c	2009-09-03 23:41:51.368832827 -0300
@@ -674,25 +674,6 @@ void throttle_vm_writeout(gfp_t gfp_mask
         }
 }
 
-/*
- * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
- * the whole world.
- */
-void wakeup_flusher_threads(long nr_pages)
-{
-	struct writeback_control wbc = {
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.range_cyclic	= 1,
-	};
-
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	wbc.nr_to_write = nr_pages;
-	bdi_writeback_all(&wbc);
-}
-
 static void laptop_timer_fn(unsigned long unused);
 
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux