Re: Ceph on btrfs 3.4rc

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
> Am 24. April 2012 18:26 schrieb Sage Weil <sage@xxxxxxxxxxxx>:
> > On Tue, 24 Apr 2012, Josef Bacik wrote:
> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> >> > After running ceph on XFS for some time, I decided to try btrfs again.
> >> > Performance with the current "for-linus-min" branch and big metadata
> >> > is much better. The only problem (?) I'm still seeing is a warning
> >> > that seems to occur from time to time:
> >
> > Actually, before you do that... we have a new tool,
> > test_filestore_workloadgen, that generates a ceph-osd-like workload on the
> > local file system.  It's a subset of what a full OSD might do, but if
> > we're lucky it will be sufficient to reproduce this issue.  Something like
> >
> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> >
> > will hopefully do the trick.
> >
> > Christian, maybe you can see if that is able to trigger this warning?
> > You'll need to pull it from the current master branch; it wasn't in the
> > last release.
> 
> Trying to reproduce with test_filestore_workloadgen didn't work for
> me. So here are some instructions on how to reproduce with a minimal
> ceph setup.
> 
> You will need a single system with two disks and a bit of memory.
> 
> - Compile and install ceph (detailed instructions:
> http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
> 
> - For the test setup I've used two tmpfs files as journal devices. To
> create these, do the following:
> 
> # mkdir -p /ceph/temp
> # mount -t tmpfs tmpfs /ceph/temp
> # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
> # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
> 
> - Now you should create and mount btrfs. Here is what I did:
> 
> # mkfs.btrfs -l 64k -n 64k /dev/sda
> # mkfs.btrfs -l 64k -n 64k /dev/sdb
> # mkdir /ceph/osd.000
> # mkdir /ceph/osd.001
> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda /ceph/osd.000
> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb /ceph/osd.001
> 
> - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
> will probably have to change the btrfs devices and the hostname
> (os39).
> 
> - Create the ceph filesystems:
> 
> # mkdir /ceph/mon
> # mkcephfs -a -c /etc/ceph/ceph.conf
> 
> - Start ceph (e.g. "service ceph start")
> 
> - Now you should be able to use ceph - "ceph -s" will tell you about
> the state of the ceph cluster.
> 
> - "rbd create --size 100 testimg" will create an rbd image on the ceph cluster.
> 
> - Compile my test with "gcc -o rbdtest rbdtest.c -lrbd" and run it
> with "./rbdtest testimg".
> 
> I can see the first btrfs_orphan_commit_root warning after an hour or
> so... I hope that I've described all necessary steps. If there is a
> problem just send me a note.
> 

Well I feel like an idiot, I finally got it to reproduce, went to look at where I
wanted to put my printks and there's the problem staring me right in the face.
I've looked seriously at this problem 2 or 3 times and have missed this every
single freaking time.  Here is the patch I'm trying, please try it on yours to
make sure it fixes the problem.  It takes like 2 hours for it to reproduce for
me so I won't be able to fully test it until tomorrow, but so far it hasn't
broken anything so it should be good.  Thanks,

Josef


diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index eefe573..4ad628d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
 	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
-	/* for keeping track of orphaned inodes */
-	struct list_head i_orphan;
-
 	/* list of all the delalloc inodes in the FS.  There are times we need
 	 * to write all the delalloc pages to disk, and this list is used
 	 * to walk them all.
@@ -164,6 +161,7 @@ struct btrfs_inode {
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
 	unsigned delalloc_meta_reserved:1;
+	unsigned has_orphan_item:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8a89888..6dd20f3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
 	struct list_head root_list;
 
 	spinlock_t orphan_lock;
-	struct list_head orphan_list;
+	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
 	int orphan_item_inserted;
 	int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7f849b3..8bbe8c4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1148,7 +1148,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->orphan_block_rsv = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
-	INIT_LIST_HEAD(&root->orphan_list);
 	INIT_LIST_HEAD(&root->root_list);
 	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
@@ -1161,6 +1160,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_commit[0], 0);
 	atomic_set(&root->log_commit[1], 0);
 	atomic_set(&root->log_writers, 0);
+	atomic_set(&root->orphan_inodes, 0);
 	root->log_batch = 0;
 	root->log_transid = 0;
 	root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0218a4e..0265d40 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2138,12 +2138,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
-	if (!list_empty(&root->orphan_list) ||
+	if (atomic_read(&root->orphan_inodes) ||
 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
 		return;
 
 	spin_lock(&root->orphan_lock);
-	if (!list_empty(&root->orphan_list)) {
+	if (atomic_read(&root->orphan_inodes)) {
 		spin_unlock(&root->orphan_lock);
 		return;
 	}
@@ -2200,8 +2200,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 		block_rsv = NULL;
 	}
 
-	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+	if (!BTRFS_I(inode)->has_orphan_item) {
+		BTRFS_I(inode)->has_orphan_item = 1;
 #if 0
 		/*
 		 * For proper ENOSPC handling, we should do orphan
@@ -2214,6 +2214,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 			insert = 1;
 #endif
 		insert = 1;
+		atomic_inc(&root->orphan_inodes);
 	}
 
 	if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2261,9 +2262,8 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 	int release_rsv = 0;
 	int ret = 0;
 
-	spin_lock(&root->orphan_lock);
-	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		list_del_init(&BTRFS_I(inode)->i_orphan);
+	if (BTRFS_I(inode)->has_orphan_item) {
+		BTRFS_I(inode)->has_orphan_item = 0;
 		delete_item = 1;
 	}
 
@@ -2271,7 +2271,6 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 		BTRFS_I(inode)->orphan_meta_reserved = 0;
 		release_rsv = 1;
 	}
-	spin_unlock(&root->orphan_lock);
 
 	if (trans && delete_item) {
 		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
@@ -2281,6 +2280,9 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 	if (release_rsv)
 		btrfs_orphan_release_metadata(inode);
 
+	if (trans && delete_item)
+		atomic_dec(&root->orphan_inodes);
+
 	return 0;
 }
 
@@ -2418,9 +2420,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->orphan_lock);
-		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->orphan_lock);
+		atomic_inc(&root->orphan_inodes);
+		BTRFS_I(inode)->has_orphan_item = 1;
 
 		/* if we have links, this was a truncate, lets do that */
 		if (inode->i_nlink) {
@@ -3741,7 +3742,7 @@ void btrfs_evict_inode(struct inode *inode)
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	if (root->fs_info->log_root_recovering) {
-		BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+		BUG_ON(!BTRFS_I(inode)->has_orphan_item);
 		goto no_delete;
 	}
 
@@ -6921,6 +6922,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->in_defrag = 0;
 	ei->delalloc_meta_reserved = 0;
 	ei->complete_ordered = 0;
+	ei->has_orphan_item = 0;
 	ei->force_compress = BTRFS_COMPRESS_NONE;
 
 	ei->delayed_node = NULL;
@@ -6934,7 +6936,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	mutex_init(&ei->log_mutex);
 	mutex_init(&ei->delalloc_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
-	INIT_LIST_HEAD(&ei->i_orphan);
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->ordered_operations);
 	INIT_LIST_HEAD(&ei->ordered_finished);
@@ -6980,13 +6981,11 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
-	spin_lock(&root->orphan_lock);
-	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+	if (BTRFS_I(inode)->has_orphan_item) {
 		printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
 		       (unsigned long long)btrfs_ino(inode));
-		list_del_init(&BTRFS_I(inode)->i_orphan);
+		atomic_dec(&root->orphan_inodes);
 	}
-	spin_unlock(&root->orphan_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux