On Tue, Oct 25, 2011 at 01:56:48PM +0200, Christian Brunner wrote:
> 2011/10/24 Josef Bacik <josef@xxxxxxxxxx>:
> > On Mon, Oct 24, 2011 at 10:06:49AM -0700, Sage Weil wrote:
> >> [adding linux-btrfs to cc]
> >>
> >> Josef, Chris, any ideas on the below issues?
> >>
> >> On Mon, 24 Oct 2011, Christian Brunner wrote:
> >> >
> >> > - When I run ceph with btrfs snaps disabled, the situation is getting
> >> >   slightly better. I can run an OSD for about 3 days without problems,
> >> >   but then again the load increases. This time, I can see that the
> >> >   ceph-osd (blkdev_issue_flush) and btrfs-endio-wri are doing more work
> >> >   than usual.
> >>
> >> FYI in this scenario you're exposed to the same journal replay issues that
> >> ext4 and XFS are. The btrfs workload that ceph is generating will also
> >> not be all that special, though, so this problem shouldn't be unique to
> >> ceph.
> >>
> >
> > Can you get sysrq+w when this happens? I'd like to see what btrfs-endio-write
> > is up to.
>
> Capturing this turns out not to be easy. I have a few traces (see
> attachment), but with sysrq+w I do not get a stacktrace of
> btrfs-endio-write. What I do have is a "latencytop -c" output, which is
> interesting:
>
> In our Ceph-OSD server we have 4 disks with 4 btrfs filesystems. Ceph
> tries to balance the load over all OSDs, so all filesystems should get
> a nearly equal load. At the moment one filesystem seems to have a
> problem. When running iostat I see the following:
>
> Device:  rrqm/s  wrqm/s  r/s   w/s     rsec/s  wsec/s   avgrq-sz  avgqu-sz  await   svctm  %util
> sdd      0.00    0.00    0.00  4.33    0.00    53.33    12.31     0.08      19.38   12.23  5.30
> sdc      0.00    1.00    0.00  228.33  0.00    1957.33  8.57      74.33     380.76  2.74   62.57
> sdb      0.00    0.00    0.00  1.33    0.00    16.00    12.00     0.03      25.00   19.75  2.63
> sda      0.00    0.00    0.00  0.67    0.00    8.00     12.00     0.01      19.50   12.50  0.83
>
> The PID of the ceph-osd that is running on sdc is 2053, and when I look
> with top I see this process and a btrfs-endio-writer (PID 5447):
>
>   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
>  2053 root      20   0  537m 146m 2364 S 33.2  0.6  43:31.24 ceph-osd
>  5447 root      20   0     0    0    0 S 22.6  0.0  19:32.18 btrfs-endio-wri
>
> In the latencytop output you can see that those two processes have a much
> higher latency than the other ceph-osd processes and btrfs-endio-writers.
>
> Regards,
> Christian

Ok just a shot in the dark, but could you give this a whirl and see if it
helps you?
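To put the idea in a nutshell before the diff, here is a simplified, standalone
sketch (the ref_queue type and the numbers are made up for illustration, they
are not the real btrfs structures): bound how many delayed refs a transaction
handle will run when it ends, instead of letting an endio-time transaction
drain whatever backlog has piled up behind it.

/* sketch.c - hypothetical illustration of bounding delayed-ref work */
#include <stdio.h>

struct ref_queue {
	unsigned long pending;		/* refs waiting to be processed */
};

/* Run at most max_count refs instead of draining the whole queue. */
static unsigned long run_refs_bounded(struct ref_queue *q,
				      unsigned long max_count)
{
	unsigned long done = 0;

	while (q->pending && done < max_count) {
		q->pending--;		/* stand-in for one delayed ref head */
		done++;
	}
	return done;
}

int main(void)
{
	struct ref_queue q = { .pending = 1000 };
	unsigned long ran = run_refs_bounded(&q, 64);

	/* an endio-time caller only pays for the refs it queued itself */
	printf("ran %lu refs, %lu left for the transaction kthread\n",
	       ran, q.pending);
	return 0;
}

In the patch itself the bound is the handle's own delayed_ref_updates count,
passed down through btrfs_run_delayed_refs() into btrfs_find_ref_cluster() in
place of the hard-coded 32, and the new trans->endio flag is only there so we
WARN if an endio transaction wanders into xattr inserts, delayed iputs,
ordered operations or a full commit.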
Thanks,

Josef

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76..fbc196e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -210,9 +210,9 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
-			   struct list_head *cluster, u64 start)
+			   struct list_head *cluster, u64 start, unsigned long max_count)
 {
-	int count = 0;
+	unsigned long count = 0;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct rb_node *node;
 	struct btrfs_delayed_ref_node *ref;
@@ -242,7 +242,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 		node = rb_first(&delayed_refs->root);
 	}
 again:
-	while (node && count < 32) {
+	while (node && count < max_count) {
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 		if (btrfs_delayed_ref_is_head(ref)) {
 			head = btrfs_delayed_node_to_head(ref);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b..b15a6ad 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -169,7 +169,8 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
			   struct btrfs_delayed_ref_head *head);
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
-			   struct list_head *cluster, u64 search_start);
+			   struct list_head *cluster, u64 search_start,
+			   unsigned long max_count);
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 31d84e7..c190282 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -81,6 +81,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	u32 data_size;
 
 	BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
+	WARN_ON(trans->endio);
 
 	key.objectid = objectid;
 	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4eb7d2b..0977a10 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2295,7 +2295,7 @@ again:
 		 * lock
 		 */
 		ret = btrfs_find_ref_cluster(trans, &cluster,
-					     delayed_refs->run_delayed_start);
+					     delayed_refs->run_delayed_start, count);
 		if (ret)
 			break;
 
@@ -2338,7 +2338,8 @@ again:
 			node = rb_next(node);
 		}
 		spin_unlock(&delayed_refs->lock);
-		schedule_timeout(1);
+		if (need_resched())
+			schedule_timeout(1);
 		goto again;
 	}
 out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f12747c..73a5e66 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1752,6 +1752,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	else
 		trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
+	trans->endio = 1;
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2057,8 +2058,11 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 	LIST_HEAD(list);
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct delayed_iput *delayed;
+	struct btrfs_trans_handle *trans;
 	int empty;
 
+	trans = current->journal_info;
+	WARN_ON(trans && trans->endio);
 	spin_lock(&fs_info->delayed_iput_lock);
 	empty = list_empty(&fs_info->delayed_iputs);
 	spin_unlock(&fs_info->delayed_iput_lock);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a1c9404..ab68cfa 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -527,12 +527,15 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root,
  */
 int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_inode *btrfs_inode;
 	struct inode *inode;
 	struct list_head splice;
 
+	trans = (struct btrfs_trans_handle *)current->journal_info;
 	INIT_LIST_HEAD(&splice);
 
+	WARN_ON(trans && trans->endio);
 	mutex_lock(&root->fs_info->ordered_operations_mutex);
 	spin_lock(&root->fs_info->ordered_extent_lock);
 again:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 29bef63..009d2db 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -310,6 +310,7 @@ again:
 	h->use_count = 1;
 	h->block_rsv = NULL;
 	h->orig_rsv = NULL;
+	h->endio = 0;
 
 	smp_mb();
 	if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -467,20 +468,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	while (count < 4) {
 		unsigned long cur = trans->delayed_ref_updates;
 		trans->delayed_ref_updates = 0;
-		if (cur &&
-		    trans->transaction->delayed_refs.num_heads_ready > 64) {
-			trans->delayed_ref_updates = 0;
-
-			/*
-			 * do a full flush if the transaction is trying
-			 * to close
-			 */
-			if (trans->transaction->delayed_refs.flushing)
-				cur = 0;
-			btrfs_run_delayed_refs(trans, root, cur);
-		} else {
+		if (!cur ||
+		    trans->transaction->delayed_refs.num_heads_ready <= 64)
 			break;
-		}
+
+		/*
+		 * do a full flush if the transaction is trying
+		 * to close
+		 */
+		if (trans->transaction->delayed_refs.flushing && throttle)
+			cur = 0;
+		btrfs_run_delayed_refs(trans, root, cur);
 		count++;
 	}
 
@@ -498,6 +496,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		 * our use_count.
 		 */
 		trans->use_count++;
+		WARN_ON(trans->endio);
 		return btrfs_commit_transaction(trans, root);
 	} else {
 		wake_up_process(info->transaction_kthread);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 02564e6..7eae404 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -55,6 +55,7 @@ struct btrfs_trans_handle {
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *orig_rsv;
+	unsigned endio;
 };
 
 struct btrfs_pending_snapshot {
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html